Merge pull request #1 from swerik-project/development

New version
swerik-project · Mar 7, 2024 · 767149d · 767149d
2 parents 673dda5 + 0b61a23
commit 767149d
Show file tree

Hide file tree

Showing 9 changed files with 91 additions and 15 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rcr
 Type: Package
 Title: Working with the Riksdagen Corpus
-Version: 0.2.0
-Date: 2023-12-13
+Version: 0.3.0
+Date: 2024-03-07
 Author: Mans Magnusson
 Maintainer: Mans Magnusson <mans.magnusson@statistik.uu.se>
 Description: The package contain helper functions to easily work with the Riksdagen corpus from R.

diff --git a/R/extract_record_date_from_records.R b/R/extract_record_date_from_records.R
@@ -0,0 +1,64 @@
+#' Extract Date from Record
+#'
+#' @description
+#' The function extract date from the Riksdagen Records.
+#'
+#' @param record_path a file path to a record XML file
+#' @param record_paths a vector of file paths to a record XML file
+#' @param mc.cores the number of cores to use (Linux and Mac only) in \code{mclapply}.
+#'                 Defaults to available cores - 1.
+#' @param ... further arguments supplied to \code{mclapply}.
+#'
+#' @return
+#' The function returns a \code{tibble} data frame with the following variables:
+#' \describe{
+#'   \item{record_id}{The id of the record.}
+#'   \item{record_date}{The date of the record.}
+#' }
+#'
+#' @importFrom xml2 read_xml xml_ns_strip, xml_attr
+#' @export
+extract_record_dates_from_record <- function(record_path, all=F){
+  checkmate::assert_string(record_path)
+  rcp <- get_riksdag_corpora_path()
+  rcfp <- file.path(rcp, record_path)
+  if(file.exists(rcfp)){
+    record_path <- rcfp
+  }
+  checkmate::assert_file_exists(record_path)
+
+  x <- read_xml(record_path)
+  x <- xml_ns_strip(x)
+
+  id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
+  xs <- xml_find_all(x,".//docDate")
+  df <- tibble("record_id" = id,
+               "doc_date" = as.Date(xml_text(xs)))
+
+  return(df)
+}
+
+
+#' @rdname extract_speeches_from_record
+#' @export
+extract_record_dates_from_records <- function(record_paths, mc.cores = getOption("mc.cores", detectCores() - 1L), ...){
+  checkmate::assert_character(record_paths)
+  rcp <- get_riksdag_corpora_path()
+  rcfp <- file.path(rcp, record_paths)
+  for(i in seq_along(rcfp)){
+    if(file.exists(rcfp[i])){
+      record_paths[i] <- rcfp[i]
+    }
+  }
+  checkmate::assert_file_exists(record_paths)
+
+  if(mc.cores > 1L & .Platform$OS.type == "unix"){
+    message(mc.cores, " cores are used to process the data.")
+    res <- parallel::mclapply(record_paths, extract_record_dates_from_record, mc.cores = mc.cores, ...)
+  } else {
+    res <- lapply(record_paths, extract_record_dates_from_record)
+  }
+
+  res <- bind_rows(res)
+  res[, c("record_id", "doc_date")]
+}
diff --git a/R/extract_speeches.R b/R/extract_speeches.R
@@ -15,8 +15,9 @@
 #' @param ... further arguments supplied to \code{mclapply}.
 #'
 #' @return
-#' The function returns a tibble data frame with the following variables:
+#' The function returns a \code{tibble} data frame with the following variables:
 #' \describe{
+#'   \item{record_id}{The id of the record.}
 #'   \item{speech_no}{The speech number in the record.}
 #'   \item{speech_id}{The id of the XML node to the introduction of the speaker.}
 #'   \item{who}{The id of the person giving the speech.}
@@ -43,8 +44,10 @@ extract_speeches_from_record <- function(record_path){
   x <- xml_ns_strip(x)
 
   # Extract speeches
+  id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
   xs <- xml_find_all(x, ".//note[@type = 'speaker']|.//u|.//seg")
-  df <- tibble("type_speaker" = xml_attr(xs, attr = "type") == "speaker",
+  df <- tibble("record_id" = id,
+               "type_speaker" = xml_attr(xs, attr = "type") == "speaker",
                "name" = xml_name(xs),
                "who" = xml_attr(xs, attr = "who"),
                "id" = xml_attr(xs, attr = "id"),
@@ -57,7 +60,7 @@ extract_speeches_from_record <- function(record_path){
   df <- df[df$name == "seg",]
   df$type_speaker <- NULL
   df$name <- NULL
-  df[, c("speech_no", "speech_id", "who", "id", "text")]
+  df[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
 }
 
 #' @rdname extract_speeches_from_record
@@ -80,10 +83,7 @@ extract_speeches_from_records <- function(record_paths, mc.cores = getOption("mc
     res <- lapply(record_paths, extract_speeches_from_record)
   }
 
-  for(i in seq_along(res)){
-    res[[i]]$record <- basename(record_paths[i])
-  }
   res <- bind_rows(res)
-  res[, c("record", "speech_no", "speech_id", "who", "id", "text")]
+  res[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
 }
 
diff --git a/README.md b/README.md
@@ -29,3 +29,7 @@ fps <-
 sp <- extract_speeches_from_records(fps)
 ```
 
+Similarly we can extract the dates from the records with
+```
+ds <- extract_record_dates_from_records(fps)
+```
diff --git a/tests/testthat/files/prot-1951--fk--029.xml b/tests/testthat/files/prot-1951--fk--029.xml
@@ -28,7 +28,7 @@
       </editorialDecl>
     </encodingDesc>
   </teiHeader>
-  <TEI>
+  <TEI xml:id="prot-1951--fk--029">
     <teiHeader>
       <fileDesc>
         <titleStmt>

diff --git a/tests/testthat/files/prot-1975--036.xml b/tests/testthat/files/prot-1975--036.xml
@@ -28,7 +28,7 @@
       </editorialDecl>
     </encodingDesc>
   </teiHeader>
-  <TEI>
+  <TEI xml:id="prot-1975--036">
     <teiHeader>
       <fileDesc>
         <titleStmt>

diff --git a/tests/testthat/test-extract_dates.R b/tests/testthat/test-extract_dates.R
@@ -0,0 +1,11 @@
+test_that("extracting dates works", {
+
+  tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
+  tfp <- test_path(file.path("files", tfp))
+
+  expect_silent(sp <- extract_record_dates_from_record(record_path = tfp[1]))
+  expect_error(sp <- extract_record_dates_from_record(tfp))
+  expect_silent(sp <- extract_record_dates_from_records(tfp, mc.cores = 1L))
+  expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))
+
+})
diff --git a/tests/testthat/test-extract_speeches.R b/tests/testthat/test-extract_speeches.R
@@ -14,7 +14,7 @@ test_that("extracting speeches works", {
   tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
   tfp <- test_path(file.path("files", tfp))
 
-  expect_silent(sp <- extract_speeches_from_record(tfp[1]))
+  expect_silent(sp <- extract_speeches_from_record(record_path = tfp[1]))
   expect_error(sp <- extract_speeches_from_record(tfp))
   expect_silent(sp <- extract_speeches_from_records(tfp, mc.cores = 1L))
   expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))

diff --git a/tests/testthat/test-test.R b/tests/testthat/test-test.R