Skip to content

Commit

Permalink
Merge pull request #1 from swerik-project/development
Browse files Browse the repository at this point in the history
New version
  • Loading branch information
MansMeg authored Mar 7, 2024
2 parents 673dda5 + 0b61a23 commit 767149d
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 15 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: rcr
Type: Package
Title: Working with the Riksdagen Corpus
Version: 0.2.0
Date: 2023-12-13
Version: 0.3.0
Date: 2024-03-07
Author: Mans Magnusson
Maintainer: Mans Magnusson <mans.magnusson@statistik.uu.se>
Description: The package contain helper functions to easily work with the Riksdagen corpus from R.
Expand Down
64 changes: 64 additions & 0 deletions R/extract_record_date_from_records.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#' Extract Date from Record
#'
#' @description
#' The function extract date from the Riksdagen Records.
#'
#' @param record_path a file path to a record XML file
#' @param record_paths a vector of file paths to a record XML file
#' @param mc.cores the number of cores to use (Linux and Mac only) in \code{mclapply}.
#' Defaults to available cores - 1.
#' @param ... further arguments supplied to \code{mclapply}.
#'
#' @return
#' The function returns a \code{tibble} data frame with the following variables:
#' \describe{
#' \item{record_id}{The id of the record.}
#' \item{record_date}{The date of the record.}
#' }
#'
#' @importFrom xml2 read_xml xml_ns_strip, xml_attr
#' @export
extract_record_dates_from_record <- function(record_path, all=F){
checkmate::assert_string(record_path)
rcp <- get_riksdag_corpora_path()
rcfp <- file.path(rcp, record_path)
if(file.exists(rcfp)){
record_path <- rcfp
}
checkmate::assert_file_exists(record_path)

x <- read_xml(record_path)
x <- xml_ns_strip(x)

id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
xs <- xml_find_all(x,".//docDate")
df <- tibble("record_id" = id,
"doc_date" = as.Date(xml_text(xs)))

return(df)
}


#' @rdname extract_speeches_from_record
#' @export
extract_record_dates_from_records <- function(record_paths, mc.cores = getOption("mc.cores", detectCores() - 1L), ...){
checkmate::assert_character(record_paths)
rcp <- get_riksdag_corpora_path()
rcfp <- file.path(rcp, record_paths)
for(i in seq_along(rcfp)){
if(file.exists(rcfp[i])){
record_paths[i] <- rcfp[i]
}
}
checkmate::assert_file_exists(record_paths)

if(mc.cores > 1L & .Platform$OS.type == "unix"){
message(mc.cores, " cores are used to process the data.")
res <- parallel::mclapply(record_paths, extract_record_dates_from_record, mc.cores = mc.cores, ...)
} else {
res <- lapply(record_paths, extract_record_dates_from_record)
}

res <- bind_rows(res)
res[, c("record_id", "doc_date")]
}
14 changes: 7 additions & 7 deletions R/extract_speeches.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
#' @param ... further arguments supplied to \code{mclapply}.
#'
#' @return
#' The function returns a tibble data frame with the following variables:
#' The function returns a \code{tibble} data frame with the following variables:
#' \describe{
#' \item{record_id}{The id of the record.}
#' \item{speech_no}{The speech number in the record.}
#' \item{speech_id}{The id of the XML node to the introduction of the speaker.}
#' \item{who}{The id of the person giving the speech.}
Expand All @@ -43,8 +44,10 @@ extract_speeches_from_record <- function(record_path){
x <- xml_ns_strip(x)

# Extract speeches
id <- xml_attr(xml_find_all(x, "TEI"),attr = "id")
xs <- xml_find_all(x, ".//note[@type = 'speaker']|.//u|.//seg")
df <- tibble("type_speaker" = xml_attr(xs, attr = "type") == "speaker",
df <- tibble("record_id" = id,
"type_speaker" = xml_attr(xs, attr = "type") == "speaker",
"name" = xml_name(xs),
"who" = xml_attr(xs, attr = "who"),
"id" = xml_attr(xs, attr = "id"),
Expand All @@ -57,7 +60,7 @@ extract_speeches_from_record <- function(record_path){
df <- df[df$name == "seg",]
df$type_speaker <- NULL
df$name <- NULL
df[, c("speech_no", "speech_id", "who", "id", "text")]
df[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
}

#' @rdname extract_speeches_from_record
Expand All @@ -80,10 +83,7 @@ extract_speeches_from_records <- function(record_paths, mc.cores = getOption("mc
res <- lapply(record_paths, extract_speeches_from_record)
}

for(i in seq_along(res)){
res[[i]]$record <- basename(record_paths[i])
}
res <- bind_rows(res)
res[, c("record", "speech_no", "speech_id", "who", "id", "text")]
res[, c("record_id", "speech_no", "speech_id", "who", "id", "text")]
}

4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,7 @@ fps <-
sp <- extract_speeches_from_records(fps)
```

Similarly we can extract the dates from the records with
```
ds <- extract_record_dates_from_records(fps)
```
2 changes: 1 addition & 1 deletion tests/testthat/files/prot-1951--fk--029.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
</editorialDecl>
</encodingDesc>
</teiHeader>
<TEI>
<TEI xml:id="prot-1951--fk--029">
<teiHeader>
<fileDesc>
<titleStmt>
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/files/prot-1975--036.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
</editorialDecl>
</encodingDesc>
</teiHeader>
<TEI>
<TEI xml:id="prot-1975--036">
<teiHeader>
<fileDesc>
<titleStmt>
Expand Down
11 changes: 11 additions & 0 deletions tests/testthat/test-extract_dates.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
test_that("extracting dates works", {

tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
tfp <- test_path(file.path("files", tfp))

expect_silent(sp <- extract_record_dates_from_record(record_path = tfp[1]))
expect_error(sp <- extract_record_dates_from_record(tfp))
expect_silent(sp <- extract_record_dates_from_records(tfp, mc.cores = 1L))
expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))

})
2 changes: 1 addition & 1 deletion tests/testthat/test-extract_speeches.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ test_that("extracting speeches works", {
tfp <- c('prot-1896--ak--042.xml', 'prot-1951--fk--029.xml', 'prot-1975--036.xml')
tfp <- test_path(file.path("files", tfp))

expect_silent(sp <- extract_speeches_from_record(tfp[1]))
expect_silent(sp <- extract_speeches_from_record(record_path = tfp[1]))
expect_error(sp <- extract_speeches_from_record(tfp))
expect_silent(sp <- extract_speeches_from_records(tfp, mc.cores = 1L))
expect_silent(suppressMessages(sp <- extract_speeches_from_records(record_paths = tfp, mc.cores = 2L)))
Expand Down
3 changes: 0 additions & 3 deletions tests/testthat/test-test.R

This file was deleted.

0 comments on commit 767149d

Please sign in to comment.