Reduce RAM usage when handling big datasets with weekly data

rOpenGov · Jun 28, 2023 · cfdaf37 · cfdaf37
1 parent 4bf171d
commit cfdaf37
Show file tree

Hide file tree

Showing 8 changed files with 114 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: eurostat
 Title: Tools for Eurostat Open Data
-Version: 4.0.0.9001
-Date: 2023-06-20
+Version: 4.0.0.9002
+Date: 2023-06-29
 Authors@R: c(
     person("Leo", "Lahti", , "leo.lahti@iki.fi", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-5537-637X")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -36,6 +36,7 @@ importFrom(curl,curl_download)
 importFrom(dplyr,case_when)
 importFrom(dplyr,coalesce)
 importFrom(dplyr,filter)
+importFrom(dplyr,inner_join)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(httr,RETRY)
@@ -59,6 +60,8 @@ importFrom(regions,recode_nuts)
 importFrom(regions,validate_geo_code)
 importFrom(regions,validate_nuts_regions)
 importFrom(stringi,stri_extract_first_regex)
+importFrom(stringi,stri_replace_all_fixed)
+importFrom(stringi,stri_replace_all_regex)
 importFrom(stringr,str_glue)
 importFrom(stringr,str_replace_all)
 importFrom(tibble,as_tibble)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,18 @@
+# eurostat 4.0.0.9002
+
+## Major updates
+
+* Remove legacy code related to downloading data from old bulk download facilities and temporary functions added in package version 3.7.14.
+
+## Minor updates 
+
+* Print more informative API error messages.
+
+## Bug fixes
+
+* Fix issue related to downloading quarterly data
+* Reduce RAM usage in `eurotime2date()` when handling big datasets containing weekly data and tens of millions of rows (dataset used for testing mentioned in issue #200).
+
 # eurostat 3.8.3 (2023-03-07)
 
 ## Bug fixes

diff --git a/R/eurotime2date.R b/R/eurotime2date.R
@@ -39,6 +39,7 @@
 #'
 #' @importFrom lubridate ymd
 #' @importFrom ISOweek ISOweek2date
+#' @importFrom dplyr inner_join
 #'
 #' @export
 eurotime2date <- function(x, last = FALSE) {
@@ -89,17 +90,50 @@ eurotime2date <- function(x, last = FALSE) {
     )
     return(x)
   }
-
-  levels(x) <- paste0(year, "-", period, "-", day)
-
+
   # The date as the last date of the period
   if (tcode == "W") {
-    # we will be using range 1-7 here, not 01-07
+
+    # in some datasets week number can be larger than 53, e.g. 99
+    # ISOweek2date does not support such week numbers -> they are coerced
+    # to W01-1 or W01-7
+    if (any(substr(period, 2, 3) > 53)) {
+      warning(paste("Some TIME_PERIOD fields have invalid week values (> 53).", 
+                    "Coercing invalid fields to format YYYY-W01",
+                    "(the first week of the year). If you wish to handle",
+                    "weeks with invalid values manually please use",
+                    "parameter time_format = 'raw' in get_eurostat."))
+      invalid_w_numbers <- which(substr(period, 2, 3) > 53)
+      period[invalid_w_numbers] <- "W01"
+    }
+
+    # Range is 1-7 here, not 01-07. 1 = Monday, 2 = Tuesday etc.
     day <- ifelse(last == TRUE, 7, 1)
     levels(x) <- paste0(year, "-", period, "-", day)
-    x <- ISOweek::ISOweek2date(x)
-    return(x)
+    unique_dates <- unique(x)
+    column_names <- c("orig", "date")
+    d <- data.frame(matrix(nrow = length(unique_dates), ncol = length(column_names)))
+    colnames(d) <- column_names
+    d$orig <- unique_dates
+    d$date <- ISOweek::ISOweek2date(unique_dates)
+
+    # NEW CODE: data.table
+    # d <- as.data.table(d)
+    # x <- as.data.table(x)
+
+    x <- as.data.frame(x)
+    colnames(x) <- "orig"
+
+    # NEW CODE: data.table
+    # y <- x[d, on = "orig"]$date
+
+    y <- dplyr::inner_join(x, d, by = "orig")
+    y <- y$date
+    return(y)
   }
+
+  levels(x) <- paste0(year, "-", period, "-", day)
+
   # For times other than weeks
   if (last == TRUE && tcode != "W") {
     shift <- c("Y" = 367, "S" = 186, "Q" = 96, "0" = 32, "1" = 32, "D" = 0)[tcode]

diff --git a/R/get_eurostat.R b/R/get_eurostat.R
@@ -210,7 +210,6 @@ get_eurostat <- function(id,
         # Bulk download
       } else if (filters == "none") {
 
-          message("Trying to download from the new dissemination API... \n")
           # Download from new dissemination API in TSV file format
           y_raw <- try(get_eurostat_raw(id))
           if ("try-error" %in% class(y_raw)) {

diff --git a/R/get_eurostat_raw.R b/R/get_eurostat_raw.R
@@ -45,13 +45,29 @@ get_eurostat_raw <- function(id) {
   if (.Platform$OS.type == "windows") {
     curl::curl_download(url = url, destfile = tfile)
   } else {
+    # R Packages (2e): Restore state with base::on.exit()
+    # Use timeout = 90 for bigger datasets
+    op <- options(timeout = 90)
+    on.exit(options(op), add = TRUE)
     utils::download.file(url, tfile)
   }
 
+  # OLD CODE
   dat <- readr::read_tsv(gzfile(tfile),
     na = ":",
     col_types = readr::cols(.default = readr::col_character())
   )
+
+  # NEW CODE: data.table
+  # dat <- data.table::fread(cmd = paste("gzip -dc", tfile), 
+  #                          na.strings = ":",
+  #                          colClasses = "character")
+  # The reason why data.table is not currently used is that readr::cols
+  # and readr::col_character() worked better with some datasets
+  # and because RAM usage was not that much lower with data.table
+
+  # OLD CODE
+  dat <- tibble::as_tibble(dat)
 
 
   # check validity

diff --git a/R/tidy_eurostat.R b/R/tidy_eurostat.R
@@ -29,7 +29,7 @@
 #' @references See citation("eurostat").
 #' @author Przemyslaw Biecek, Leo Lahti, Janne Huovari and Pyry Kantanen
 #'
-#' @importFrom stringi stri_extract_first_regex
+#' @importFrom stringi stri_extract_first_regex stri_replace_all_regex stri_replace_all_fixed
 #' @importFrom tidyr separate pivot_longer
 #' @importFrom dplyr filter
 #' 
@@ -55,22 +55,38 @@ tidy_eurostat <- function(dat,
   cnames2 <- cnames[length(cnames)] # for colnames
 
   # Separe variables from first column
+  # OLD CODE
   dat <- tidyr::separate(dat,
     col = colnames(dat)[1],
     into = cnames1,
-    sep = ",", 
+    sep = ",",
     convert = FALSE
   )
+
+  # NEW CODE: data.table
+  # defining dat as data.table object is necessary to use data.table functions
+  # dat <- data.table::as.data.table(dat)
 
   # Get variable from column names
+  # OLD CODE
   dat <- tidyr::pivot_longer(data = dat,
                               cols = -seq_along(cnames1),
                               names_to = cnames2,
                               values_to = "values")
+
+  # NEW CODE: data.table
+  # dat <- data.table::melt(data = dat,
+  #                         measure.vars = setdiff(names(dat), cnames1),
+  #                         variable.name = cnames2,
+  #                         value.name = "values")
 
   # to save memory (and backward compatibility)
+  # OLD CODE
   dat <- dplyr::filter(dat, !is.na(values))
 
+  # NEW CODE: data.table
+  # dat <- na.omit(dat, "values")
+
   ## separate flags into separate column
   if (keepFlags == TRUE) {
     dat$flags <- as.vector(
@@ -82,14 +98,26 @@ tidy_eurostat <- function(dat,
   }
 
   # clean time and values
-  dat$TIME_PERIOD <- gsub("X", "", dat$TIME_PERIOD)
-  dat$values <- as.numeric(gsub("[^0-9.-]+", "", as.character(dat$values)))
-
+  # OLD CODE
+  # dat$TIME_PERIOD <- gsub("X", "", dat$TIME_PERIOD, fixed = TRUE)
+  # dat$values <- as.numeric(gsub("[^0-9.-]+", "", as.character(dat$values)))
+  # NEW CODE: use stringi instead of gsub for faster execution
+  dat$TIME_PERIOD <- stringi::stri_replace_all_fixed(dat$TIME_PERIOD, "X", "")
+  dat$values <- as.numeric(stringi::stri_replace_all_regex(as.character(dat$values), "[^0-9.-]+", ""))
+
   # variable columns
   var_cols <- names(dat)[!(names(dat) %in% c("TIME_PERIOD", "values"))]
+  # selected_cols <- c(var_cols, "TIME_PERIOD", "values")
 
   # reorder to standard order
+  # OLD CODE
   dat <- dat[c(var_cols, "TIME_PERIOD", "values")]
+
+  # NEW CODE: data.table
+  # either this way
+  # dat <- dat[, ..selected_cols]
+  # or this way
+  # data.table::setcolorder(dat, c(var_cols, "TIME_PERIOD", "values"))
 
   # columns from var_cols are converted into factors
   # avoid convert = FALSE since it converts T into TRUE instead of TOTAL
@@ -131,7 +159,7 @@ tidy_eurostat <- function(dat,
       )
     }
   } else {
-    
+
     if (length(freqs) > 1 & time_format != "raw") {
       message(
         "Data includes several time frequencies. Select a single frequency \n",
@@ -157,6 +185,9 @@ tidy_eurostat <- function(dat,
                                           time_format = time_format)
   }
 
+  # NEW CODE: data.table
+  # This is needed if we still want to return tibbles at the end
+  # dat <- tibble::as_tibble(dat)
   dat
 }
 

diff --git a/man/eurostat-package.Rd b/man/eurostat-package.Rd