Skip to content

Commit

Permalink
Reduce RAM usage when handling big datasets with weekly data
Browse files Browse the repository at this point in the history
  • Loading branch information
pitkant committed Jun 28, 2023
1 parent 4bf171d commit cfdaf37
Show file tree
Hide file tree
Showing 8 changed files with 114 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Type: Package
Package: eurostat
Title: Tools for Eurostat Open Data
Version: 4.0.0.9001
Date: 2023-06-20
Version: 4.0.0.9002
Date: 2023-06-29
Authors@R: c(
person("Leo", "Lahti", , "leo.lahti@iki.fi", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5537-637X")),
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ importFrom(curl,curl_download)
importFrom(dplyr,case_when)
importFrom(dplyr,coalesce)
importFrom(dplyr,filter)
importFrom(dplyr,inner_join)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
importFrom(httr,RETRY)
Expand All @@ -59,6 +60,8 @@ importFrom(regions,recode_nuts)
importFrom(regions,validate_geo_code)
importFrom(regions,validate_nuts_regions)
importFrom(stringi,stri_extract_first_regex)
importFrom(stringi,stri_replace_all_fixed)
importFrom(stringi,stri_replace_all_regex)
importFrom(stringr,str_glue)
importFrom(stringr,str_replace_all)
importFrom(tibble,as_tibble)
Expand Down
15 changes: 15 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# eurostat 4.0.0.9002

## Major updates

* Remove legacy code related to downloading data from old bulk download facilities and temporary functions added in package version 3.7.14.

## Minor updates

* Print more informative API error messages.

## Bug fixes

* Fix issue related to downloading quarterly data
* Reduce RAM usage in `eurotime2date()` when handling big datasets containing weekly data and tens of millions of rows (dataset used for testing mentioned in issue #200).

# eurostat 3.8.3 (2023-03-07)

## Bug fixes
Expand Down
46 changes: 40 additions & 6 deletions R/eurotime2date.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#'
#' @importFrom lubridate ymd
#' @importFrom ISOweek ISOweek2date
#' @importFrom dplyr inner_join
#'
#' @export
eurotime2date <- function(x, last = FALSE) {
Expand Down Expand Up @@ -89,17 +90,50 @@ eurotime2date <- function(x, last = FALSE) {
)
return(x)
}

levels(x) <- paste0(year, "-", period, "-", day)


# The date as the last date of the period
if (tcode == "W") {
# we will be using range 1-7 here, not 01-07

# in some datasets week number can be larger than 53, e.g. 99
# ISOweek2date does not support such week numbers -> they are coerced
# to W01-1 or W01-7
if (any(substr(period, 2, 3) > 53)) {
warning(paste("Some TIME_PERIOD fields have invalid week values (> 53).",
"Coercing invalid fields to format YYYY-W01",
"(the first week of the year). If you wish to handle",
"weeks with invalid values manually please use",
"parameter time_format = 'raw' in get_eurostat."))
invalid_w_numbers <- which(substr(period, 2, 3) > 53)
period[invalid_w_numbers] <- "W01"
}

# Range is 1-7 here, not 01-07. 1 = Monday, 2 = Tuesday etc.
day <- ifelse(last == TRUE, 7, 1)
levels(x) <- paste0(year, "-", period, "-", day)
x <- ISOweek::ISOweek2date(x)
return(x)
unique_dates <- unique(x)
column_names <- c("orig", "date")
d <- data.frame(matrix(nrow = length(unique_dates), ncol = length(column_names)))
colnames(d) <- column_names
d$orig <- unique_dates
d$date <- ISOweek::ISOweek2date(unique_dates)

# NEW CODE: data.table
# d <- as.data.table(d)
# x <- as.data.table(x)

x <- as.data.frame(x)
colnames(x) <- "orig"

# NEW CODE: data.table
# y <- x[d, on = "orig"]$date

y <- dplyr::inner_join(x, d, by = "orig")
y <- y$date
return(y)
}

levels(x) <- paste0(year, "-", period, "-", day)

# For times other than weeks
if (last == TRUE && tcode != "W") {
shift <- c("Y" = 367, "S" = 186, "Q" = 96, "0" = 32, "1" = 32, "D" = 0)[tcode]
Expand Down
1 change: 0 additions & 1 deletion R/get_eurostat.R
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,6 @@ get_eurostat <- function(id,
# Bulk download
} else if (filters == "none") {

message("Trying to download from the new dissemination API... \n")
# Download from new dissemination API in TSV file format
y_raw <- try(get_eurostat_raw(id))
if ("try-error" %in% class(y_raw)) {
Expand Down
16 changes: 16 additions & 0 deletions R/get_eurostat_raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,29 @@ get_eurostat_raw <- function(id) {
if (.Platform$OS.type == "windows") {
curl::curl_download(url = url, destfile = tfile)
} else {
# R Packages (2e): Restore state with base::on.exit()
# Use timeout = 90 for bigger datasets
op <- options(timeout = 90)
on.exit(options(op), add = TRUE)
utils::download.file(url, tfile)
}

# OLD CODE
dat <- readr::read_tsv(gzfile(tfile),
na = ":",
col_types = readr::cols(.default = readr::col_character())
)

# NEW CODE: data.table
# dat <- data.table::fread(cmd = paste("gzip -dc", tfile),
# na.strings = ":",
# colClasses = "character")
# The reason why data.table is not currently used is that readr::cols
# and readr::col_character() worked better with some datasets
# and because RAM usage was not that much lower with data.table

# OLD CODE
dat <- tibble::as_tibble(dat)


# check validity
Expand Down
43 changes: 37 additions & 6 deletions R/tidy_eurostat.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#' @references See citation("eurostat").
#' @author Przemyslaw Biecek, Leo Lahti, Janne Huovari and Pyry Kantanen
#'
#' @importFrom stringi stri_extract_first_regex
#' @importFrom stringi stri_extract_first_regex stri_replace_all_regex stri_replace_all_fixed
#' @importFrom tidyr separate pivot_longer
#' @importFrom dplyr filter
#'
Expand All @@ -55,22 +55,38 @@ tidy_eurostat <- function(dat,
cnames2 <- cnames[length(cnames)] # for colnames

# Separe variables from first column
# OLD CODE
dat <- tidyr::separate(dat,
col = colnames(dat)[1],
into = cnames1,
sep = ",",
sep = ",",
convert = FALSE
)

# NEW CODE: data.table
# defining dat as data.table object is necessary to use data.table functions
# dat <- data.table::as.data.table(dat)

# Get variable from column names
# OLD CODE
dat <- tidyr::pivot_longer(data = dat,
cols = -seq_along(cnames1),
names_to = cnames2,
values_to = "values")

# NEW CODE: data.table
# dat <- data.table::melt(data = dat,
# measure.vars = setdiff(names(dat), cnames1),
# variable.name = cnames2,
# value.name = "values")

# to save memory (and backward compatibility)
# OLD CODE
dat <- dplyr::filter(dat, !is.na(values))

# NEW CODE: data.table
# dat <- na.omit(dat, "values")

## separate flags into separate column
if (keepFlags == TRUE) {
dat$flags <- as.vector(
Expand All @@ -82,14 +98,26 @@ tidy_eurostat <- function(dat,
}

# clean time and values
dat$TIME_PERIOD <- gsub("X", "", dat$TIME_PERIOD)
dat$values <- as.numeric(gsub("[^0-9.-]+", "", as.character(dat$values)))

# OLD CODE
# dat$TIME_PERIOD <- gsub("X", "", dat$TIME_PERIOD, fixed = TRUE)
# dat$values <- as.numeric(gsub("[^0-9.-]+", "", as.character(dat$values)))
# NEW CODE: use stringi instead of gsub for faster execution
dat$TIME_PERIOD <- stringi::stri_replace_all_fixed(dat$TIME_PERIOD, "X", "")
dat$values <- as.numeric(stringi::stri_replace_all_regex(as.character(dat$values), "[^0-9.-]+", ""))

# variable columns
var_cols <- names(dat)[!(names(dat) %in% c("TIME_PERIOD", "values"))]
# selected_cols <- c(var_cols, "TIME_PERIOD", "values")

# reorder to standard order
# OLD CODE
dat <- dat[c(var_cols, "TIME_PERIOD", "values")]

# NEW CODE: data.table
# either this way
# dat <- dat[, ..selected_cols]
# or this way
# data.table::setcolorder(dat, c(var_cols, "TIME_PERIOD", "values"))

# columns from var_cols are converted into factors
# avoid convert = FALSE since it converts T into TRUE instead of TOTAL
Expand Down Expand Up @@ -131,7 +159,7 @@ tidy_eurostat <- function(dat,
)
}
} else {

if (length(freqs) > 1 & time_format != "raw") {
message(
"Data includes several time frequencies. Select a single frequency \n",
Expand All @@ -157,6 +185,9 @@ tidy_eurostat <- function(dat,
time_format = time_format)
}

# NEW CODE: data.table
# This is needed if we still want to return tibbles at the end
# dat <- tibble::as_tibble(dat)
dat
}

Expand Down
2 changes: 1 addition & 1 deletion man/eurostat-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit cfdaf37

Please sign in to comment.