diff --git a/DESCRIPTION b/DESCRIPTION index f007f32..fc1cc94 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,13 +31,11 @@ BugReports: https://github.com/epinowcast/baselinenowcast/issues Depends: R (>= 4.0.0) Imports: - data.table, - cli, - checkmate, - dplyr + cli Suggests: epinowcast, bookdown, + dplyr, ggplot2, spelling, rmarkdown, diff --git a/R/get_delay_estimate.R b/R/get_delay_estimate.R index 17a9f53..f23820e 100644 --- a/R/get_delay_estimate.R +++ b/R/get_delay_estimate.R @@ -8,10 +8,9 @@ #' modified from the code originally developed by the Karlsruhe Institute #' of Technology RESPINOW German Hospitalization Nowcasting Hub, #' Modified from: https://github.com/KITmetricslab/RESPINOW-Hub/blob/7cce3ae2728116e8c8cc0e4ab29074462c24650e/code/baseline/functions.R#L55 #nolint -#' @param triangle dataframe of the reporting triangle, with one column for the -#' reference date and the remaining columns specifying the delay (units not -#' specified). Values indicate the number of new observations assigned to -#' the reference date at the specified delay. +#' @param triangle matrix of the reporting triangle, with rows representing +#' the time points of reference and columns representing the delays +#' indexed at 0 #' @param max_delay integer indicating the maximum delay to estimate, in units #' of the delay. The default is to use the whole reporting triangle, #' `ncol(triangle) -1`. @@ -33,31 +32,30 @@ #' latest_date = "2021-10-01" #' ) #' pobs <- enw_preprocess_data(nat_germany_hosp, max_delay = 21) -#' triangle_raw <- pobs$reporting_triangle[[1]] -#' delay_df <- get_delay_estimate(triangle_raw[, -1], +#' triangle_raw <- pobs$reporting_triangle |> +#' dplyr::select(-`.group`, -reference_date) |> +#' as.matrix() |> +#' unname() +#' delay_df <- get_delay_estimate(triangle_raw, #' max_delay = 20, #' n_history = 30 #' ) get_delay_estimate <- function(triangle, - max_delay = ncol(triangle) - 2, + max_delay = ncol(triangle) - 1, n_history = nrow(triangle)) { # Check that the input reporting triangle is formatted properly. - validate_triangle(triangle) + validate_triangle(triangle, + max_delay = max_delay, + n_history = n_history + ) # Filter the triangle down to nrow = n_history + 1, ncol = max_delay - max_date <- max(triangle$reference_date, na.rm = TRUE) - n_history - trunc_triangle <- preprocess_reporting_triangle(triangle, max_delay) |> - dplyr::filter(reference_date >= max_date) |> - as.data.frame() # nolint - # Make the date the rowname, so the matrix is just the entries - integer_cols <- which(colnames(trunc_triangle) %in% (grep("^\\d+$", names(trunc_triangle), value = TRUE))) # nolint - # the `..` is because its a data.table, we probably don't want to expect this from users. #nolint - rt <- as.matrix(trunc_triangle[, integer_cols]) - dates <- as.character(trunc_triangle$reference_date) - rownames(rt) <- dates + nr0 <- nrow(triangle) + trunc_triangle <- triangle[(nr0 - n_history + 1):nr0, 1:(max_delay + 1)] + rt <- handle_neg_vals(trunc_triangle) n_delays <- ncol(rt) n_dates <- nrow(rt) - factor <- matrix(nrow = max_delay - 1, ncol = 1) + factor <- vector(length = max_delay - 1) expectation <- rt for (co in 2:(n_delays)) { block_top_left <- rt[1:(n_dates - co + 1), 1:(co - 1), drop = FALSE] @@ -66,26 +64,18 @@ get_delay_estimate <- function(triangle, block_bottom_left <- expectation[(n_dates - co + 2):n_dates, 1:(co - 1), drop = FALSE ] + # We compute the expectation so that we can get the delay estimate expectation[(n_dates - co + 2):n_dates, co] <- factor[co - 1] * rowSums( block_bottom_left ) } - # I think this should just be estimated using the lower half of the reporting - # triangle to get the latest delay estimate? - pmf <- vector(length = n_delays) - pmf[1] <- expectation[n_dates, 1] / sum(expectation[n_dates, 1:n_delays]) - for (i in 2:n_delays) { - pmf[i] <- sum(expectation[(n_dates - i + 2):n_dates, i]) / sum(expectation[(n_dates - i + 2):n_dates, 1:n_delays]) - } - - # Or use the whole thing. This might actually be the same... - alternate_pmf <- colSums(expectation) / sum(expectation) - + # Use the completed reporting square to get the point estimate of the delay + # distribution + pmf <- colSums(expectation) / sum(expectation) delay_df <- data.frame( delay = 0:max_delay, - pmf = pmf, - atlernate_pmf = alternate_pmf + pmf = pmf ) return(delay_df) } diff --git a/R/handle_neg_vals.R b/R/handle_neg_vals.R new file mode 100644 index 0000000..b954a35 --- /dev/null +++ b/R/handle_neg_vals.R @@ -0,0 +1,41 @@ +#' Handle negative values in the reporting triangle +#' @description +#' Takes in a reporting triangle and returns a matrix in the same format +#' as the input triangle, but with negative values of reporting handled via +#' passing them to the subsequent days delay. +#' Modified from https://github.com/KITmetricslab/RESPINOW-Hub/blob/main/code/baseline/functions.R #nolint +#' @param triangle the reporting triangle as a matrix, where rows are the +#' time points and columns are the delays, already truncated to the maximum +#' delay and the number of historical observations +#' @return pos_triangle a positive integer matrix with negative values of +#' reporting handled via passing them to the subsequent days delay +handle_neg_vals <- function(triangle) { + integer_cols <- seq_len(ncol(triangle)) + pos_triangle <- triangle + for (i in seq_len(nrow(triangle))) { + to_subtract <- 0 + row <- pos_triangle[i, ] + # Loop over the columns starting from the last column back to max delay + # column, and if there is a negative value, we add this to the + # next day and set that one as 0. + for (j in rev(integer_cols)) { + value <- row[[j]] + if (!is.na(value)) { + # Either adds 0 or the previous days negative value + value <- value + to_subtract + if (value < 0) { + # Want to subtract from subsequent day + to_subtract <- value + pos_triangle[i, j] <- 0 # Set the negative value in the RT to 0 + } else { + pos_triangle[i, j] <- value + to_subtract <- 0 + } + } + } + } + for (col in integer_cols) { + pos_triangle[[col]] <- as.integer(pos_triangle[[col]]) + } + return(pos_triangle) +} diff --git a/R/preprocess_reporting_triangle.R b/R/preprocess_reporting_triangle.R deleted file mode 100644 index e14b62f..0000000 --- a/R/preprocess_reporting_triangle.R +++ /dev/null @@ -1,55 +0,0 @@ -#' Pre-process the reporting triangle -#' @description -#' Takes in a reporting triangle with the reference date as the first column, -#' and all subsequent columns named by their delay (unitless, so for example a -#' delay of 3 days or weeks would have the column name `3`), and values -#' indicating the number of new confirmed cases assigned to that reference -#' date with that particular delay. It returns a dataframe in the same format -#' as the input triangle, but truncated to include only the maximum delay -#' number of columns and with negative values of reporting handled via passing -#' them to the subsequent days delay. -#' Modified from https://github.com/KITmetricslab/RESPINOW-Hub/blob/main/code/baseline/functions.R #nolint -#' @param triangle the reporting triangle as a data.frame, with rows as the -#' reference date and columns as the delay, in any units. Assumes that the -#' delays will be in the format `{delay}` with no suffix for the unit -#' @param max_delay the maximum delay, in the delay units, to filter the -#' reporting triangle -#' @return trunc_triangle a dataframe in the same format as `triangle`, but -#' truncated to include only the maximum delay number of delay columns -#' and with negative values of reporting handled via passing them to the -#' subsequent days delay -preprocess_reporting_triangle <- function(triangle, max_delay) { - # restrict to the first columns in the max - cols_subset <- which(colnames(triangle) == max_delay) - trunc_triangle <- triangle[, 1:cols_subset] - # columns containing integer names: - integer_cols <- which(colnames(trunc_triangle) %in% (grep("^\\d+$", names(trunc_triangle), value = TRUE))) # nolint - # Loop over each row - for (i in seq_len(nrow(trunc_triangle))) { - to_subtract <- 0 - row <- trunc_triangle[i, ] - # Loop over the columns starting from the last column back to max delay - # column, and if there is a negative value, we add this to the - # next day and set that one as 0. - for (j in rev(integer_cols)) { - value <- row[[j]] - if (!is.na(value)) { - # Either adds 0 or the previous days negative value - value <- value + to_subtract - if (value < 0) { - # Want to subtract from subsequent day - to_subtract <- value - trunc_triangle[i, j] <- 0 # Set the negative value in the RT to 0 - } else { - trunc_triangle[i, j] <- value - to_subtract <- 0 - } - } - } - } - # Convert 'value' columns to integer type - for (col in integer_cols) { - trunc_triangle[[col]] <- as.integer(trunc_triangle[[col]]) - } - return(data.table::as.data.table(trunc_triangle)) -} diff --git a/R/validate.R b/R/validate.R index 0637476..ec6e04f 100644 --- a/R/validate.R +++ b/R/validate.R @@ -2,24 +2,48 @@ #' @description #' Various checks to make sure that the reporting triangle passed in to #' `estimate_delay()` is formatted properly. -#' @param triangle a dataframe formatted as a reporting triangle +#' @param triangle a matrix of values with rows indicating the time points and +#' columns indicating the delays +#' @inheritParams get_delay_estimate #' #' @returns NULL, invisibly -validate_triangle <- function(triangle) { - # check column names includes reference date - checkmate::assert_names(names(triangle), must.include = "reference_date") - # check that the columns are integer digits - checkmate::assert_integerish( - as.numeric(grep("^\\d+$", names(triangle), value = TRUE)) - ) - # Check that this returns integers not decimals - checkmate::assert_integerish( - as.numeric(grep("^\\d+$", names(triangle), value = TRUE)) - ) - # Check that grep returns something of length greater than 1 - if (length(as.numeric(grep("^\\d+$", names(triangle), value = TRUE))) < 1) { +validate_triangle <- function(triangle, + max_delay, + n_history) { + if (nrow(triangle) <= ncol(triangle)) { cli::cli_abort( - message = "Attempt to convert column names to integers failed" + message = c( + "Number of observations must be greater than the maximum", + "delay" + ) + ) + } + + if (nrow(triangle) < n_history) { + cli::cli_abort( + message = c( + "Number of observations in input data not sufficient for", + "user specified number of historical observations to use", + "for estimaton." + ) + ) + } + + if (ncol(triangle) < (max_delay + 1)) { + cli::cli_abort( + message = c( + "Number of delays in input data not sufficient for", + "user specified maximum delay" + ) + ) + } + + if ((max_delay < 1 || n_history < 1)) { + cli::cli_abort( + message = c( + "Insufficient `max_delay` or `n_history`, must be greater than or", + " equal to 1." + ) ) } } diff --git a/man/get_delay_estimate.Rd b/man/get_delay_estimate.Rd index 9957f74..036359c 100644 --- a/man/get_delay_estimate.Rd +++ b/man/get_delay_estimate.Rd @@ -6,15 +6,14 @@ \usage{ get_delay_estimate( triangle, - max_delay = ncol(triangle) - 2, + max_delay = ncol(triangle) - 1, n_history = nrow(triangle) ) } \arguments{ -\item{triangle}{dataframe of the reporting triangle, with one column for the -reference date and the remaining columns specifying the delay (units not -specified). Values indicate the number of new observations assigned to -the reference date at the specified delay.} +\item{triangle}{matrix of the reporting triangle, with rows representing +the time points of reference and columns representing the delays +indexed at 0} \item{max_delay}{integer indicating the maximum delay to estimate, in units of the delay. The default is to use the whole reporting triangle, @@ -49,8 +48,10 @@ nat_germany_hosp <- enw_filter_report_dates( latest_date = "2021-10-01" ) pobs <- enw_preprocess_data(nat_germany_hosp, max_delay = 21) -triangle_raw <- pobs$reporting_triangle[[1]] -delay_df <- get_delay_estimate(triangle_raw[, -1], +triangle_raw <- pobs$reporting_triangle |> + dplyr::select(- `.group`, -reference_date) |> + as.matrix() |> unname() +delay_df <- get_delay_estimate(triangle_raw, max_delay = 20, n_history = 30 ) diff --git a/man/handle_neg_vals.Rd b/man/handle_neg_vals.Rd new file mode 100644 index 0000000..2e027d3 --- /dev/null +++ b/man/handle_neg_vals.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/handle_neg_vals.R +\name{handle_neg_vals} +\alias{handle_neg_vals} +\title{Handle negative values in the reporting triangle} +\usage{ +handle_neg_vals(triangle) +} +\arguments{ +\item{triangle}{the reporting triangle as a matrix, where rows are the +time points and columns are the delays, already truncated to the maximum +delay and the number of historical observations} +} +\value{ +pos_triangle a positive integer matrix with negative values of +reporting handled via passing them to the subsequent days delay +} +\description{ +Takes in a reporting triangle and returns a matrix in the same format +as the input triangle, but with negative values of reporting handled via passing +them to the subsequent days delay. +Modified from https://github.com/KITmetricslab/RESPINOW-Hub/blob/main/code/baseline/functions.R #nolint +} diff --git a/man/validate_triangle.Rd b/man/validate_triangle.Rd index d04fc75..89055b7 100644 --- a/man/validate_triangle.Rd +++ b/man/validate_triangle.Rd @@ -4,10 +4,20 @@ \alias{validate_triangle} \title{Validate triangle} \usage{ -validate_triangle(triangle) +validate_triangle(triangle, max_delay, n_history) } \arguments{ -\item{triangle}{a dataframe formatted as a reporting triangle} +\item{triangle}{a matrix of values with rows indicating the time points and +columns indicating the delays} + +\item{max_delay}{integer indicating the maximum delay to estimate, in units +of the delay. The default is to use the whole reporting triangle, +\code{ncol(triangle) -1}.} + +\item{n_history}{integer indicating the number of reference dates to be +used in the estimate of the reporting delay, always starting from the most +recent reporting delay. The default is to use the whole reporting triangle, +so \code{nrow(triangle)-1}} } \value{ NULL, invisibly diff --git a/tests/testthat/test-get_delay_estimate.R b/tests/testthat/test-get_delay_estimate.R index 7aa9433..08ddb3c 100644 --- a/tests/testthat/test-get_delay_estimate.R +++ b/tests/testthat/test-get_delay_estimate.R @@ -1,20 +1,12 @@ test_that("get_delay_estimate function works correctly", { set.seed(123) - triangle <- data.table::data.table( - reference_date = as.Date("2023-01-01") + 0:29, - `0` = rpois(30, 100), - `1` = rpois(30, 50), - `2` = rpois(30, 25), - `3` = rpois(30, 10), - `4` = rpois(30, 5) - ) + triangle <- matrix(rpois(30 * 5, lambda = 25), nrow = 30, ncol = 5) # Test 1: Basic functionality result <- get_delay_estimate(triangle) expect_is(result, "data.frame") - expect_identical(as.integer(nrow(result)), as.integer(ncol(triangle) - 1)) + expect_identical(as.integer(nrow(result)), as.integer(ncol(triangle))) expect_identical(colnames(result), c("delay", "pmf")) - expect_true(all(result$delay == 0:(ncol(triangle) - 2))) expect_true(all(result$pmf >= 0 & result$pmf <= 1)) expect_equal(sum(result$pmf), 1, tolerance = 1e-6) @@ -42,22 +34,7 @@ test_that("get_delay_estimate function works correctly", { result2 <- get_delay_estimate(triangle) expect_identical(result1, result2) - # Test 7: Handling different input types - result_df <- get_delay_estimate(as.data.frame(triangle)) - result_dt <- get_delay_estimate(data.table::as.data.table(triangle)) - expect_identical(result_dt, result_df) - - # Test 8: Check that the function errors if its not passed a triangle + # Test 7: Check that the function errors if its not passed a triangle triangle_single_day <- triangle[1, ] expect_error(get_delay_estimate(triangle_single_day)) - - # Test 9: Check that function throws error if column names are incorrect - triangle_renamed <- as.data.frame(triangle) - triangle_renamed <- stats::setNames( - triangle_renamed, - c("date", paste0("delay_", 0:4)) - ) - expect_error(get_delay_estimate(triangle_renamed), - regexp = "Names must include the elements" - ) }) diff --git a/vignettes/baselinenowcast.Rmd b/vignettes/baselinenowcast.Rmd index 228d1d0..5aeffc9 100644 --- a/vignettes/baselinenowcast.Rmd +++ b/vignettes/baselinenowcast.Rmd @@ -192,7 +192,7 @@ reporting triangle. # observed long above. observed_long <- epinowcast::germany_covid19_hosp[location == "DE"][age_group == "00+"] |> # nolint enw_filter_report_dates(latest_date = nowcast_date) |> - enw_filter_reference_dates(include_days = n_history) + enw_filter_reference_dates(include_days = n_history - 1) head(observed_long) # Get the reporting triangle, adding an additional day because epinowcast # we want the max_delay + 1 entries since 0 is a valid delay. @@ -227,22 +227,21 @@ The `get_delay_estimate()` function expects the following inputs: - `n_history`: an integer indicating the number of observations by reference date to use to fit the delay distribution. This must be less than the number of rows in `triangle`. -`enw_preoprocess()` returns a `triangle` with an additional column `.group` -which will not be used in our estimate of the delay distribution, as -`estimate_delay()` performs the estimate on a single strata only. -If your data contains multiple groups, we recommend either aggregating across -groups (for a fully pooled estimate) or estimating the delay individually for -the strata of your choice (no pooling). -`baselinenowcast` does not support a hierarchical, or partially pooled, -estimate of the delay by design. +`enw_preoprocess()` returns a `triangle` with the columns `.group` and +`.reference_date` and delays indicated by the column names. While we will +eventually write methods that will map from other input formats such as this, +we will start by demonstrating the functionality on the raw matrix of the +reporting triangle. + ```{r estimate-delay} triangle <- triangle_full |> - dplyr::select(-`.group`) |> # remove unecessary `.group` column - as.data.frame() # baseline nowcast expects dataframe as an input + dplyr::select(-`.group`, -reference_date) |> # remove unnecessary columns + as.matrix() |> + unname() head(triangle) -delay_df <- estimate_delay( +delay_df <- get_delay_estimate( triangle = triangle, max_delay = max_delay, n_history = n_history