Skip to content

Commit

Permalink
change to using matrix as method
Browse files Browse the repository at this point in the history
  • Loading branch information
Kaitlyn Johnson committed Jan 30, 2025
1 parent 34dd383 commit 1e64384
Show file tree
Hide file tree
Showing 10 changed files with 161 additions and 153 deletions.
6 changes: 2 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,11 @@ BugReports: https://github.com/epinowcast/baselinenowcast/issues
Depends:
R (>= 4.0.0)
Imports:
data.table,
cli,
checkmate,
dplyr
cli
Suggests:
epinowcast,
bookdown,
dplyr,
ggplot2,
spelling,
rmarkdown,
Expand Down
54 changes: 22 additions & 32 deletions R/get_delay_estimate.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
#' modified from the code originally developed by the Karlsruhe Institute
#' of Technology RESPINOW German Hospitalization Nowcasting Hub,
#' Modified from: https://github.com/KITmetricslab/RESPINOW-Hub/blob/7cce3ae2728116e8c8cc0e4ab29074462c24650e/code/baseline/functions.R#L55 #nolint
#' @param triangle dataframe of the reporting triangle, with one column for the
#' reference date and the remaining columns specifying the delay (units not
#' specified). Values indicate the number of new observations assigned to
#' the reference date at the specified delay.
#' @param triangle matrix of the reporting triangle, with rows representing
#' the time points of reference and columns representing the delays
#' indexed at 0
#' @param max_delay integer indicating the maximum delay to estimate, in units
#' of the delay. The default is to use the whole reporting triangle,
#' `ncol(triangle) -1`.
Expand All @@ -33,31 +32,30 @@
#' latest_date = "2021-10-01"
#' )
#' pobs <- enw_preprocess_data(nat_germany_hosp, max_delay = 21)
#' triangle_raw <- pobs$reporting_triangle[[1]]
#' delay_df <- get_delay_estimate(triangle_raw[, -1],
#' triangle_raw <- pobs$reporting_triangle |>
#' dplyr::select(-`.group`, -reference_date) |>
#' as.matrix() |>
#' unname()
#' delay_df <- get_delay_estimate(triangle_raw,
#' max_delay = 20,
#' n_history = 30
#' )
get_delay_estimate <- function(triangle,
max_delay = ncol(triangle) - 2,
max_delay = ncol(triangle) - 1,
n_history = nrow(triangle)) {
# Check that the input reporting triangle is formatted properly.
validate_triangle(triangle)
validate_triangle(triangle,
max_delay = max_delay,
n_history = n_history
)

# Filter the triangle down to nrow = n_history + 1, ncol = max_delay
max_date <- max(triangle$reference_date, na.rm = TRUE) - n_history
trunc_triangle <- preprocess_reporting_triangle(triangle, max_delay) |>
dplyr::filter(reference_date >= max_date) |>
as.data.frame() # nolint
# Make the date the rowname, so the matrix is just the entries
integer_cols <- which(colnames(trunc_triangle) %in% (grep("^\\d+$", names(trunc_triangle), value = TRUE))) # nolint
# the `..` is because its a data.table, we probably don't want to expect this from users. #nolint
rt <- as.matrix(trunc_triangle[, integer_cols])
dates <- as.character(trunc_triangle$reference_date)
rownames(rt) <- dates
nr0 <- nrow(triangle)
trunc_triangle <- triangle[(nr0 - n_history + 1):nr0, 1:(max_delay + 1)]
rt <- handle_neg_vals(trunc_triangle)
n_delays <- ncol(rt)
n_dates <- nrow(rt)
factor <- matrix(nrow = max_delay - 1, ncol = 1)
factor <- vector(length = max_delay - 1)
expectation <- rt
for (co in 2:(n_delays)) {
block_top_left <- rt[1:(n_dates - co + 1), 1:(co - 1), drop = FALSE]
Expand All @@ -66,26 +64,18 @@ get_delay_estimate <- function(triangle,
block_bottom_left <- expectation[(n_dates - co + 2):n_dates, 1:(co - 1),
drop = FALSE
]
# We compute the expectation so that we can get the delay estimate
expectation[(n_dates - co + 2):n_dates, co] <- factor[co - 1] * rowSums(
block_bottom_left
)
}
# I think this should just be estimated using the lower half of the reporting
# triangle to get the latest delay estimate?
pmf <- vector(length = n_delays)
pmf[1] <- expectation[n_dates, 1] / sum(expectation[n_dates, 1:n_delays])
for (i in 2:n_delays) {
pmf[i] <- sum(expectation[(n_dates - i + 2):n_dates, i]) / sum(expectation[(n_dates - i + 2):n_dates, 1:n_delays])
}

# Or use the whole thing. This might actually be the same...
alternate_pmf <- colSums(expectation) / sum(expectation)


# Use the completed reporting square to get the point estimate of the delay
# distribution
pmf <- colSums(expectation) / sum(expectation)
delay_df <- data.frame(
delay = 0:max_delay,
pmf = pmf,
atlernate_pmf = alternate_pmf
pmf = pmf
)
return(delay_df)
}
41 changes: 41 additions & 0 deletions R/handle_neg_vals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#' Handle negative values in the reporting triangle
#' @description
#' Takes in a reporting triangle and returns a matrix in the same format
#' as the input triangle, but with negative values of reporting handled via
#' passing them to the subsequent days delay.
#' Modified from https://github.com/KITmetricslab/RESPINOW-Hub/blob/main/code/baseline/functions.R #nolint
#' @param triangle the reporting triangle as a matrix, where rows are the
#' time points and columns are the delays, already truncated to the maximum
#' delay and the number of historical observations
#' @return pos_triangle a positive integer matrix with negative values of
#' reporting handled via passing them to the subsequent days delay
handle_neg_vals <- function(triangle) {
integer_cols <- seq_len(ncol(triangle))
pos_triangle <- triangle
for (i in seq_len(nrow(triangle))) {
to_subtract <- 0
row <- pos_triangle[i, ]
# Loop over the columns starting from the last column back to max delay
# column, and if there is a negative value, we add this to the
# next day and set that one as 0.
for (j in rev(integer_cols)) {
value <- row[[j]]
if (!is.na(value)) {
# Either adds 0 or the previous days negative value
value <- value + to_subtract
if (value < 0) {
# Want to subtract from subsequent day
to_subtract <- value
pos_triangle[i, j] <- 0 # Set the negative value in the RT to 0
} else {
pos_triangle[i, j] <- value
to_subtract <- 0
}
}
}
}
for (col in integer_cols) {
pos_triangle[[col]] <- as.integer(pos_triangle[[col]])
}
return(pos_triangle)
}
55 changes: 0 additions & 55 deletions R/preprocess_reporting_triangle.R

This file was deleted.

54 changes: 39 additions & 15 deletions R/validate.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,48 @@
#' @description
#' Various checks to make sure that the reporting triangle passed in to
#' `estimate_delay()` is formatted properly.
#' @param triangle a dataframe formatted as a reporting triangle
#' @param triangle a matrix of values with rows indicating the time points and
#' columns indicating the delays
#' @inheritParams get_delay_estimate
#'
#' @returns NULL, invisibly
validate_triangle <- function(triangle) {
# check column names includes reference date
checkmate::assert_names(names(triangle), must.include = "reference_date")
# check that the columns are integer digits
checkmate::assert_integerish(
as.numeric(grep("^\\d+$", names(triangle), value = TRUE))
)
# Check that this returns integers not decimals
checkmate::assert_integerish(
as.numeric(grep("^\\d+$", names(triangle), value = TRUE))
)
# Check that grep returns something of length greater than 1
if (length(as.numeric(grep("^\\d+$", names(triangle), value = TRUE))) < 1) {
validate_triangle <- function(triangle,
max_delay,
n_history) {
if (nrow(triangle) <= ncol(triangle)) {
cli::cli_abort(
message = "Attempt to convert column names to integers failed"
message = c(
"Number of observations must be greater than the maximum",
"delay"
)
)
}

if (nrow(triangle) < n_history) {
cli::cli_abort(
message = c(
"Number of observations in input data not sufficient for",
"user specified number of historical observations to use",
"for estimaton."
)
)
}

if (ncol(triangle) < (max_delay + 1)) {
cli::cli_abort(
message = c(
"Number of delays in input data not sufficient for",
"user specified maximum delay"
)
)
}

if ((max_delay < 1 || n_history < 1)) {
cli::cli_abort(
message = c(
"Insufficient `max_delay` or `n_history`, must be greater than or",
" equal to 1."
)
)
}
}
15 changes: 8 additions & 7 deletions man/get_delay_estimate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions man/handle_neg_vals.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 12 additions & 2 deletions man/validate_triangle.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

29 changes: 3 additions & 26 deletions tests/testthat/test-get_delay_estimate.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,12 @@
test_that("get_delay_estimate function works correctly", {
set.seed(123)
triangle <- data.table::data.table(
reference_date = as.Date("2023-01-01") + 0:29,
`0` = rpois(30, 100),
`1` = rpois(30, 50),
`2` = rpois(30, 25),
`3` = rpois(30, 10),
`4` = rpois(30, 5)
)
triangle <- matrix(rpois(30 * 5, lambda = 25), nrow = 30, ncol = 5)

# Test 1: Basic functionality
result <- get_delay_estimate(triangle)
expect_is(result, "data.frame")
expect_identical(as.integer(nrow(result)), as.integer(ncol(triangle) - 1))
expect_identical(as.integer(nrow(result)), as.integer(ncol(triangle)))
expect_identical(colnames(result), c("delay", "pmf"))
expect_true(all(result$delay == 0:(ncol(triangle) - 2)))
expect_true(all(result$pmf >= 0 & result$pmf <= 1))
expect_equal(sum(result$pmf), 1, tolerance = 1e-6)

Expand Down Expand Up @@ -42,22 +34,7 @@ test_that("get_delay_estimate function works correctly", {
result2 <- get_delay_estimate(triangle)
expect_identical(result1, result2)

# Test 7: Handling different input types
result_df <- get_delay_estimate(as.data.frame(triangle))
result_dt <- get_delay_estimate(data.table::as.data.table(triangle))
expect_identical(result_dt, result_df)

# Test 8: Check that the function errors if its not passed a triangle
# Test 7: Check that the function errors if its not passed a triangle
triangle_single_day <- triangle[1, ]
expect_error(get_delay_estimate(triangle_single_day))

# Test 9: Check that function throws error if column names are incorrect
triangle_renamed <- as.data.frame(triangle)
triangle_renamed <- stats::setNames(
triangle_renamed,
c("date", paste0("delay_", 0:4))
)
expect_error(get_delay_estimate(triangle_renamed),
regexp = "Names must include the elements"
)
})
Loading

0 comments on commit 1e64384

Please sign in to comment.