diff --git a/DESCRIPTION b/DESCRIPTION index 59c1b93..c2c0a96 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Imports: Rcpp LinkingTo: Rcpp URL: https://github.com/metrumresearchgroup/lastdose BugReports: https://github.com/metrumresearchgroup/lastdose/issues -Suggests: testthat +Suggests: testthat, withr Encoding: UTF-8 LazyData: true RoxygenNote: 7.1.1 diff --git a/NEWS.md b/NEWS.md index 159fbcb..f138f0f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ # lastdose (development version) +- Change default value for `include_tafd` to FALSE (#29) +- Add option `lastdose.include_tafd` (#29) +- Add option `lastdose.id_col` (#27) +- Add option `lastdose.time_units` (#27) +- Search column names for candidate ID columns using `find_id_col()`; + this function is unexported (#25) +- Search column names for candidate time columns using `find_time_col()`; + this function is unexported (#25) +- Handle missing values (`NA`) in the time column (#30) + # lastdose 0.3.2 - Fix bug where `II` column was not properly detected resulting in incorrect diff --git a/R/lastdose.R b/R/lastdose.R index 2cf3829..a509bb2 100644 --- a/R/lastdose.R +++ b/R/lastdose.R @@ -15,11 +15,15 @@ NULL #' @param time_col character name for the `TIME` column; this could be time after #' first dose or time after first record or time relative to any origin; input #' may be `numeric` or `POSIXct` (e.g. `DATETIME`); if `POSIXct`, a numeric -#' value will be calculated based on the value of `time_units` +#' value will be calculated based on the value of `time_units`. The data frame +#' will be searched for the first matching candidate time column using +#' [find_time_col()]. #' @param time_units for calculating time when the time column inherits #' `POSIXct`; you may use any value that is valid for [difftime()] #' @param id_col character name for the subject `ID` column; may be numeric -#' or character; if character, a numeric value is derived +#' or character; if character, a numeric value is derived. The data frame +#' will be searched for the first matching candidate `ID` column using +#' [find_id_col()]. #' @param back_calc if `TRUE`, then the time before the first dose #' is calculated for records prior to the first dosing record when #' at least one dosing record is found in the data set. Records before @@ -41,13 +45,31 @@ NULL #' @param include_tafd `logical`; if `FALSE`, then `TAFD` data is not appended #' to the data set. Only used for the [lastdose()] function. #' +#' @section Options: +#' +#' These are options that can be set to customize `lastdose` behavior +#' for the current context. See `?options` for how to set an option. +#' +#' - `lastdose.time_units`: sets the default time unit that is used to calculate +#' relative times when the time column is represented as date-time data +#' (`POSIXct`) +#' - `lastdose.id_col`: sets the default value for the `id_col` argument +#' to last dose; this identifies the column that is to be used to distinguish +#' individuals; the data in this column may be numeric or character +#' - `lastdose.include_tafd`: sets default value for `include_tafd`; if `TRUE` +#' then the time since the first dose record (EVID 1 or EVID 4) in the data +#' set will be automatically appended to the output data frame when +#' calling `lastdose()`; `tafd` is always included when calling +#' `lastdose_df()` and `lastdose_list()` +#' #' @details #' -#' When calling [lastdose()] to modify the data frame, three columns will be +#' When calling [lastdose()] to modify the data frame, two columns will be #' added (by default): `TAD` indicating the time after the most-recent dose, -#' `TAFD` indicating the time after the first dose and `LDOS` indicating the -#' amount of the most recent dose. This default behavior can be modified with -#' the `include_ldos` and `include_tafd` arguments. +#' and `LDOS` indicating the amount of the most recent dose. `TAFD` indicating +#' the time after the first dose record (EVID 1 or 4) can be added via the +#' `include_tafd` argument and users can opt out from adding `LDOS` with the +#' `include_ldos` argument. #' #' When calling [lastdose_list()] or [lastdose_df()], the respective items are #' accessible with `tad`, `tafd`, and `ldos` (note the lower case form here to @@ -102,8 +124,9 @@ NULL #' #' #' @export -lastdose <- function(data,..., include_ldos = TRUE, include_tafd = TRUE) { - ans <- lastdose_list(data,...) +lastdose <- function(data, ..., include_ldos = TRUE, + include_tafd = getOption("lastdose.include_tafd", FALSE)) { + ans <- lastdose_list(data, ...) data[["TAD"]] <- ans[["tad"]] if(include_tafd) data[["TAFD"]] <- ans[["tafd"]] if(include_ldos) data[["LDOS"]] <- ans[["ldos"]] @@ -113,40 +136,38 @@ lastdose <- function(data,..., include_ldos = TRUE, include_tafd = TRUE) { #' @rdname lastdose #' @export lastdose_list <- function(data, - time_col = "TIME", - time_units = NULL, - id_col = "ID", + time_col = find_time_col(data), + time_units = getOption("lastdose.time_units", NULL), + id_col = find_id_col(data), fill = -99, back_calc = TRUE, addl_ties = c("obs_first", "dose_first"), comments = find_comments(data)) { - if(length(comments)==1) { + + if(length(comments) == 1) { comments <- rep(comments,nrow(data)) } - if(!length(comments)==nrow(data)) { - stop("'comments' must be have length equal to the number of rows in 'data'",call.=FALSE) + if(length(comments) != nrow(data)) { + stop( + "'comments' must be have length equal to the number of rows in 'data'", + call. = FALSE + ) } addl_ties <- match.arg(addl_ties) sort1 <- addl_ties == "obs_first" lower_names <- tolower(names(data)) - wid <- match(id_col, names(data)) - if(is.na(wid)) { - stop("did not find id column `", id_col, "` in `data`", call.=FALSE) - } - col_id <- data[[wid]] - if(is.character(col_id)) { - col_id <- match(col_id, unique(col_id)) - } - if(!is.numeric(col_id)) { - stop("id column is required to be numeric", call.=FALSE) - } wtime <- match(time_col, names(data)) if(is.na(wtime)) { - stop("did not find time column `", time_col, "` in `data`", call.=FALSE) + stop("did not find time column `", time_col, "` in `data`", call. = FALSE) + } + has_na_time <- anyNA(data[[wtime]]) + if(has_na_time) { + na_time <- is.na(data[[wtime]]) + data <- data[!na_time,, drop = FALSE] } col_time <- data[[wtime]] if(inherits(col_time, "POSIXct")) { - if(missing(time_units)) { + if(is.null(time_units)) { stop( "`time_units` is required when time column inherits `POSIXct`", call.=FALSE @@ -164,6 +185,17 @@ lastdose_list <- function(data, if(!is.numeric(col_time)) { stop("time column is required to be numeric", call.=FALSE) } + wid <- match(id_col, names(data)) + if(is.na(wid)) { + stop("did not find id column `", id_col, "` in `data`", call.=FALSE) + } + col_id <- data[[wid]] + if(is.character(col_id)) { + col_id <- match(col_id, unique(col_id)) + } + if(!is.numeric(col_id)) { + stop("id column is required to be numeric", call.=FALSE) + } wamt <- match("amt", lower_names) if(is.na(wamt)) { stop("column AMT or amt is required in the data set", call.=FALSE) @@ -214,19 +246,25 @@ lastdose_list <- function(data, sort1, comments ) + if(has_na_time) { + re_order <- order(c(which(!na_time), which(na_time))) + for(j in seq_along(ans)) { + ans[[j]] <- ans[[j]][re_order] + } + } ans } #' @rdname lastdose #' @export -lastdose_df <- function(data,...) { - ans <- lastdose_list(data,...) +lastdose_df <- function(data, ...) { + ans <- lastdose_list(data, ...) data.frame( tad = ans[["tad"]], tafd = ans[["tafd"]], ldos = ans[["ldos"]], - stringsAsFactors=FALSE,check.names=FALSE, - fix.empty.names=FALSE, row.names=NULL + stringsAsFactors = FALSE, check.names = FALSE, + fix.empty.names = FALSE, row.names = NULL ) } @@ -260,11 +298,11 @@ lastdose_df <- function(data,...) { #' #' #' @export -find_comments <- function(x,...) UseMethod("find_comments") +find_comments <- function(x, ...) UseMethod("find_comments") #' @rdname find_comments #' #' @export -find_comments.data.frame <- function(x,...) { +find_comments.data.frame <- function(x, ...) { if(!inherits(x[["C"]], c("logical", "character"))) { if(exists("C", x)) { warning( @@ -279,7 +317,7 @@ find_comments.data.frame <- function(x,...) { #' @rdname find_comments #' @export -find_comments.character <- function(x,...) { +find_comments.character <- function(x, ...) { !(is.na(x)|x=='.') } @@ -288,3 +326,66 @@ find_comments.character <- function(x,...) { find_comments.logical <- function(x, ...) { x & !is.na(x) } + +#' Find TIME column +#' +#' Search data frame names for the first matching candidate TIME column name. +#' See `details`. +#' +#' @param data a data.frame to search +#' @details +#' Column names will be searched against the following candidates +#' +#' - `TIME` +#' - `DATETIME` +#' +#' The first the first candidate to be matched will be returned. If there +#' are no matches, an error is generated. +#' +#' @examples +#' data <- data.frame(A = 1, DATETIME = 2, TIME = 3, Z = 99) +#' lastdose:::find_time_col(data) +#' +find_time_col <- function(data) { + stopifnot(is.data.frame(data)) + ans <- intersect(c("TIME", "DATETIME"), names(data)) + if(length(ans)==0) { + stop("could not find a TIME column in `data`", call. = FALSE) + } + ans[1] +} + +#' Find ID column +#' +#' Search data frame names for the first matching candidate ID column name. +#' See `details`. +#' +#' @param data a data.frame to search +#' @details +#' Column names will be searched against the following candidates +#' +#' - `getOption("lastdose.id_col")` +#' - `ID` +#' - `USUBJID` +#' - `SUBJID` +#' - `PTNO` +#' - `SUBJ` +#' +#' The first the first candidate to be matched will be returned. If there +#' are no matches, an error is generated. +#' +#' @examples +#' data <- data.frame(A = 1, B = 2, PTNO = 3, ID = 4, Z = 99) +#' lastdose:::find_id_col(data) +#' +find_id_col <- function(data) { + stopifnot(is.data.frame(data)) + op <- getOption("lastdose.id_col", NULL) + can <- c(op, "ID", "USUBJID", "SUBJID", "PTNO", "SUBJ") + ans <- intersect(can, names(data)) + if(length(ans)==0) { + stop("could not find a subject identifier column in `data`", call. = FALSE) + } + ans[1] +} + diff --git a/man/find_id_col.Rd b/man/find_id_col.Rd new file mode 100644 index 0000000..f861c33 --- /dev/null +++ b/man/find_id_col.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lastdose.R +\name{find_id_col} +\alias{find_id_col} +\title{Find ID column} +\usage{ +find_id_col(data) +} +\arguments{ +\item{data}{a data.frame to search} +} +\description{ +Search data frame names for the first matching candidate ID column name. +See \code{details}. +} +\details{ +Column names will be searched against the following candidates +\itemize{ +\item \code{getOption("lastdose.id_col")} +\item \code{ID} +\item \code{USUBJID} +\item \code{SUBJID} +\item \code{PTNO} +\item \code{SUBJ} +} + +The first the first candidate to be matched will be returned. If there +are no matches, an error is generated. +} +\examples{ +data <- data.frame(A = 1, B = 2, PTNO = 3, ID = 4, Z = 99) +lastdose:::find_id_col(data) + +} diff --git a/man/find_time_col.Rd b/man/find_time_col.Rd new file mode 100644 index 0000000..d644932 --- /dev/null +++ b/man/find_time_col.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lastdose.R +\name{find_time_col} +\alias{find_time_col} +\title{Find TIME column} +\usage{ +find_time_col(data) +} +\arguments{ +\item{data}{a data.frame to search} +} +\description{ +Search data frame names for the first matching candidate TIME column name. +See \code{details}. +} +\details{ +Column names will be searched against the following candidates +\itemize{ +\item \code{TIME} +\item \code{DATETIME} +} + +The first the first candidate to be matched will be returned. If there +are no matches, an error is generated. +} +\examples{ +data <- data.frame(A = 1, DATETIME = 2, TIME = 3, Z = 99) +lastdose:::find_time_col(data) + +} diff --git a/man/lastdose.Rd b/man/lastdose.Rd index 1586d96..11c6dd0 100644 --- a/man/lastdose.Rd +++ b/man/lastdose.Rd @@ -6,13 +6,18 @@ \alias{lastdose_df} \title{Calculate last dose amount and times since previous doses} \usage{ -lastdose(data, ..., include_ldos = TRUE, include_tafd = TRUE) +lastdose( + data, + ..., + include_ldos = TRUE, + include_tafd = getOption("lastdose.include_tafd", FALSE) +) lastdose_list( data, - time_col = "TIME", - time_units = NULL, - id_col = "ID", + time_col = find_time_col(data), + time_units = getOption("lastdose.time_units", NULL), + id_col = find_id_col(data), fill = -99, back_calc = TRUE, addl_ties = c("obs_first", "dose_first"), @@ -35,13 +40,17 @@ to the data set. Only used for the \code{\link[=lastdose]{lastdose()}} function \item{time_col}{character name for the \code{TIME} column; this could be time after first dose or time after first record or time relative to any origin; input may be \code{numeric} or \code{POSIXct} (e.g. \code{DATETIME}); if \code{POSIXct}, a numeric -value will be calculated based on the value of \code{time_units}} +value will be calculated based on the value of \code{time_units}. The data frame +will be searched for the first matching candidate time column using +\code{\link[=find_time_col]{find_time_col()}}.} \item{time_units}{for calculating time when the time column inherits \code{POSIXct}; you may use any value that is valid for \code{\link[=difftime]{difftime()}}} \item{id_col}{character name for the subject \code{ID} column; may be numeric -or character; if character, a numeric value is derived} +or character; if character, a numeric value is derived. The data frame +will be searched for the first matching candidate \code{ID} column using +\code{\link[=find_id_col]{find_id_col()}}.} \item{fill}{the value for \code{TAD} and \code{TAFD} that is used for records when no doses are found for an individual or when \code{back_calc} is \code{FALSE}.} @@ -70,11 +79,12 @@ to add (or potentially replace) columns to the input data frame; as either \code{list} or \code{data.frame} format without modifying the input data. } \details{ -When calling \code{\link[=lastdose]{lastdose()}} to modify the data frame, three columns will be +When calling \code{\link[=lastdose]{lastdose()}} to modify the data frame, two columns will be added (by default): \code{TAD} indicating the time after the most-recent dose, -\code{TAFD} indicating the time after the first dose and \code{LDOS} indicating the -amount of the most recent dose. This default behavior can be modified with -the \code{include_ldos} and \code{include_tafd} arguments. +and \code{LDOS} indicating the amount of the most recent dose. \code{TAFD} indicating +the time after the first dose record (EVID 1 or 4) can be added via the +\code{include_tafd} argument and users can opt out from adding \code{LDOS} with the +\code{include_ldos} argument. When calling \code{\link[=lastdose_list]{lastdose_list()}} or \code{\link[=lastdose_df]{lastdose_df()}}, the respective items are accessible with \code{tad}, \code{tafd}, and \code{ldos} (note the lower case form here to @@ -119,6 +129,26 @@ dosing and both an an additional dose and an observation happen at 24 hours, \code{obs_first} will set the observation \code{TAD} to 24 and \code{dose_first} will set the observation \code{TAD} to 0. } +\section{Options}{ + + +These are options that can be set to customize \code{lastdose} behavior +for the current context. See \code{?options} for how to set an option. +\itemize{ +\item \code{lastdose.time_units}: sets the default time unit that is used to calculate +relative times when the time column is represented as date-time data +(\code{POSIXct}) +\item \code{lastdose.id_col}: sets the default value for the \code{id_col} argument +to last dose; this identifies the column that is to be used to distinguish +individuals; the data in this column may be numeric or character +\item \code{lastdose.include_tafd}: sets default value for \code{include_tafd}; if \code{TRUE} +then the time since the first dose record (EVID 1 or EVID 4) in the data +set will be automatically appended to the output data frame when +calling \code{lastdose()}; \code{tafd} is always included when calling +\code{lastdose_df()} and \code{lastdose_list()} +} +} + \examples{ file <- system.file("csv/data1.csv", package="lastdose") diff --git a/src/lastdose.cpp b/src/lastdose.cpp index fd5a099..23ab7f3 100644 --- a/src/lastdose.cpp +++ b/src/lastdose.cpp @@ -103,7 +103,7 @@ Rcpp::List lastdose_impl(Rcpp::NumericVector id, double last_time = -1E9; for(int j = idstart[i]; j <= idend[i]; ++j) { // If time is missing - if(Rcpp::NumericVector::is_na(time[j])) { + if(isna(time[j])) { tad[crow] = NA_REAL; ldos[crow] = NA_REAL; ++crow; @@ -121,7 +121,7 @@ Rcpp::List lastdose_impl(Rcpp::NumericVector id, } // Deal with missing dose bool missing_amt = Rcpp::NumericVector::is_na(amt[j]); - if(is_dose(evid[j],comment[j])) { + if(is_dose(evid[j], comment[j])) { if(!found_dose) { found_dose = true; tofd[i] = time[j]; diff --git a/tests/testthat/test-lastdose.R b/tests/testthat/test-lastdose.R index df912b9..083715e 100644 --- a/tests/testthat/test-lastdose.R +++ b/tests/testthat/test-lastdose.R @@ -24,6 +24,8 @@ test_that("doses at time zero", { test_that("time after first dose", { x <- lastdose(set4) + expect_false(exists("TAFD", x)) + x <- lastdose(set4, include_tafd = TRUE) expect_true(exists("TAFD", x)) dose_rows <- which(set4$evid==1) time_of_first_dose <- set4$TIME[dose_rows[1]] @@ -119,6 +121,53 @@ test_that("error for missing values in ID,evid,ii,addl", { } }) +test_that("handle missing values in time colunn", { + dd <- df[df$set==4,] + dd$TIME <- as.numeric(dd$TIME) + dd0 <- dd + set.seed(1010) + i <- sample(seq(nrow(dd)), size = 18) + i <- i[!dd$evid[i]==1] + dd$TIME[i] <- NA_real_ + ans1 <- lastdose(dd) + ans2 <- lastdose(dd0) + w <- setdiff(seq(nrow(dd)),i) + for(col in names(ans1)) { + expect_identical(ans1[w,col], ans2[w,col]) + } + ans3 <- ans1[i,] + expect_true(all(is.na(ans3$TAD))) + expect_true(all(is.na(ans3$LDOS))) + expect_true(all(is.na(ans3$TAFD))) + ans4 <- ans1[w,] + expect_false(any(is.na(ans4$TAD))) + expect_false(any(is.na(ans4$LDOS))) + expect_false(any(is.na(ans4$TAFD))) + + file <- system.file("csv", "data_big.RDS", package = "lastdose") + data <- readRDS(file) + set.seed(21032) + x <- sample(seq(nrow(data)), 1000) + x <- x[data$EVID[x] ==0] + data2 <- data + data2$TIME[x] <- NA_real_ + out1 <- lastdose(data, include_tafd = TRUE) + out2 <- lastdose(data2, include_tafd = TRUE) + ans <- as.numeric(c(0, length(x))) + smr <- function(x,y) { + x <- x-y + c(sum(x, na.rm = TRUE), sum(is.na(x))) + } + a <- smr(out1$TAD, out2$TAD) + b <- smr(out1$TIME, out2$TIME) + c <- smr(out1$LDOS, out2$LDOS) + d <- smr(out1$TAFD, out2$TAFD) + expect_identical(a, ans) + expect_identical(b, ans) + expect_identical(c, ans) + expect_identical(d, ans) +}) + test_that("NA amt is error for dosing record, ok otherwise", { dd <- set1 dd$amt[5] <- NA_real_ @@ -162,23 +211,64 @@ test_that("undefined behavior when checking ADDL and II issue-11", { test_that("user-named time and id columns", { d1 <- subset(set1, ID==1) d2 <- d1 - d2$TAFD <- d2$TIME + d2$xTAFD <- d2$TIME d2$TIME <- NULL expect_identical( lastdose_df(d1), - lastdose_df(d2, time_col = "TAFD") + lastdose_df(d2, time_col = "xTAFD") ) expect_error(lastdose(d2), msg = "did not find time column") d2 <- d1 - d2$USUBJID <- "A" + d2$xUSUBJID <- "A" d2$ID <- NULL expect_identical( lastdose_df(d1), - lastdose_df(d2, id_col = "USUBJID") + lastdose_df(d2, id_col = "xUSUBJID") ) + if(requireNamespace("withr")) { + expect_identical( + lastdose_df(d2, id_col = "xUSUBJID"), + withr::with_options( + list(lastdose.id_col = "xUSUBJID"), + lastdose_df(d2) + ) + ) + } expect_error(lastdose(d2)) }) +test_that("find time column from candidate list", { + dd <- subset(set1, ID==1) + time <- dd$TIME + dd$TIME <- NULL + tr <- c("TIME", "DATETIME") + for(col in tr) { + dd[[col]] <- time + expect_is(lastdose(dd), "data.frame") + dd[[col]] <- NULL + } +}) + +test_that("find ID column from candidate list", { + dd <- subset(set1, ID==1)[1:3,] + ID <- dd$ID + dd$ID <- NULL + tr <- c("ID", "USUBJID", "SUBJID", "PTNO", "SUBJ") + for(col in tr) { + dd[[col]] <- ID + expect_is(lastdose(dd), "data.frame") + dd[[col]] <- NULL + } + if(requireNamespace("withr")) { + dd[["i_d"]] <- ID + ans1 <- withr::with_options( + list(lastdose.id_col = "i_d"), + lastdose(dd) + ) + expect_is(ans1, "data.frame") + } +}) + test_that("POSIXct datetime is converted to numeric time", { d1 <- subset(set1, ID <= 2) d2 <- d1 @@ -192,6 +282,14 @@ test_that("POSIXct datetime is converted to numeric time", { expect_error( lastdose(d1, time_units = "seconds") ) + if(requireNamespace("withr")) { + ans1 <- withr::with_options( + list(lastdose.time_units = "hours"), + lastdose(d1) + ) + ans2 <- lastdose(d1, time_units = "hours") + expect_identical(ans1, ans2) + } }) test_that("logical comment column is ok", {