Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Censored data 2 eh #224

Merged
merged 4 commits into from
Mar 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ export(TADABigdataRetrieval)
export(TADAReadWQPWebServices)
export(TADAdataRetrieval)
export(autoclean)
export(idCensoredData)
export(identifyPotentialDuplicates)
export(simpleCensoredMethods)
export(summarizeCensoredData)
importFrom(magrittr,"%>%")
181 changes: 145 additions & 36 deletions R/CensoredDataSuite.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,80 @@
#' Prepare Censored Data
#'
#' This function identifies censored data records and characterizes them as non-detects
#' or over-detects based on the ResultDetectionConditionText and DetectionQuantitationLimitTypeName.
#' It also identifies records populated with limits and conditions that cannot be grouped
#' as over- or non-detects as "Other", and flags records where there is a conflict
#' between ResultDetectionConditionText and DetectionQuantitationLimitTypeName.
#' This function is used by default in the simpleCensoredMethods and summarizeCensoredData
#' functions, but can be used on its own.
#'
#' @param .data A TADA dataframe
#'
#' @return A TADA dataframe with additional column named TADA.CensoredData.Flag, which indicates if there are disagreements in ResultDetectionCondition and DetectionQuantitationLimitTypeName.
#'
#' @export
#'

idCensoredData <- function(.data){
# check .data has all of the required columns
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)

## First step: identify censored data
cens = .data%>%dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag=="Result Value/Unit Copied from Detection Limit")
not_cens = .data%>%dplyr::filter(!ResultIdentifier%in%cens$ResultIdentifier)
not_cens$TADA.CensoredData.Flag = "Uncensored"

## Bring in det cond reference table
cond.ref = GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)

## Join to censored data
cens = dplyr::left_join(cens, cond.ref, by = "ResultDetectionConditionText")

conds = unique(cens$ResultDetectionConditionText)
if(any(!conds%in%cond.ref$ResultDetectionConditionText)){
missing_conds = conds[!conds%in%cond.ref$ResultDetectionConditionText]
missing_conds = paste(missing_conds, collapse = ", ")
warning(paste0("ResultDetectionConditionText column in dataset contains value(s) ",missing_conds, " which is/are not represented in the ResultDetectionConditionText WQX domain table. These data records are placed under the TADA.CensoredData.Flag: Censored but not Categorized, and will not be used in censored data handling methods. Please contact TADA administrators to resolve."))
}

## Bring in det limit type reference table
limtype.ref = GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)

## Join to censored data
cens = dplyr::left_join(cens, limtype.ref, by = "DetectionQuantitationLimitTypeName")

limits = unique(cens$DetectionQuantitationLimitTypeName)
if(any(!limits%in%limtype.ref$DetectionQuantitationLimitTypeName)){
missing_lims = limits[!limits%in%limtype.ref$DetectionQuantitationLimitTypeName]
missing_lims = paste(missing_lims, collapse = ", ")
warning(paste0("DetectionQuantitationLimitTypeName column dataset contains value(s) ",missing_lims, " which is/are not represented in the DetectionQuantitationLimitTypeName WQX domain table. These data records are placed under the TADA.CensoredData.Flag: Censored but not Categorized, and will not be used in censored data handling methods. Please contact TADA administrators to resolve."))
}

## Create flag for condition and limit type combinations
cens$TADA.CensoredData.Flag = "Censored but not Categorized"
cens$TADA.CensoredData.Flag = ifelse(cens$TADA.Detection_Type%in%c("Non-Detect")&cens$TADA.Limit_Type%in%c("Non-Detect"),"Non-Detect",cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag = ifelse(cens$TADA.Detection_Type%in%c("Over-Detect")&cens$TADA.Limit_Type%in%c("Over-Detect"),"Over-Detect",cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag = ifelse(cens$TADA.Detection_Type%in%c("Other")&cens$TADA.Limit_Type%in%c("Other"),"Other Condition/Limit Populated",cens$TADA.CensoredData.Flag)
cens$TADA.CensoredData.Flag = ifelse(cens$TADA.Detection_Type%in%c("Non-Detect","Over-Detect","Other")&cens$TADA.Limit_Type%in%c("Non-Detect","Over-Detect","Other")&!cens$TADA.Detection_Type==cens$TADA.Limit_Type,"Conflict between Condition and Limit",cens$TADA.CensoredData.Flag)

## warn when some limit metadata may be problematic
if("Conflict between Condition and Limit"%in%cens$TADA.CensoredData.Flag){
num = length(cens$TADA.CensoredData.Flag[cens$TADA.CensoredData.Flag=="Conflict between Condition and Limit"])
warning(paste0(num," records in supplied dataset have conflicting detection condition and detection limit type information. These records will not be included in detection limit handling calculations."))
}

cens = cens%>%dplyr::select(-TADA.Detection_Type, -TADA.Limit_Type)

cens.check = plyr::rbind.fill(cens, not_cens)
return(cens.check)
}


#' Simple Tools for Censored Data Handling
#'
#' This function determines if detection limit type and detection condition are parsimonious
Expand Down Expand Up @@ -33,8 +110,9 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValue.DataTypeFlag"
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)

# check that multiplier is provided if method = "multiplier"
if(nd_method=="multiplier"&nd_multiplier=="null"){
Expand All @@ -44,43 +122,18 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
stop("Please provide a multiplier for the upper detection limit handling method of 'multiplier'. Typically, the multiplier value is between 0 and 1.")
}

## First step: identify censored data
cens = .data%>%dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag=="Result Value/Unit Copied from Detection Limit")
not_cens = .data%>%dplyr::filter(!ResultIdentifier%in%cens$ResultIdentifier)

## Bring in det cond reference table
cond.ref = GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)

## Join to censored data
cens = dplyr::left_join(cens, cond.ref)

## Bring in det limit type reference table
limtype.ref = GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)

## Join to censored data
cens = dplyr::left_join(cens, limtype.ref)

## Create flag for condition and limit type combinations
cens = cens%>%dplyr::mutate(TADA.CensoredData.Flag = dplyr::case_when(
TADA.Detection_Type=="Non-Detect"&TADA.Limit_Type=="Non-Detect" ~ as.character("Non-Detect"),
TADA.Detection_Type=="Over-Detect"&TADA.Limit_Type=="Over-Detect" ~ as.character("Over-Detect"),
TADA.Detection_Type=="Other"&TADA.Limit_Type=="Other" ~ as.character("Other Condition/Limit Populated"),
!TADA.Detection_Type==TADA.Limit_Type ~ as.character("Conflict between Condition and Limit")
))

## warn when some limit metadata may be problematic
if("Conflict between Condition and Limit"%in%cens$TADA.CensoredData.Flag){
num = length(cens$TADA.CensoredData.Flag[cens$TADA.CensoredData.Flag=="Conflict between Condition and Limit"])
warning(paste0(num," records in supplied dataset have conflicting detection condition and detection limit type information. These records will not be included in detection limit handling calculations."))
# If user has not previously run idCensoredData function, run it here to get required columns
if(!"TADA.CensoredData.Flag"%in%names(.data)){
cens.data = idCensoredData(.data)
}else{
cens.data = .data
}

cens = cens%>%dplyr::select(-TADA.Detection_Type, -TADA.Limit_Type)

# split out over detects and non detects
nd = subset(cens, cens$TADA.CensoredData.Flag=="Non-Detect")
od = subset(cens, cens$TADA.CensoredData.Flag=="Over-Detect")
other = subset(cens, !cens$ResultIdentifier%in%c(nd$ResultIdentifier,od$ResultIdentifier))
nd = subset(cens.data, cens.data$TADA.CensoredData.Flag=="Non-Detect")
od = subset(cens.data, cens.data$TADA.CensoredData.Flag=="Over-Detect")
all_others = subset(cens.data, !cens.data$ResultIdentifier%in%c(nd$ResultIdentifier,od$ResultIdentifier))

# ND handling
if(dim(nd)[1]>0){
if(nd_method=="multiplier"){
Expand Down Expand Up @@ -108,7 +161,63 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
}
}

.data = plyr::rbind.fill(not_cens, nd, od, other)
.data = plyr::rbind.fill(nd, od, all_others)
.data = OrderTADACols(.data)
return(.data)
}

#' Summarize Censored Data
#'
#' This function creates a summary table of the percentage of non-detects by
#' specified ID columns. It can be used to determine the best method for handling
#' censored data estimation methods that depend upon the distribution of the dataset.
#'
#' @param .data A TADA dataframe
#' @param spec_cols A vector of column names to be used as aggregating variables when summarizing censored data information.
#' @return A summary dataframe yielding sample ncounts, censored data ncounts,
#' and percent of dataset that is censored, aggregated by user-defined grouping
#' variables. Also produces a column "TADA.Censored.Note" that identifies
#' when there is sufficient non-censored data to estimate censored data using statistical
#' methods including Maximum Likelihood Estimation, Robust ROS and Kaplan Meier.
#' The decision tree used to identify applicable statistical analyses is based
#' on the Baseline Assessment of Left-Censored Environmental Data Using R Tech Note.
#' More info can be found here: https://www.epa.gov/sites/default/files/2016-05/documents/tech_notes_10_jun2014_r.pdf
#'
#'
#' @export

summarizeCensoredData <- function(.data, spec_cols = c("TADA.CharacteristicName","TADA.ResultMeasure.MeasureUnitCode","TADA.ResultSampleFractionText","TADA.MethodSpecificationName")){

if(any(is.na(.data$TADA.ResultMeasureValue))){
warning("Dataset contains data missing both a result value and a detection limit. Suggest removing or handling before summarizing.")
}

if(!"TADA.CensoredData.Flag"%in%names(.data)){
cens = idCensoredData(.data)
}else{
cens = .data
}

sum_low = cens%>%dplyr::group_by_at(spec_cols)%>%
dplyr::filter(TADA.CensoredData.Flag%in%c("Non-Detect", "Uncensored"))%>%
dplyr::summarise(Sample_Count = length(unique(ResultIdentifier)), Censored_Count = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag=="Non-Detect"]), Percent_Censored = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag=="Non-Detect"])/length(TADA.CensoredData.Flag)*100, Censoring_Levels = length(unique(TADA.ResultMeasureValue[TADA.CensoredData.Flag=="Non-Detect"])))%>%
dplyr::filter(Censored_Count>0)%>%
dplyr::mutate("TADA.CensoredData.Flag" ="Non-Detect")

sum_hi = cens%>%dplyr::group_by_at(spec_cols)%>%
dplyr::filter(TADA.CensoredData.Flag%in%c("Over-Detect", "Uncensored"))%>%
dplyr::summarise(Sample_Count = length(unique(ResultIdentifier)), Censored_Count = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag=="Over-Detect"]), Percent_Censored = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag=="Over-Detect"])/length(TADA.CensoredData.Flag)*100, Censoring_Levels = length(unique(TADA.ResultMeasureValue[TADA.CensoredData.Flag=="Over-Detect"])))%>%
dplyr::filter(Censored_Count>0)%>%
dplyr::mutate("TADA.CensoredData.Flag" ="Over-Detect")

sum_all = plyr::rbind.fill(sum_low, sum_hi)

sum_all = sum_all%>%dplyr::mutate(TADA.Censored.Note = dplyr::case_when(
Percent_Censored>80 ~ as.character("Percent censored too high for estimation methods"),
Percent_Censored<50&Censoring_Levels>1 ~ as.character("Kaplan-Meier"),
Percent_Censored<50&Sample_Count>=50 ~ as.character("Robust Regression Order Statistics"),
Sample_Count<50 ~ as.character("Robust Regression Order Statistics"),
Percent_Censored>=50&Sample_Count<50 ~ as.character("Maximum Likelihood Estimation")
))
return(sum_all)
}
2 changes: 1 addition & 1 deletion docs/articles/CONTRIBUTING.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading