From a1cdf1a3ddea1b6cded81a767566431ee9058210 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 22 Jul 2024 12:38:04 +0000 Subject: [PATCH 01/22] update how the design diagram is inserted in the package design vignette --- vignettes/design_principle.Rmd | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vignettes/design_principle.Rmd b/vignettes/design_principle.Rmd index a9ab09a3..65b6e0fc 100644 --- a/vignettes/design_principle.Rmd +++ b/vignettes/design_principle.Rmd @@ -26,10 +26,7 @@ Data cleaning is an important phase for ensuring the efficacy of downstream anal The {cleanepi} R package is designed to offer functional programming-style data cleansing tasks. To streamline the organization of data cleaning operations, we have categorized them into distinct groups referred to as **modules**. These modules are based on overarching goals derived from commonly anticipated data cleaning procedures. Each module features a primary function along with additional helper functions tailored to accomplish specific tasks. It's important to note that, except for few cases where the outcome a helper function can impact on the cleaning task, only the main function of each module will be exported. This deliberate choice empowers users to execute individual cleaning tasks as needed, enhancing flexibility and usability. -```{r echo=FALSE, comment="Figure1"} -knitr::include_graphics(file.path("..", "man", "figures", - "cleanepi_design_diagram.drawio.png")) -``` +![cleanepi design diagram](../man/figures/cleanepi_design_diagram.drawio.png) At the core of {cleanepi}, the pivotal function `clean_data()` serves as a wrapper encapsulating all the modules, as illustrated in the figure above. This function is intended to be the primary entry point for users seeking to cleanse their data. It performs the cleaning operations as requested by the user through the set of parameters that need to be explicitly defined. Furthermore, multiple cleaning operations can be performed sequentially using the “pipe” operator (`%>%`). 
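The two styles can be sketched as follows; the cleaning parameters are omitted and the module function names in the piped version are placeholders rather than the exact exported API:

```r
# A minimal sketch of the two usage styles described above; arguments are
# omitted and the piped function names are placeholders, not the exact API.
library(cleanepi)

raw_data <- readRDS(system.file("extdata", "messy_data.RDS",
                                package = "cleanepi"))

# 1. a single call to the wrapper, with the requested operations passed as
#    explicitly defined parameters
# cleaned_data <- clean_data(data = raw_data, ...)

# 2. the same operations chained one after the other with the pipe operator
# cleaned_data <- raw_data %>%
#   first_cleaning_operation() %>%
#   second_cleaning_operation()
```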
In addition, this package also has two surrogate functions: From 64b6dc14bcf503fb53ae11b771cf01682c84cac2 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 22 Jul 2024 17:22:36 +0000 Subject: [PATCH 02/22] fix bug in scan_data() function --- R/clean_data_helpers.R | 44 +++++++++++++++++++++++++++++++----------- man/scan_columns.Rd | 2 +- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 4e37ebd8..77d4e459 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -9,7 +9,7 @@ #' #' @keywords internal #' -scan_columns <- function(x) { +scan_columns <- function(x, type) { # --- save the variable length --- n_rows <- length(x) @@ -18,8 +18,12 @@ scan_columns <- function(x) { x <- x[!is.na(x)] # --- get the proportion of numeric values --- - tmp <- suppressWarnings(as.numeric(x)) - are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) + are_numeric <- 0L + verdict <- type == "logical" | type == "factor" + if (!verdict) { + tmp <- suppressWarnings(as.numeric(x)) + are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) + } # --- get the proportion of date values --- x <- x[which(is.na(tmp))] @@ -39,11 +43,18 @@ scan_columns <- function(x) { } # --- get the proportion of logical values --- - are_logical <- round((sum(is.logical(x)) / n_rows), 6L) + are_logical <- 0 + if (type == "logical") { + are_logical <- round((1.0 - (are_na + are_date)), 6L) + } # --- get the proportion of character values --- - are_character <- round((1.0 - (are_na + are_numeric + - are_date + are_logical)), 6L) + are_character <- 0L + if (!verdict) { + are_character <- round((1.0 - (are_na + are_numeric + + are_date + are_logical)), 6L) + } + # --- return the output --- return(c(are_na, are_numeric, are_date, are_character, are_logical)) @@ -66,11 +77,22 @@ scan_columns <- function(x) { #' package = "cleanepi")) #' ) scan_data <- function(data) { - scan_result <- data.frame(t(apply(data, 2L, scan_columns))) - names(scan_result) <- c("missing", "numeric", "date", "character", + # when scanning through the data, logical and factor columns will be treated + # differently from the others. It means only the percent of missing and Date + # values will be evaluated for these columns. + # The percent of numeric and character value will be set automatically to 0 as + # to prevent from the effects of the conversion to numeric and character. 
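The coercion problem that this comment block guards against can be reproduced on a small, made-up factor:

```r
# Coercing a factor with as.numeric() returns the internal level codes rather
# than the labels, so every value of a factor column would wrongly count as
# numeric. The example vector is made up.
x <- factor(c("orange", "gray", "orange"))
suppressWarnings(as.numeric(x))                # 2 1 2  (level codes)
suppressWarnings(as.numeric(as.character(x)))  # NA NA NA (none are numeric)
```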
+ # + types <- as.character(sapply(data, class)) + scan_result <- NULL + j <- 1 + for (i in names(data)) { + scan_result <- rbind(scan_result, scan_columns(data[[i]], types[j])) + j <- j + 1 + } + scan_result <- as.data.frame(scan_result) + names(scan_result) <- c("missing", "numeric", "date", "character", "logical") - row_names <- rownames(scan_result) - rownames(scan_result) <- NULL - scan_result <- cbind(Field_names = row_names, scan_result) + scan_result <- cbind(Field_names = names(data), scan_result) return(scan_result) } diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd index 742fc433..d974f677 100644 --- a/man/scan_columns.Rd +++ b/man/scan_columns.Rd @@ -6,7 +6,7 @@ containing different data types such as numeric, Date, character, and logical.} \usage{ -scan_columns(x) +scan_columns(x, type) } \arguments{ \item{x}{A vector of ones or a combination of various data types.} From f069c59db927bdd0765420201506cd92c51136d7 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 22 Jul 2024 17:22:36 +0000 Subject: [PATCH 03/22] fix bug in scan_data() function --- R/clean_data_helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 77d4e459..51ed31e7 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -83,7 +83,7 @@ scan_data <- function(data) { # The percent of numeric and character value will be set automatically to 0 as # to prevent from the effects of the conversion to numeric and character. # - types <- as.character(sapply(data, class)) + types <- as.character(vapply(data, class, character(1L))) scan_result <- NULL j <- 1 for (i in names(data)) { From 1df81b979c06109bb5d24f466d8d9173c5216517 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Tue, 23 Jul 2024 09:20:07 +0000 Subject: [PATCH 04/22] Automatic readme update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f7d1ddd5..2b5465a3 100644 --- a/README.md +++ b/README.md @@ -498,7 +498,7 @@ cleaned_data <- clean_data( #> checking subject IDs format #> Warning: Detected incorrect subject ids at lines: 3, 5, 7 #> Use the correct_subject_ids() function to adjust them. 
-#> convertingsexeninto numeric +#> converting sex, en into numeric #> performing dictionary-based cleaning ``` From 078514f070c60bb2da43a46c941454f547aa531b Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 22 Jul 2024 17:22:36 +0000 Subject: [PATCH 05/22] fix bug in scan_data() function --- DESCRIPTION | 2 +- R/clean_data_helpers.R | 115 ++++++++++++++++++++++++-------- man/cleanepi-package.Rd | 2 +- man/scan_columns.Rd | 2 + man/scan_lgl_and_fct_columns.Rd | 21 ++++++ 5 files changed, 111 insertions(+), 31 deletions(-) create mode 100644 man/scan_lgl_and_fct_columns.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 10b01639..54b48d9e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Authors@R: c( person("Nuredin", "Mohammed", , "Nuredin.Mohammed@lshtm.ac.uk", role = "aut"), person("Bubacarr", "Bah", , "Bubacarr.Bah1@lshtm.ac.uk", role = "aut", comment = c(ORCID = "0000-0003-3318-6668")), - person("Hugo", "Gruson", , "hugo@data.org", role = "rev", + person("Hugo", "Gruson", , "hugo@data.org", role = c("ctb", "rev"), comment = c(ORCID = "0000-0002-4094-1476")), person("Pratik R.", "Gupte", , "pratik.gupte@lshtm.ac.uk", role ="rev", comment = c(ORCID = "0000-0001-5294-7819")), diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 51ed31e7..ffab9559 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -1,8 +1,66 @@ + +#' Calculate the percentage of missing and other data type values from a vector +#' of factor or logical values. +#' +#' @param x A vector of ones or a combination of various data types. +#' +#' @returns A vector of 5 elements representing the percentage of missing, +#' numeric, date, character, and logical values found in the input vector. +#' +#' @keywords internal +#' +scan_lgl_and_fct_columns <- function(x) { + ## for logical and factor columns, we will use their `levels` to determine + ## the proportion of the different types + are_numeric <- are_date <- are_logical <- are_character <- 0L + + # save the variable length + n_rows <- length(x) + + # get the proportion of NA + are_na <- round((sum(is.na(x)) / n_rows), 6L) + x <- x[!is.na(x)] + + # get the proportion of logical values + if (is.logical(x)) { + are_logical <- round((sum(x) + sum(!x)) / n_rows, 6L) + return(c(are_na, are_numeric, are_date, are_character, are_logical)) + } + + # get the proportion of numeric values + l <- levels(x) + numeric_levels <- l[!is.na(suppressWarnings(as.numeric(l)))] + if (length(numeric_levels) > 0L) { + are_numeric <- round((sum(x %in% numeric_levels) / n_rows), 6L) + } + + date_levels <- l[!is.na(suppressWarnings( + as.Date( + lubridate::parse_date_time( + l, + orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", + "dmY", "mdY", "mYd", "dYm") + ) + ) + ))] + if (length(date_levels) > 0L) { + are_date <- round((sum(x %in% date_levels) / n_rows), 6L) + } + + are_character <- round((1.0 - (are_na + are_numeric + + are_date + are_logical)), 6L) + + # return the output + return(c(are_na, are_numeric, are_date, are_character, are_logical)) +} + + #' Calculate the percentage of missing and other data type values in a vector #' containing different data types such as numeric, Date, character, and #' logical. #' #' @param x A vector of ones or a combination of various data types. +#' @param type A character with the the vector type. #' #' @returns A vector of 5 elements representing the percentage of missing, #' numeric, date, character, and logical values found in the input vector. 
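The level-based classification introduced in `scan_lgl_and_fct_columns()` above can be illustrated on a made-up factor:

```r
# Checking the factor levels (rather than the values themselves) for
# numeric-looking entries; the example factor is made up.
x <- factor(c("1", "2", "orange", NA))
l <- levels(x)                                     # "1" "2" "orange"
numeric_levels <- l[!is.na(suppressWarnings(as.numeric(l)))]
round(sum(x %in% numeric_levels) / length(x), 6L)  # 0.5: 2 of the 4 values look numeric
```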
@@ -10,28 +68,38 @@ #' @keywords internal #' scan_columns <- function(x, type) { - # --- save the variable length --- + ## The processing path is determine with the logical `verdict` + verdict <- type == "logical" || type == "factor" + + ## logical and factor are processed differently from the other. + ## Logical vectors are expected to contain `TRUE` or `FALSE` or `NA`. For + ## columns of this type, we will check the %NA, %TRUE and %FALSE. The others + ## will be set at 0. + ## For factor columns, the processing is based on their levels. Levels will be + ## checked for numeric, date, logical, NA, and character + if (verdict) { + return(scan_lgl_and_fct_columns(x)) + } + + # save the variable length n_rows <- length(x) - # --- get the proportion of NA --- + # get the proportion of NA are_na <- round((sum(is.na(x)) / n_rows), 6L) x <- x[!is.na(x)] - # --- get the proportion of numeric values --- - are_numeric <- 0L - verdict <- type == "logical" | type == "factor" - if (!verdict) { - tmp <- suppressWarnings(as.numeric(x)) - are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) - } + # get the proportion of numeric values + are_numeric <- are_logical <- 0L + tmp <- suppressWarnings(as.numeric(x)) + are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) - # --- get the proportion of date values --- + # get the proportion of date values x <- x[which(is.na(tmp))] are_date <- 0L if (!is.null(lubridate::guess_formats(x, c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", "dmY", "mdY", "mYd", "dYm")))) { - x <- suppressWarnings( + x <- suppressWarnings( as.Date( lubridate::parse_date_time( x, orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", @@ -42,21 +110,14 @@ scan_columns <- function(x, type) { are_date <- round((sum(!is.na(x)) / n_rows), 6L) } - # --- get the proportion of logical values --- - are_logical <- 0 - if (type == "logical") { - are_logical <- round((1.0 - (are_na + are_date)), 6L) - } - - # --- get the proportion of character values --- + # get the proportion of character values are_character <- 0L if (!verdict) { are_character <- round((1.0 - (are_na + are_numeric + are_date + are_logical)), 6L) } - - # --- return the output --- + # return the output return(c(are_na, are_numeric, are_date, are_character, are_logical)) } @@ -82,15 +143,11 @@ scan_data <- function(data) { # values will be evaluated for these columns. # The percent of numeric and character value will be set automatically to 0 as # to prevent from the effects of the conversion to numeric and character. - # - types <- as.character(vapply(data, class, character(1L))) - scan_result <- NULL - j <- 1 - for (i in names(data)) { - scan_result <- rbind(scan_result, scan_columns(data[[i]], types[j])) - j <- j + 1 - } - scan_result <- as.data.frame(scan_result) + types <- vapply(data, class, character(1L)) + scan_result <- vapply(seq_len(ncol(data)), function(col_index) { + scan_columns(data[[col_index]], types[[col_index]]) + }, numeric(5L)) + scan_result <- as.data.frame(t(scan_result)) names(scan_result) <- c("missing", "numeric", "date", "character", "logical") scan_result <- cbind(Field_names = names(data), scan_result) diff --git a/man/cleanepi-package.Rd b/man/cleanepi-package.Rd index bd186e3c..5346424f 100644 --- a/man/cleanepi-package.Rd +++ b/man/cleanepi-package.Rd @@ -31,7 +31,7 @@ Authors: Other contributors: \itemize{ \item Thibaut Jombart \email{thibautjombart@gmail.com} (Thibault is the owner of the code in guess_dates.R file.) 
[copyright holder] - \item Hugo Gruson \email{hugo@data.org} (\href{https://orcid.org/0000-0002-4094-1476}{ORCID}) [reviewer] + \item Hugo Gruson \email{hugo@data.org} (\href{https://orcid.org/0000-0002-4094-1476}{ORCID}) [contributor, reviewer] \item Pratik R. Gupte \email{pratik.gupte@lshtm.ac.uk} (\href{https://orcid.org/0000-0001-5294-7819}{ORCID}) [reviewer] \item James M. Azam \email{james.azam@lshtm.ac.uk} (\href{https://orcid.org/0000-0001-5782-7330}{ORCID}) [reviewer] \item Joshua W. Lambert \email{joshua.lambert@lshtm.ac.uk} (\href{https://orcid.org/0000-0001-5218-3046}{ORCID}) [reviewer] diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd index d974f677..6960ed5e 100644 --- a/man/scan_columns.Rd +++ b/man/scan_columns.Rd @@ -10,6 +10,8 @@ scan_columns(x, type) } \arguments{ \item{x}{A vector of ones or a combination of various data types.} + +\item{type}{A character with the the vector type.} } \value{ A vector of 5 elements representing the percentage of missing, diff --git a/man/scan_lgl_and_fct_columns.Rd b/man/scan_lgl_and_fct_columns.Rd new file mode 100644 index 00000000..302b0fae --- /dev/null +++ b/man/scan_lgl_and_fct_columns.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{scan_lgl_and_fct_columns} +\alias{scan_lgl_and_fct_columns} +\title{Calculate the percentage of missing and other data type values from a vector +of factor or logical values.} +\usage{ +scan_lgl_and_fct_columns(x) +} +\arguments{ +\item{x}{A vector of ones or a combination of various data types.} +} +\value{ +A vector of 5 elements representing the percentage of missing, +numeric, date, character, and logical values found in the input vector. +} +\description{ +Calculate the percentage of missing and other data type values from a vector +of factor or logical values. +} +\keyword{internal} From 26daa9e57ec85a79b6d8ebdb23e82a0663877976 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Thu, 25 Jul 2024 10:15:24 +0000 Subject: [PATCH 06/22] account for other column types --- R/clean_data_helpers.R | 43 +++++++++++++++++------- man/get_class.Rd | 18 ++++++++++ tests/testthat/test-clean_data_helpers.R | 31 +++++++++++++---- 3 files changed, 74 insertions(+), 18 deletions(-) create mode 100644 man/get_class.Rd diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index ffab9559..edcd9a44 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -69,7 +69,8 @@ scan_lgl_and_fct_columns <- function(x) { #' scan_columns <- function(x, type) { ## The processing path is determine with the logical `verdict` - verdict <- type == "logical" || type == "factor" + type <- unlist(strsplit(type, ",")) + verdict <- type %in% c("logical", "factor") ## logical and factor are processed differently from the other. ## Logical vectors are expected to contain `TRUE` or `FALSE` or `NA`. For @@ -77,7 +78,7 @@ scan_columns <- function(x, type) { ## will be set at 0. ## For factor columns, the processing is based on their levels. 
Levels will be ## checked for numeric, date, logical, NA, and character - if (verdict) { + if (any(verdict)) { return(scan_lgl_and_fct_columns(x)) } @@ -88,18 +89,17 @@ scan_columns <- function(x, type) { are_na <- round((sum(is.na(x)) / n_rows), 6L) x <- x[!is.na(x)] - # get the proportion of numeric values - are_numeric <- are_logical <- 0L - tmp <- suppressWarnings(as.numeric(x)) - are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) - # get the proportion of date values - x <- x[which(is.na(tmp))] are_date <- 0L + if (any(type %in% c("POSIXct", "POSIXt"))) { + tmp <- as.Date(x) + are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) + x <- x[is.na(tmp)] + } if (!is.null(lubridate::guess_formats(x, c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", "dmY", "mdY", "mYd", "dYm")))) { - x <- suppressWarnings( + tmp <- suppressWarnings( as.Date( lubridate::parse_date_time( x, orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", @@ -107,12 +107,20 @@ scan_columns <- function(x, type) { ) ) ) - are_date <- round((sum(!is.na(x)) / n_rows), 6L) + are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) + x <- x[is.na(tmp)] + } + + # get the proportion of numeric values + are_numeric <- are_logical <- 0L + if (length(x) > 0L) { + tmp <- suppressWarnings(as.numeric(x)) + are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) } # get the proportion of character values are_character <- 0L - if (!verdict) { + if (!all(verdict)) { are_character <- round((1.0 - (are_na + are_numeric + are_date + are_logical)), 6L) } @@ -121,6 +129,17 @@ scan_columns <- function(x, type) { return(c(are_na, are_numeric, are_date, are_character, are_logical)) } +#' Get class of a vector +#' +#' @param x The input vector +#' +#' @return A character with the class of the input vector +#' @keywords internal +#' +get_class <- function(x) { + paste(class(x), collapse = ",") +} + #' Scan a data frame to determine the percentage of `missing`, `numeric`, #' `Date`, `character`, and `logical` values in every column. #' @@ -143,7 +162,7 @@ scan_data <- function(data) { # values will be evaluated for these columns. # The percent of numeric and character value will be set automatically to 0 as # to prevent from the effects of the conversion to numeric and character. 
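The change on the lines just below swaps `class()` for `get_class()` because `class()` is not guaranteed to return a single string, which breaks the `character(1L)` template used by `vapply()`:

```r
# Date-time columns are the typical offender: their class is a length-two
# vector, so vapply(data, class, character(1L)) errors on them.
class(Sys.time())                          # "POSIXct" "POSIXt"
# get_class() collapses the class vector into one string instead
paste(class(Sys.time()), collapse = ",")   # "POSIXct,POSIXt"
```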
- types <- vapply(data, class, character(1L)) + types <- vapply(data, get_class, character(1L)) scan_result <- vapply(seq_len(ncol(data)), function(col_index) { scan_columns(data[[col_index]], types[[col_index]]) }, numeric(5L)) diff --git a/man/get_class.Rd b/man/get_class.Rd new file mode 100644 index 00000000..d80d7854 --- /dev/null +++ b/man/get_class.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{get_class} +\alias{get_class} +\title{Get class of a vector} +\usage{ +get_class(x) +} +\arguments{ +\item{x}{The input vector} +} +\value{ +A character with the class of the input vector +} +\description{ +Get class of a vector +} +\keyword{internal} diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index 76def95f..f1711ffd 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -1,11 +1,30 @@ -test_that("scan_data works", { - scan_result <- scan_data( - data = readRDS(system.file("extdata", "messy_data.RDS", - package = "cleanepi")) - ) +test_that("scan_data works as expected", { + dat <- readRDS(system.file("extdata", "messy_data.RDS", + package = "cleanepi")) + scan_result <- scan_data(data = dat) expect_s3_class(scan_result, "data.frame") expect_named(scan_result, c("Field_names", "missing", "numeric", "date", "character", "logical")) expect_identical(ncol(scan_result), 6L) - expect_identical(nrow(scan_result), 9L) + expect_identical(nrow(scan_result), ncol(dat)) + expect_identical(scan_result[["Field_names"]], names(dat)) + + # using a dataset with many data types + data(iris) + iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), + replace = TRUE)) + iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) + iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), + to = as.Date("2024-08-30"), + length.out = nrow(iris))) + iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) + scan_result <- scan_data(data = iris) + expect_identical(ncol(scan_result), 6L) + expect_identical(nrow(scan_result), ncol(iris)) + expect_identical(scan_result[["Field_names"]], names(iris)) + expect_identical(sum(scan_result[["numeric"]]), 4) + expect_identical(sum(scan_result[["missing"]]), 0) + expect_identical(sum(scan_result[["date"]]), 2) + expect_identical(sum(scan_result[["character"]]), 2) + expect_identical(sum(scan_result[["logical"]]), 1) }) From 5f2b59f51c5852175054341ffc08d5d7f495d798 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Thu, 25 Jul 2024 10:15:24 +0000 Subject: [PATCH 07/22] account for other column types --- R/clean_data_helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index edcd9a44..788c3b10 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -69,7 +69,7 @@ scan_lgl_and_fct_columns <- function(x) { #' scan_columns <- function(x, type) { ## The processing path is determine with the logical `verdict` - type <- unlist(strsplit(type, ",")) + type <- unlist(strsplit(type, ",", fixed = TRUE)) verdict <- type %in% c("logical", "factor") ## logical and factor are processed differently from the other. 
From ae5c4906472fbd44bfde2030dea45716590ad0b5 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 29 Jul 2024 17:04:50 +0000 Subject: [PATCH 08/22] refactor the scan_data() function --- R/clean_data_helpers.R | 295 +++++++++++++---------- man/get_class.Rd | 18 -- man/scan_in_character.Rd | 19 ++ man/scan_in_double.Rd | 19 ++ man/scan_in_integer.Rd | 19 ++ man/scan_in_logical.Rd | 19 ++ man/scan_lgl_and_fct_columns.Rd | 21 -- tests/testthat/test-clean_data_helpers.R | 12 +- tests/testthat/test-utils.R | 6 +- 9 files changed, 254 insertions(+), 174 deletions(-) delete mode 100644 man/get_class.Rd create mode 100644 man/scan_in_character.Rd create mode 100644 man/scan_in_double.Rd create mode 100644 man/scan_in_integer.Rd create mode 100644 man/scan_in_logical.Rd delete mode 100644 man/scan_lgl_and_fct_columns.Rd diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 788c3b10..ba990ffb 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -1,19 +1,64 @@ -#' Calculate the percentage of missing and other data type values from a vector -#' of factor or logical values. +#' Calculate the percentage of missing and other data type values in a vector +#' containing different data types such as numeric, Date, character, and +#' logical. #' #' @param x A vector of ones or a combination of various data types. +#' @param type A character with the the vector type. #' #' @returns A vector of 5 elements representing the percentage of missing, #' numeric, date, character, and logical values found in the input vector. #' #' @keywords internal #' -scan_lgl_and_fct_columns <- function(x) { - ## for logical and factor columns, we will use their `levels` to determine - ## the proportion of the different types - are_numeric <- are_date <- are_logical <- are_character <- 0L +scan_columns <- function(x, type) { + res <- switch(type, + double = scan_in_double(x), + integer = scan_in_integer(x), + logical = scan_in_logical(x), + character = scan_in_character(x)) + return(res) +} +#' Scan a data frame to determine the percentage of `missing`, `numeric`, +#' `Date`, `character`, and `logical` values in every column. +#' +#' @param data A data frame or linelist +#' +#' @returns A data frame or linelist with the same columns as the input data +#' and 5 rows representing the percentage of missing, numeric, date, character, +#' and logical values in each column. +#' +#' @export +#' +#' @examples +#' scan_result <- scan_data( +#' data = readRDS(system.file("extdata", "messy_data.RDS", +#' package = "cleanepi")) +#' ) +scan_data <- function(data) { + types <- vapply(data, typeof, character(1L)) + scan_result <- vapply(seq_len(ncol(data)), function(col_index) { + scan_columns(data[[col_index]], types[[col_index]]) + }, numeric(7L)) + scan_result <- as.data.frame(t(scan_result)) + names(scan_result) <- c("missing", "numeric", "date", "character", + "logical", "date-time", "factor") + scan_result <- cbind(Field_names = names(data), scan_result) + return(scan_result) +} + +#' Scan through a double column +#' +#' @param x The input vector +#' +#' @return A numeric vector with the proportion of the different types of data +#' that were detected within the input vector. 
+#' @keywords internal +#' +scan_in_double <- function(x) { + are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- + are_logical <- are_na <- 0.0 # save the variable length n_rows <- length(x) @@ -21,66 +66,102 @@ scan_lgl_and_fct_columns <- function(x) { are_na <- round((sum(is.na(x)) / n_rows), 6L) x <- x[!is.na(x)] - # get the proportion of logical values - if (is.logical(x)) { - are_logical <- round((sum(x) + sum(!x)) / n_rows, 6L) - return(c(are_na, are_numeric, are_date, are_character, are_logical)) + # doubles are either numeric (attributes = NULL), or Date (has a 'class' + # attributes = Date), or date-time (has a 'class' attributes = POSIXt) + if ("class" %in% names(attributes(x))) { + if ("Date" %in% attributes(x)[["class"]]) { + are_date <- round((length(x) / n_rows), 6L) + } else if ("POSIXt" %in% attributes(x)[["class"]]) { + are_date_time <- round((length(x) / n_rows), 6L) + } + } else { + are_numeric <- round((length(x) / n_rows), 6L) } + return( + c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, + are_factor) + ) +} - # get the proportion of numeric values - l <- levels(x) - numeric_levels <- l[!is.na(suppressWarnings(as.numeric(l)))] - if (length(numeric_levels) > 0L) { - are_numeric <- round((sum(x %in% numeric_levels) / n_rows), 6L) - } +#' Scan through an integer column +#' +#' @param x The input vector +#' +#' @return A numeric vector with the proportion of the different types of data +#' that were detected within the input vector. +#' @keywords internal +#' +scan_in_integer <- function(x) { + are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- + are_logical <- are_na <- 0.0 + # save the variable length + n_rows <- length(x) - date_levels <- l[!is.na(suppressWarnings( - as.Date( - lubridate::parse_date_time( - l, - orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", - "dmY", "mdY", "mYd", "dYm") - ) - ) - ))] - if (length(date_levels) > 0L) { - are_date <- round((sum(x %in% date_levels) / n_rows), 6L) - } + # get the proportion of NA + are_na <- round((sum(is.na(x)) / n_rows), 6L) + x <- x[!is.na(x)] - are_character <- round((1.0 - (are_na + are_numeric + - are_date + are_logical)), 6L) + # integers are either numeric (attributes = NULL), or factors (has a 'class' + # and 'levels' attributes) + if (is.null(attributes(x))) { + are_numeric <- round((length(x) / n_rows), 6L) + } else if (identical(names(attributes(x)), c("levels", "class"))) { + are_factor <- round((length(x) / n_rows), 6L) + } - # return the output - return(c(are_na, are_numeric, are_date, are_character, are_logical)) + return( + c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, + are_factor) + ) } - -#' Calculate the percentage of missing and other data type values in a vector -#' containing different data types such as numeric, Date, character, and -#' logical. +#' Scan through a logical column #' -#' @param x A vector of ones or a combination of various data types. -#' @param type A character with the the vector type. +#' @param x The input vector #' -#' @returns A vector of 5 elements representing the percentage of missing, -#' numeric, date, character, and logical values found in the input vector. +#' @return A numeric vector with the proportion of the different types of data +#' that were detected within the input vector. 
+#' @keywords internal +#' +scan_in_logical <- function(x) { + are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- + are_logical <- are_na <- 0.0 + + # logical are simply logical. We will only determine the %NA and %logical + # save the variable length + n_rows <- length(x) + + # get the proportion of NA + are_na <- round((sum(is.na(x)) / n_rows), 6L) + x <- x[!is.na(x)] + + # get the proportion of logical + are_logical <- round((length(x) / n_rows), 6L) + return( + c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, + are_factor) + ) +} + +#' Scan through a character column +#' +#' @param x The input vector #' +#' @return A numeric vector with the proportion of the different types of data +#' that were detected within the input vector. #' @keywords internal #' -scan_columns <- function(x, type) { - ## The processing path is determine with the logical `verdict` - type <- unlist(strsplit(type, ",", fixed = TRUE)) - verdict <- type %in% c("logical", "factor") - - ## logical and factor are processed differently from the other. - ## Logical vectors are expected to contain `TRUE` or `FALSE` or `NA`. For - ## columns of this type, we will check the %NA, %TRUE and %FALSE. The others - ## will be set at 0. - ## For factor columns, the processing is based on their levels. Levels will be - ## checked for numeric, date, logical, NA, and character - if (any(verdict)) { - return(scan_lgl_and_fct_columns(x)) - } +scan_in_character <- function(x) { + # There might be, within a character column, values of type: + # character, numeric, date, date-time, NA, and logical + # In this function, we check the presence of these different types within a + # character column. + # Note that numeric values can actually be of 'Date' or 'date-time' type. + # Given that any numeric can be converted into Date, we will not check for + # Date or date-time values within the numeric. 
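These checks can be tried on a small, made-up character vector mixing the value types the helper looks for:

```r
# A made-up character vector containing one value of each type of interest
x <- c("2024-01-01", "12.5", "TRUE", "orange", NA)

sum(is.na(x))                                # 1 missing value
sum(!is.na(suppressWarnings(as.double(x))))  # 1 numeric-looking value
sum(toupper(x) %in% c("TRUE", "FALSE"))      # 1 logical-looking value
sum(!is.na(suppressWarnings(
  lubridate::parse_date_time(x, orders = "ymd")
)))                                          # 1 date-looking value
```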
+ + are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- + are_logical <- are_na <- 0.0 # save the variable length n_rows <- length(x) @@ -89,86 +170,46 @@ scan_columns <- function(x, type) { are_na <- round((sum(is.na(x)) / n_rows), 6L) x <- x[!is.na(x)] - # get the proportion of date values - are_date <- 0L - if (any(type %in% c("POSIXct", "POSIXt"))) { - tmp <- as.Date(x) - are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) - x <- x[is.na(tmp)] + # get double values and evaluate the proportion numeric values + doubles <- x[!is.na(suppressWarnings(as.double(x)))] + if (length(doubles) > 0L) { + are_numeric <- round((length(doubles) / n_rows), 6L) } - if (!is.null(lubridate::guess_formats(x, c("ymd", "ydm", "dmy", "mdy", "myd", - "dym", "Ymd", "Ydm", "dmY", "mdY", - "mYd", "dYm")))) { - tmp <- suppressWarnings( - as.Date( - lubridate::parse_date_time( - x, orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", - "dmY", "mdY", "mYd", "dYm") + + # get character values and check for the presence of Date and date-time + characters <- x[is.na(suppressWarnings(as.double(x)))] + if (length(characters) > 0L) { + # get the proportion of date values + if (!is.null(lubridate::guess_formats(characters, + c("ymd", "ydm", "dmy", "mdy", "myd", + "dym", "Ymd", "Ydm", "dmY", "mdY", + "mYd", "dYm")))) { + tmp <- suppressWarnings( + as.Date( + lubridate::parse_date_time( + characters, + orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", + "dmY", "mdY", "mYd", "dYm") + ) ) ) - ) - are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) - x <- x[is.na(tmp)] + are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) + x <- x[is.na(tmp)] + characters <- characters[is.na(tmp)] + } } - # get the proportion of numeric values - are_numeric <- are_logical <- 0L - if (length(x) > 0L) { - tmp <- suppressWarnings(as.numeric(x)) - are_numeric <- round((sum(!is.na(tmp)) / n_rows), 6L) - } + # get the proportion of logical values + logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE" + are_logical <- round((sum(logicals) / n_rows), 6L) # get the proportion of character values - are_character <- 0L - if (!all(verdict)) { - are_character <- round((1.0 - (are_na + are_numeric + - are_date + are_logical)), 6L) - } + are_character <- round((1.0 - (are_na + are_numeric + + are_date + are_logical)), 6L) # return the output - return(c(are_na, are_numeric, are_date, are_character, are_logical)) -} - -#' Get class of a vector -#' -#' @param x The input vector -#' -#' @return A character with the class of the input vector -#' @keywords internal -#' -get_class <- function(x) { - paste(class(x), collapse = ",") -} - -#' Scan a data frame to determine the percentage of `missing`, `numeric`, -#' `Date`, `character`, and `logical` values in every column. -#' -#' @param data A data frame or linelist -#' -#' @returns A data frame or linelist with the same columns as the input data -#' and 5 rows representing the percentage of missing, numeric, date, character, -#' and logical values in each column. -#' -#' @export -#' -#' @examples -#' scan_result <- scan_data( -#' data = readRDS(system.file("extdata", "messy_data.RDS", -#' package = "cleanepi")) -#' ) -scan_data <- function(data) { - # when scanning through the data, logical and factor columns will be treated - # differently from the others. It means only the percent of missing and Date - # values will be evaluated for these columns. 
- # The percent of numeric and character value will be set automatically to 0 as - # to prevent from the effects of the conversion to numeric and character. - types <- vapply(data, get_class, character(1L)) - scan_result <- vapply(seq_len(ncol(data)), function(col_index) { - scan_columns(data[[col_index]], types[[col_index]]) - }, numeric(5L)) - scan_result <- as.data.frame(t(scan_result)) - names(scan_result) <- c("missing", "numeric", "date", "character", - "logical") - scan_result <- cbind(Field_names = names(data), scan_result) - return(scan_result) + return( + c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, + are_factor) + ) } diff --git a/man/get_class.Rd b/man/get_class.Rd deleted file mode 100644 index d80d7854..00000000 --- a/man/get_class.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{get_class} -\alias{get_class} -\title{Get class of a vector} -\usage{ -get_class(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A character with the class of the input vector -} -\description{ -Get class of a vector -} -\keyword{internal} diff --git a/man/scan_in_character.Rd b/man/scan_in_character.Rd new file mode 100644 index 00000000..e1e912fc --- /dev/null +++ b/man/scan_in_character.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{scan_in_character} +\alias{scan_in_character} +\title{Scan through a character column} +\usage{ +scan_in_character(x) +} +\arguments{ +\item{x}{The input vector} +} +\value{ +A numeric vector with the proportion of the different types of data +that were detected within the input vector. +} +\description{ +Scan through a character column +} +\keyword{internal} diff --git a/man/scan_in_double.Rd b/man/scan_in_double.Rd new file mode 100644 index 00000000..3c277218 --- /dev/null +++ b/man/scan_in_double.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{scan_in_double} +\alias{scan_in_double} +\title{Scan through a double column} +\usage{ +scan_in_double(x) +} +\arguments{ +\item{x}{The input vector} +} +\value{ +A numeric vector with the proportion of the different types of data +that were detected within the input vector. +} +\description{ +Scan through a double column +} +\keyword{internal} diff --git a/man/scan_in_integer.Rd b/man/scan_in_integer.Rd new file mode 100644 index 00000000..d807eed6 --- /dev/null +++ b/man/scan_in_integer.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{scan_in_integer} +\alias{scan_in_integer} +\title{Scan through an integer column} +\usage{ +scan_in_integer(x) +} +\arguments{ +\item{x}{The input vector} +} +\value{ +A numeric vector with the proportion of the different types of data +that were detected within the input vector. 
+} +\description{ +Scan through an integer column +} +\keyword{internal} diff --git a/man/scan_in_logical.Rd b/man/scan_in_logical.Rd new file mode 100644 index 00000000..61f22803 --- /dev/null +++ b/man/scan_in_logical.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clean_data_helpers.R +\name{scan_in_logical} +\alias{scan_in_logical} +\title{Scan through a logical column} +\usage{ +scan_in_logical(x) +} +\arguments{ +\item{x}{The input vector} +} +\value{ +A numeric vector with the proportion of the different types of data +that were detected within the input vector. +} +\description{ +Scan through a logical column +} +\keyword{internal} diff --git a/man/scan_lgl_and_fct_columns.Rd b/man/scan_lgl_and_fct_columns.Rd deleted file mode 100644 index 302b0fae..00000000 --- a/man/scan_lgl_and_fct_columns.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_lgl_and_fct_columns} -\alias{scan_lgl_and_fct_columns} -\title{Calculate the percentage of missing and other data type values from a vector -of factor or logical values.} -\usage{ -scan_lgl_and_fct_columns(x) -} -\arguments{ -\item{x}{A vector of ones or a combination of various data types.} -} -\value{ -A vector of 5 elements representing the percentage of missing, -numeric, date, character, and logical values found in the input vector. -} -\description{ -Calculate the percentage of missing and other data type values from a vector -of factor or logical values. -} -\keyword{internal} diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index f1711ffd..d504f0f7 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -4,8 +4,8 @@ test_that("scan_data works as expected", { scan_result <- scan_data(data = dat) expect_s3_class(scan_result, "data.frame") expect_named(scan_result, c("Field_names", "missing", "numeric", "date", - "character", "logical")) - expect_identical(ncol(scan_result), 6L) + "character", "logical", "date-time", "factor")) + expect_identical(ncol(scan_result), 8L) expect_identical(nrow(scan_result), ncol(dat)) expect_identical(scan_result[["Field_names"]], names(dat)) @@ -19,12 +19,14 @@ test_that("scan_data works as expected", { length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris) - expect_identical(ncol(scan_result), 6L) + expect_identical(ncol(scan_result), 8L) expect_identical(nrow(scan_result), ncol(iris)) expect_identical(scan_result[["Field_names"]], names(iris)) expect_identical(sum(scan_result[["numeric"]]), 4) expect_identical(sum(scan_result[["missing"]]), 0) - expect_identical(sum(scan_result[["date"]]), 2) - expect_identical(sum(scan_result[["character"]]), 2) + expect_identical(sum(scan_result[["date"]]), 1) + expect_identical(sum(scan_result[["character"]]), 0) expect_identical(sum(scan_result[["logical"]]), 1) + expect_identical(sum(scan_result[["date-time"]]), 1) + expect_identical(sum(scan_result[["factor"]]), 2) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 1cbe991b..1f188491 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -110,9 +110,9 @@ test_that("add_to_report works as expected", { expect_type(report, "list") expect_length(report, 1L) expect_named(report, "scanning_result") - expect_named(report[["scanning_result"]], c("Field_names", "missing", - 
"numeric", "date", "character", - "logical")) + expect_named(report[["scanning_result"]], + c("Field_names", "missing", "numeric", "date", "character", + "logical", "date-time", "factor")) expect_identical(nrow(report[["scanning_result"]]), ncol(data)) }) From eb64d9fe9064d9c320bfb2a189613e03a930eb31 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 29 Jul 2024 17:04:50 +0000 Subject: [PATCH 09/22] refactor the scan_data() function --- R/clean_data_helpers.R | 51 ++++++++++++++++++++++++------------------ man/scan_columns.Rd | 8 +++---- man/scan_data.Rd | 19 +++++++++++----- 3 files changed, 47 insertions(+), 31 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index ba990ffb..b86a4225 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -1,7 +1,7 @@ #' Calculate the percentage of missing and other data type values in a vector -#' containing different data types such as numeric, Date, character, and -#' logical. +#' containing different data types such as numeric, Date, character, +#' logical, date-time, factor. #' #' @param x A vector of ones or a combination of various data types. #' @param type A character with the the vector type. @@ -21,13 +21,15 @@ scan_columns <- function(x, type) { } #' Scan a data frame to determine the percentage of `missing`, `numeric`, -#' `Date`, `character`, and `logical` values in every column. +#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every +#' column. #' #' @param data A data frame or linelist #' -#' @returns A data frame or linelist with the same columns as the input data -#' and 5 rows representing the percentage of missing, numeric, date, character, -#' and logical values in each column. +#' @returns A data frame or linelist with the same number of rows as the number +#' of columns of the input data, and 8 column representing the field names, +#' the percentage of missing, numeric, date, character, logical, date-time, +#' and factor values in each column. #' #' @export #' @@ -36,6 +38,12 @@ scan_columns <- function(x, type) { #' data = readRDS(system.file("extdata", "messy_data.RDS", #' package = "cleanepi")) #' ) +#' +#' @details +#' For columns of type character, the detected numeric values could actually be +#' of type Date or date-time. This is because R coerces some Date values into +#' numeric when the date is imported from an MS Excel file. 
+#' scan_data <- function(data) { types <- vapply(data, typeof, character(1L)) scan_result <- vapply(seq_len(ncol(data)), function(col_index) { @@ -178,25 +186,24 @@ scan_in_character <- function(x) { # get character values and check for the presence of Date and date-time characters <- x[is.na(suppressWarnings(as.double(x)))] - if (length(characters) > 0L) { + if (length(characters) > 0L && + !is.null(lubridate::guess_formats(characters, + c("ymd", "ydm", "dmy", "mdy", "myd", + "dym", "Ymd", "Ydm", "dmY", "mdY", + "mYd", "dYm")))) { # get the proportion of date values - if (!is.null(lubridate::guess_formats(characters, - c("ymd", "ydm", "dmy", "mdy", "myd", - "dym", "Ymd", "Ydm", "dmY", "mdY", - "mYd", "dYm")))) { - tmp <- suppressWarnings( - as.Date( - lubridate::parse_date_time( - characters, - orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", - "dmY", "mdY", "mYd", "dYm") - ) + tmp <- suppressWarnings( + as.Date( + lubridate::parse_date_time( + characters, + orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", + "dmY", "mdY", "mYd", "dYm") ) ) - are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) - x <- x[is.na(tmp)] - characters <- characters[is.na(tmp)] - } + ) + are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) + x <- x[is.na(tmp)] + characters <- characters[is.na(tmp)] } # get the proportion of logical values diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd index 6960ed5e..b359875d 100644 --- a/man/scan_columns.Rd +++ b/man/scan_columns.Rd @@ -3,8 +3,8 @@ \name{scan_columns} \alias{scan_columns} \title{Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, and -logical.} +containing different data types such as numeric, Date, character, +logical, date-time, factor.} \usage{ scan_columns(x, type) } @@ -19,7 +19,7 @@ numeric, date, character, and logical values found in the input vector. } \description{ Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, and -logical. +containing different data types such as numeric, Date, character, +logical, date-time, factor. } \keyword{internal} diff --git a/man/scan_data.Rd b/man/scan_data.Rd index 10d9bca7..cd6b62dd 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -3,7 +3,8 @@ \name{scan_data} \alias{scan_data} \title{Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, and \code{logical} values in every column.} +\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every +column.} \usage{ scan_data(data) } @@ -11,17 +12,25 @@ scan_data(data) \item{data}{A data frame or linelist} } \value{ -A data frame or linelist with the same columns as the input data -and 5 rows representing the percentage of missing, numeric, date, character, -and logical values in each column. +A data frame or linelist with the same number of rows as the number +of columns of the input data, and 8 column representing the field names, +the percentage of missing, numeric, date, character, logical, date-time, +and factor values in each column. } \description{ Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, and \code{logical} values in every column. +\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every +column. 
+} +\details{ +For columns of type character, the detected numeric values could actually be +of type Date or date-time. This is because R coerces some Date values into +numeric when the date is imported from an MS Excel file. } \examples{ scan_result <- scan_data( data = readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi")) ) + } From 17fee39e9f17cc84bfa749da44717471064fbb95 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Thu, 1 Aug 2024 00:38:19 +0000 Subject: [PATCH 10/22] update the scan_date() function - only scan through character columns --- R/clean_data_helpers.R | 283 +++++++----------- .../printing-rmd/skeleton/skeleton.Rmd | 2 +- man/scan_columns.Rd | 25 -- man/scan_data.Rd | 41 ++- man/scan_in_character.Rd | 2 +- man/scan_in_double.Rd | 19 -- man/scan_in_integer.Rd | 19 -- man/scan_in_logical.Rd | 19 -- tests/testthat/test-clean_data_helpers.R | 32 +- tests/testthat/test-utils.R | 4 +- 10 files changed, 151 insertions(+), 295 deletions(-) delete mode 100644 man/scan_columns.Rd delete mode 100644 man/scan_in_double.Rd delete mode 100644 man/scan_in_integer.Rd delete mode 100644 man/scan_in_logical.Rd diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index b86a4225..8e0bd5bd 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -1,159 +1,64 @@ - -#' Calculate the percentage of missing and other data type values in a vector -#' containing different data types such as numeric, Date, character, -#' logical, date-time, factor. -#' -#' @param x A vector of ones or a combination of various data types. -#' @param type A character with the the vector type. -#' -#' @returns A vector of 5 elements representing the percentage of missing, -#' numeric, date, character, and logical values found in the input vector. -#' -#' @keywords internal -#' -scan_columns <- function(x, type) { - res <- switch(type, - double = scan_in_double(x), - integer = scan_in_integer(x), - logical = scan_in_logical(x), - character = scan_in_character(x)) - return(res) -} - -#' Scan a data frame to determine the percentage of `missing`, `numeric`, -#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every -#' column. +#' Scan through all character columns of a data frame to determine the +#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values. #' #' @param data A data frame or linelist #' -#' @returns A data frame or linelist with the same number of rows as the number -#' of columns of the input data, and 8 column representing the field names, -#' the percentage of missing, numeric, date, character, logical, date-time, -#' and factor values in each column. +#' @returns A data frame with the same number of rows as the number of character +#' columns of the input data, and six (06) columns representing the field +#' names, the percentage of missing, numeric, date, character, and logical, +#' values in each column. #' #' @export #' #' @examples +#' # scan through a data frame of characters #' scan_result <- scan_data( #' data = readRDS(system.file("extdata", "messy_data.RDS", #' package = "cleanepi")) #' ) #' -#' @details -#' For columns of type character, the detected numeric values could actually be -#' of type Date or date-time. This is because R coerces some Date values into -#' numeric when the date is imported from an MS Excel file. 
+#' # scan through a data frame with two character columns +#' scan_result <- scan_data( +#' data = readRDS(system.file("extdata", "test_linelist.RDS", +#' package = "cleanepi")) +#' ) +#' +#' # scan through a data frame with no character columns +#' data(iris) +#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), +#' replace = TRUE)) +#' iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) +#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), +#' to = as.Date("2024-08-30"), +#' length.out = nrow(iris))) +#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) +#' scan_result <- scan_data(data = iris) #' scan_data <- function(data) { - types <- vapply(data, typeof, character(1L)) + # scan through all columns of the data and the identify character columns + types <- vapply(data, typeof, character(1L)) + target_columns <- types[types == "character"] + + # send an message if there is no character column found within the input data + if (length(target_columns) == 0L) { + message("No character column found in the provided data.") + return(invisible(NA)) + } + + # scan through the character columns + data <- data[, names(target_columns)] scan_result <- vapply(seq_len(ncol(data)), function(col_index) { - scan_columns(data[[col_index]], types[[col_index]]) - }, numeric(7L)) + scan_in_character(data[[col_index]]) + }, numeric(5L)) scan_result <- as.data.frame(t(scan_result)) - names(scan_result) <- c("missing", "numeric", "date", "character", - "logical", "date-time", "factor") + names(scan_result) <- c("missing", "numeric", "date", "character", "logical") scan_result <- cbind(Field_names = names(data), scan_result) return(scan_result) } -#' Scan through a double column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. -#' @keywords internal -#' -scan_in_double <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # doubles are either numeric (attributes = NULL), or Date (has a 'class' - # attributes = Date), or date-time (has a 'class' attributes = POSIXt) - if ("class" %in% names(attributes(x))) { - if ("Date" %in% attributes(x)[["class"]]) { - are_date <- round((length(x) / n_rows), 6L) - } else if ("POSIXt" %in% attributes(x)[["class"]]) { - are_date_time <- round((length(x) / n_rows), 6L) - } - } else { - are_numeric <- round((length(x) / n_rows), 6L) - } - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - -#' Scan through an integer column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. 
-#' @keywords internal -#' -scan_in_integer <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # integers are either numeric (attributes = NULL), or factors (has a 'class' - # and 'levels' attributes) - if (is.null(attributes(x))) { - are_numeric <- round((length(x) / n_rows), 6L) - } else if (identical(names(attributes(x)), c("levels", "class"))) { - are_factor <- round((length(x) / n_rows), 6L) - } - - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - -#' Scan through a logical column -#' -#' @param x The input vector -#' -#' @return A numeric vector with the proportion of the different types of data -#' that were detected within the input vector. -#' @keywords internal -#' -scan_in_logical <- function(x) { - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 - - # logical are simply logical. We will only determine the %NA and %logical - # save the variable length - n_rows <- length(x) - - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) - x <- x[!is.na(x)] - - # get the proportion of logical - are_logical <- round((length(x) / n_rows), 6L) - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) -} - #' Scan through a character column #' -#' @param x The input vector +#' @param x The input character vector #' #' @return A numeric vector with the proportion of the different types of data #' that were detected within the input vector. @@ -161,62 +66,82 @@ scan_in_logical <- function(x) { #' scan_in_character <- function(x) { # There might be, within a character column, values of type: - # character, numeric, date, date-time, NA, and logical + # character, numeric, date (date or date-time), NA, and logical # In this function, we check the presence of these different types within a # character column. - # Note that numeric values can actually be of 'Date' or 'date-time' type. - # Given that any numeric can be converted into Date, we will not check for - # Date or date-time values within the numeric. 
- - are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- - are_logical <- are_na <- 0.0 # save the variable length - n_rows <- length(x) + initial_length <- length(x) - # get the proportion of NA - are_na <- round((sum(is.na(x)) / n_rows), 6L) + # get the count of missing data (NA) + na_count <- sum(is.na(x)) x <- x[!is.na(x)] - # get double values and evaluate the proportion numeric values - doubles <- x[!is.na(suppressWarnings(as.double(x)))] - if (length(doubles) > 0L) { - are_numeric <- round((length(doubles) / n_rows), 6L) - } - - # get character values and check for the presence of Date and date-time - characters <- x[is.na(suppressWarnings(as.double(x)))] - if (length(characters) > 0L && - !is.null(lubridate::guess_formats(characters, - c("ymd", "ydm", "dmy", "mdy", "myd", - "dym", "Ymd", "Ydm", "dmY", "mdY", - "mYd", "dYm")))) { - # get the proportion of date values - tmp <- suppressWarnings( - as.Date( - lubridate::parse_date_time( - characters, - orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", - "dmY", "mdY", "mYd", "dYm") - ) + # We will check if there is any Date values within the variable by parsing the + # values, looking for the ones that fit any of the predefined format. + # When there is one or more Date values, we will convert the variable into + # numeric and determine if any of them is a Date (a numeric, which after + # conversion to Date, fall within the interval + # [50 years back from today's date, today's date]). That way the Date count is + # the count of date identified from the parsing + the count of Dates within + # the numeric values. + # When there is no Date values identified from the parsing, the variable + # is converted into numeric. The numeric count is the sum of numeric values. + # The logical count is the number of TRUE and FALSE written in both lower + # and upper case within the variable + # The remaining values will be of type character. 
+ + # parsing the vector, looking for date values + tmp <- suppressWarnings( + as.Date( + lubridate::parse_date_time( + x, + orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", + "dmY", "mdY", "mYd", "dYm") ) ) - are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) - x <- x[is.na(tmp)] - characters <- characters[is.na(tmp)] + ) + + # getting the date and numeric count as describe above + date_count <- numeric_count <- 0L + if (sum(!is.na(tmp)) > 0L) { + # Setting the first date to 50 years before the current date + target_interval <- sort( + seq.Date(Sys.Date(), length.out = 2L, by = "-50 years") + ) + + # get the date count + date_count <- date_count + sum(!is.na(tmp)) + + # convert to numeric and check for the presence of Date among the numeric + tmp2 <- x[is.na(tmp)] + tmp3 <- suppressWarnings(as.numeric(tmp2)) + if (sum(!is.na(tmp3)) > 0L) { + y <- lubridate::as_date( + tmp3[!is.na(tmp3)], + origin = target_interval[[1L]] + ) + # second count of date values coming from date within numeric + date_count <- date_count + sum(!is.na(y)) + numeric_count <- sum(is.na(y)) + } + } else { + tmp <- suppressWarnings(as.numeric(x)) + numeric_count <- sum(!is.na(tmp)) } - # get the proportion of logical values - logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE" - are_logical <- round((sum(logicals) / n_rows), 6L) + # get logical count + logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE" + logical_count <- sum(logicals) - # get the proportion of character values - are_character <- round((1.0 - (are_na + are_numeric + - are_date + are_logical)), 6L) + # get the character count + character_count <- initial_length - + (na_count + logical_count + numeric_count + date_count) - # return the output - return( - c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, - are_factor) - ) + # transform into proportions + props <- round( + c(na_count, numeric_count, date_count, character_count, logical_count) / + initial_length, 4L) + + return(props) } diff --git a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd index 837d556c..e13e407b 100644 --- a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd +++ b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd @@ -59,7 +59,7 @@ p.compact { ```{r cleanepi-source-data, eval=TRUE, echo=FALSE} # EXTRACT THE REPORT SECTIONS scanning_result <- params[["scanning_result"]] -is_data_scanned <- !is.null(scanning_result) +is_data_scanned <- !is.na(scanning_result) standardized_column_names <- params[["colnames"]] are_column_standardised <- !is.null(standardized_column_names) out_of_range_dates <- params[["out_of_range_dates"]] diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd deleted file mode 100644 index b359875d..00000000 --- a/man/scan_columns.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_columns} -\alias{scan_columns} -\title{Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, -logical, date-time, factor.} -\usage{ -scan_columns(x, type) -} -\arguments{ -\item{x}{A vector of ones or a combination of various data types.} - -\item{type}{A character with the the vector type.} -} -\value{ -A vector of 5 elements representing the percentage of missing, -numeric, date, character, and logical values found in the input vector. 
-} -\description{ -Calculate the percentage of missing and other data type values in a vector -containing different data types such as numeric, Date, character, -logical, date-time, factor. -} -\keyword{internal} diff --git a/man/scan_data.Rd b/man/scan_data.Rd index cd6b62dd..90a2ee21 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -2,9 +2,8 @@ % Please edit documentation in R/clean_data_helpers.R \name{scan_data} \alias{scan_data} -\title{Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every -column.} +\title{Scan through all character columns of a data frame to determine the +proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values.} \usage{ scan_data(data) } @@ -12,25 +11,37 @@ scan_data(data) \item{data}{A data frame or linelist} } \value{ -A data frame or linelist with the same number of rows as the number -of columns of the input data, and 8 column representing the field names, -the percentage of missing, numeric, date, character, logical, date-time, -and factor values in each column. +A data frame with the same number of rows as the number of character +columns of the input data, and six (06) columns representing the field +names, the percentage of missing, numeric, date, character, and logical, +values in each column. } \description{ -Scan a data frame to determine the percentage of \code{missing}, \code{numeric}, -\code{Date}, \code{character}, \code{logical}, \code{date-time}, and \code{factor} values in every -column. -} -\details{ -For columns of type character, the detected numeric values could actually be -of type Date or date-time. This is because R coerces some Date values into -numeric when the date is imported from an MS Excel file. +Scan through all character columns of a data frame to determine the +proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values. 
} \examples{ +# scan through a data frame of characters scan_result <- scan_data( data = readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi")) ) +# scan through a data frame with two character columns +scan_result <- scan_data( + data = readRDS(system.file("extdata", "test_linelist.RDS", + package = "cleanepi")) + ) + +# scan through a data frame with no character columns +data(iris) +iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), + replace = TRUE)) +iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) +iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), + to = as.Date("2024-08-30"), + length.out = nrow(iris))) +iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) +scan_result <- scan_data(data = iris) + } diff --git a/man/scan_in_character.Rd b/man/scan_in_character.Rd index e1e912fc..83559951 100644 --- a/man/scan_in_character.Rd +++ b/man/scan_in_character.Rd @@ -7,7 +7,7 @@ scan_in_character(x) } \arguments{ -\item{x}{The input vector} +\item{x}{The input character vector} } \value{ A numeric vector with the proportion of the different types of data diff --git a/man/scan_in_double.Rd b/man/scan_in_double.Rd deleted file mode 100644 index 3c277218..00000000 --- a/man/scan_in_double.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_double} -\alias{scan_in_double} -\title{Scan through a double column} -\usage{ -scan_in_double(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. -} -\description{ -Scan through a double column -} -\keyword{internal} diff --git a/man/scan_in_integer.Rd b/man/scan_in_integer.Rd deleted file mode 100644 index d807eed6..00000000 --- a/man/scan_in_integer.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_integer} -\alias{scan_in_integer} -\title{Scan through an integer column} -\usage{ -scan_in_integer(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. -} -\description{ -Scan through an integer column -} -\keyword{internal} diff --git a/man/scan_in_logical.Rd b/man/scan_in_logical.Rd deleted file mode 100644 index 61f22803..00000000 --- a/man/scan_in_logical.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/clean_data_helpers.R -\name{scan_in_logical} -\alias{scan_in_logical} -\title{Scan through a logical column} -\usage{ -scan_in_logical(x) -} -\arguments{ -\item{x}{The input vector} -} -\value{ -A numeric vector with the proportion of the different types of data -that were detected within the input vector. 
-} -\description{ -Scan through a logical column -} -\keyword{internal} diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index d504f0f7..880b2c9d 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -1,15 +1,15 @@ test_that("scan_data works as expected", { - dat <- readRDS(system.file("extdata", "messy_data.RDS", - package = "cleanepi")) + # using a dataset with character columns only + dat <- readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi")) scan_result <- scan_data(data = dat) expect_s3_class(scan_result, "data.frame") expect_named(scan_result, c("Field_names", "missing", "numeric", "date", - "character", "logical", "date-time", "factor")) - expect_identical(ncol(scan_result), 8L) + "character", "logical")) + expect_identical(ncol(scan_result), 6L) expect_identical(nrow(scan_result), ncol(dat)) expect_identical(scan_result[["Field_names"]], names(dat)) - # using a dataset with many data types + # using a dataset with no character column data(iris) iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), replace = TRUE)) @@ -19,14 +19,16 @@ test_that("scan_data works as expected", { length.out = nrow(iris))) iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) scan_result <- scan_data(data = iris) - expect_identical(ncol(scan_result), 8L) - expect_identical(nrow(scan_result), ncol(iris)) - expect_identical(scan_result[["Field_names"]], names(iris)) - expect_identical(sum(scan_result[["numeric"]]), 4) - expect_identical(sum(scan_result[["missing"]]), 0) - expect_identical(sum(scan_result[["date"]]), 1) - expect_identical(sum(scan_result[["character"]]), 0) - expect_identical(sum(scan_result[["logical"]]), 1) - expect_identical(sum(scan_result[["date-time"]]), 1) - expect_identical(sum(scan_result[["factor"]]), 2) + expect_identical(scan_result, NA) + expect_message(scan_data(data = iris), + "No character column found in the provided data.") + + # using a data with some character columns + dat <- readRDS(system.file("extdata", "test_linelist.RDS", + package = "cleanepi")) + scan_result <- suppressWarnings(scan_data(data = dat)) + expect_identical(ncol(scan_result), 6L) + expect_identical(nrow(scan_result), 2L) + expect_false(nrow(scan_result) == ncol(dat)) + expect_identical(scan_result[["Field_names"]], c("id", "age_class")) }) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 1f188491..dafb13f5 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -112,8 +112,8 @@ test_that("add_to_report works as expected", { expect_named(report, "scanning_result") expect_named(report[["scanning_result"]], c("Field_names", "missing", "numeric", "date", "character", - "logical", "date-time", "factor")) - expect_identical(nrow(report[["scanning_result"]]), ncol(data)) + "logical")) + expect_identical(nrow(report[["scanning_result"]]), 6L) }) From 38772533da89e7e6e78d46aa0ea0e1b0b3f8bd07 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Mon, 5 Aug 2024 23:44:43 +0000 Subject: [PATCH 11/22] add/update rationale and documentation of the function --- R/clean_data_helpers.R | 9 +++++---- man/scan_data.Rd | 9 +++++---- vignettes/design_principle.Rmd | 11 ++++++++++- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 8e0bd5bd..34b62aaa 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -3,10 +3,11 @@ #' #' @param data A data frame or linelist #' -#' @returns 
A data frame with the same number of rows as the number of character -#' columns of the input data, and six (06) columns representing the field -#' names, the percentage of missing, numeric, date, character, and logical, -#' values in each column. +#' @returns A data frame if the input data contains columns of type character. +#' It invisibly returns `NA` otherwise. The returned data frame will have the +#' same number of rows as the number of character columns, and six +#' columns representing their column names, proportion of missing, numeric, +#' date, character, and logical values. #' #' @export #' diff --git a/man/scan_data.Rd b/man/scan_data.Rd index 90a2ee21..82ffe197 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -11,10 +11,11 @@ scan_data(data) \item{data}{A data frame or linelist} } \value{ -A data frame with the same number of rows as the number of character -columns of the input data, and six (06) columns representing the field -names, the percentage of missing, numeric, date, character, and logical, -values in each column. +A data frame if the input data contains columns of type character. +It invisibly returns \code{NA} otherwise. The returned data frame will have the +same number of rows as the number of character columns, and six +columns representing their column names, proportion of missing, numeric, +date, character, and logical values. } \description{ Scan through all character columns of a data frame to determine the diff --git a/vignettes/design_principle.Rmd b/vignettes/design_principle.Rmd index 65b6e0fc..fb55f1f7 100644 --- a/vignettes/design_principle.Rmd +++ b/vignettes/design_principle.Rmd @@ -31,7 +31,16 @@ The {cleanepi} R package is designed to offer functional programming-style data At the core of {cleanepi}, the pivotal function `clean_data()` serves as a wrapper encapsulating all the modules, as illustrated in the figure above. This function is intended to be the primary entry point for users seeking to cleanse their data. It performs the cleaning operations as requested by the user through the set of parameters that need to be explicitly defined. Furthermore, multiple cleaning operations can be performed sequentially using the “pipe” operator (`%>%`). In addition, this package also has two surrogate functions: -1. `scan_data()`: This function enables users to assess the data types present in each column of their dataset. +1. `scan_data()`: Columns of type `character` might contain values of other types such as `numeric`, `date`, `logical`. This function enables users to assess the data types present in each character column of their dataset. The composition in data types of character columns will inform the user about what actions need to be performed on the data. Most frequent scenarios involve the presence of: + * `date` values in either `Date` or `numeric` format (when date column is imported from MS Excel), + * character values in a logical column (a `not available` within a column of `TRUE` or `FALSE`), + * numbers written in letters. + +When the input data contains character columns, the function returns a data frame with the same number of row as the character columns and six columns representing their column names, proportion of missing, numeric, date, character, and logical values. For data with many character columns, this format allows for the easy display of the output data frame in the report, where the user will be presented with the first fifteenth rows and buttons to display the subsequent rows. 
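As a sketch, the expected shape of this output for the `messy_data.RDS` file shipped with the package (the same example used in the function documentation) is one row per character column, with the following column names:

```r
scan_result <- scan_data(
  data = readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi"))
)
names(scan_result)
#> [1] "Field_names" "missing" "numeric" "date" "character" "logical"
```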
+The sum of the proportion across all columns is expected to be 1 as it represent the ratio between the count of each data type and the total number of observations in the data. + +> There is no ambiguity for the columns of type Date, logical, and numeric. Values in such columns are expected to be of the same type. Hence, the function will not be applied on columns other than character columns. Consequently, it invisibly returns `NA` when applied on a dataset with no character columns, after printing out a message about the absence of character columns from the input dataset. + 2. `print_report()`: By utilizing this function, users can visualize the report generated from each applied cleaning task, facilitating transparency and understanding of the data cleaning process. ## Scope From bf533abfd34b83aad7d534a54c433ce938bf60be Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Tue, 6 Aug 2024 09:29:24 +0000 Subject: [PATCH 12/22] account for hugo's suggestions --- R/clean_data_helpers.R | 53 +++++++++++++++++----------------- man/scan_data.Rd | 2 +- vignettes/design_principle.Rmd | 2 +- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 34b62aaa..8aafec52 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -22,7 +22,7 @@ #' scan_result <- scan_data( #' data = readRDS(system.file("extdata", "test_linelist.RDS", #' package = "cleanepi")) -#' ) +#' ) #' #' # scan through a data frame with no character columns #' data(iris) @@ -37,8 +37,8 @@ #' scan_data <- function(data) { # scan through all columns of the data and the identify character columns - types <- vapply(data, typeof, character(1L)) - target_columns <- types[types == "character"] + types <- vapply(data, typeof, character(1L)) + target_columns <- which(types == "character") # send an message if there is no character column found within the input data if (length(target_columns) == 0L) { @@ -46,14 +46,14 @@ scan_data <- function(data) { return(invisible(NA)) } + # unclass the data to prevent from warnings when dealing with linelist, and # scan through the character columns - data <- data[, names(target_columns)] - scan_result <- vapply(seq_len(ncol(data)), function(col_index) { - scan_in_character(data[[col_index]]) - }, numeric(5L)) + data <- as.data.frame(unclass(data))[, target_columns] + scan_result <- vapply(data, scan_in_character, numeric(5L)) scan_result <- as.data.frame(t(scan_result)) names(scan_result) <- c("missing", "numeric", "date", "character", "logical") - scan_result <- cbind(Field_names = names(data), scan_result) + scan_result <- cbind(Field_names = rownames(scan_result), scan_result) + rownames(scan_result) <- NULL return(scan_result) } @@ -80,20 +80,20 @@ scan_in_character <- function(x) { # We will check if there is any Date values within the variable by parsing the # values, looking for the ones that fit any of the predefined format. - # When there is one or more Date values, we will convert the variable into - # numeric and determine if any of them is a Date (a numeric, which after - # conversion to Date, fall within the interval + # When there is one or more Date values, we will convert the remaining + # values into numeric and determine if any of them is a Date (a numeric, which + # after conversion to Date, fall within the interval # [50 years back from today's date, today's date]). That way the Date count is # the count of date identified from the parsing + the count of Dates within # the numeric values. 
# When there is no Date values identified from the parsing, the variable # is converted into numeric. The numeric count is the sum of numeric values. # The logical count is the number of TRUE and FALSE written in both lower - # and upper case within the variable - # The remaining values will be of type character. + # and upper cases within the variable. + # The remaining values will be considered of type character. # parsing the vector, looking for date values - tmp <- suppressWarnings( + are_date <- suppressWarnings( as.Date( lubridate::parse_date_time( x, @@ -105,30 +105,28 @@ scan_in_character <- function(x) { # getting the date and numeric count as describe above date_count <- numeric_count <- 0L - if (sum(!is.na(tmp)) > 0L) { + if (sum(!is.na(are_date)) > 0L) { # Setting the first date to 50 years before the current date - target_interval <- sort( - seq.Date(Sys.Date(), length.out = 2L, by = "-50 years") - ) + oldest_date <- seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")[[2L]] # get the date count - date_count <- date_count + sum(!is.na(tmp)) + date_count <- date_count + sum(!is.na(are_date)) # convert to numeric and check for the presence of Date among the numeric - tmp2 <- x[is.na(tmp)] - tmp3 <- suppressWarnings(as.numeric(tmp2)) - if (sum(!is.na(tmp3)) > 0L) { + non_date <- x[is.na(are_date)] + are_numeric <- suppressWarnings(as.numeric(non_date)) + if (sum(!is.na(are_numeric)) > 0L) { y <- lubridate::as_date( - tmp3[!is.na(tmp3)], - origin = target_interval[[1L]] + are_numeric[!is.na(are_numeric)], + origin = oldest_date ) # second count of date values coming from date within numeric date_count <- date_count + sum(!is.na(y)) numeric_count <- sum(is.na(y)) } } else { - tmp <- suppressWarnings(as.numeric(x)) - numeric_count <- sum(!is.na(tmp)) + are_numeric <- suppressWarnings(as.numeric(x)) + numeric_count <- sum(!is.na(are_numeric)) } # get logical count @@ -142,7 +140,8 @@ scan_in_character <- function(x) { # transform into proportions props <- round( c(na_count, numeric_count, date_count, character_count, logical_count) / - initial_length, 4L) + initial_length, 4L + ) return(props) } diff --git a/man/scan_data.Rd b/man/scan_data.Rd index 82ffe197..a9d511d9 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -32,7 +32,7 @@ scan_result <- scan_data( scan_result <- scan_data( data = readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) - ) +) # scan through a data frame with no character columns data(iris) diff --git a/vignettes/design_principle.Rmd b/vignettes/design_principle.Rmd index fb55f1f7..9a7d6f3f 100644 --- a/vignettes/design_principle.Rmd +++ b/vignettes/design_principle.Rmd @@ -36,7 +36,7 @@ In addition, this package also has two surrogate functions: * character values in a logical column (a `not available` within a column of `TRUE` or `FALSE`), * numbers written in letters. -When the input data contains character columns, the function returns a data frame with the same number of row as the character columns and six columns representing their column names, proportion of missing, numeric, date, character, and logical values. For data with many character columns, this format allows for the easy display of the output data frame in the report, where the user will be presented with the first fifteenth rows and buttons to display the subsequent rows. 
+When the input data contains character columns, the function returns a data frame with the same number of row as the character columns and six columns representing their column names, proportion of missing, numeric, date, character, and logical values. We transpose the result relative to the input dataset (columns in the input are returned as row) to avoid horizontal scrolling in the case of datasets with a large number of character columns. The sum of the proportion across all columns is expected to be 1 as it represent the ratio between the count of each data type and the total number of observations in the data. > There is no ambiguity for the columns of type Date, logical, and numeric. Values in such columns are expected to be of the same type. Hence, the function will not be applied on columns other than character columns. Consequently, it invisibly returns `NA` when applied on a dataset with no character columns, after printing out a message about the absence of character columns from the input dataset. From 6dbe40f057b96b0644bc63e97ef67dba5cdc9bd6 Mon Sep 17 00:00:00 2001 From: Karim MANE <84502011+Karim-Mane@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:45:40 +0000 Subject: [PATCH 13/22] Update R/clean_data_helpers.R Co-authored-by: Hugo Gruson <10783929+Bisaloo@users.noreply.github.com> --- R/clean_data_helpers.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 8aafec52..84304c46 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -48,7 +48,7 @@ scan_data <- function(data) { # unclass the data to prevent from warnings when dealing with linelist, and # scan through the character columns - data <- as.data.frame(unclass(data))[, target_columns] + data <- as.data.frame(data)[, target_columns] scan_result <- vapply(data, scan_in_character, numeric(5L)) scan_result <- as.data.frame(t(scan_result)) names(scan_result) <- c("missing", "numeric", "date", "character", "logical") From 437a1be880ed1ea9454deedd330aede8ea46f2cd Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Tue, 6 Aug 2024 12:46:50 +0000 Subject: [PATCH 14/22] update design vignette and function documentation: made when looking for similar occurence of `six (06)` --- R/convert_to_numeric.R | 9 +++---- R/find_and_remove_duplicates.R | 2 ++ man/convert_to_numeric.Rd | 8 +++---- vignettes/design_principle.Rmd | 43 +++++++++++++++++----------------- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/R/convert_to_numeric.R b/R/convert_to_numeric.R index 3879cbef..ba007b00 100644 --- a/R/convert_to_numeric.R +++ b/R/convert_to_numeric.R @@ -1,10 +1,10 @@ #' Convert columns into numeric #' #' When the function is invoked without specifying the column names to be -#' converted, it automatically scans for columns containing exclusively missing, -#' numeric, and character values. Furthermore, it identifies columns where the -#' proportion of numeric values is at least twice the percentage of character -#' values and performs the conversion in them. +#' converted, the target columns are the ones returned by the `scan_data()` +#' function. Furthermore, it identifies columns where the proportion of numeric +#' values is at least twice the percentage of character values and performs the +#' conversion in them. #' #' @param data The input data frame or linelist #' @param target_columns A vector of the target column names. 
When the input @@ -32,6 +32,7 @@ convert_to_numeric <- function(data, target_columns = NULL, lang <- match.arg(lang) if (is.null(target_columns)) { scan_res <- scan_data(data = data) + stopifnot("Please specify the target column names." = !is.na(scan_res)) target_columns <- detect_to_numeric_columns(scan_res) } diff --git a/R/find_and_remove_duplicates.R b/R/find_and_remove_duplicates.R index f7861a9a..cf51a5b1 100644 --- a/R/find_and_remove_duplicates.R +++ b/R/find_and_remove_duplicates.R @@ -120,6 +120,8 @@ find_duplicates <- function(data, target_columns = NULL) { data <- add_to_report(x = data, key = "duplicates_checked_from", value = toString(target_columns)) + } else { + message("No duplicates were found.") } return(data) } diff --git a/man/convert_to_numeric.Rd b/man/convert_to_numeric.Rd index b603ff4a..d0bdc2c6 100644 --- a/man/convert_to_numeric.Rd +++ b/man/convert_to_numeric.Rd @@ -21,10 +21,10 @@ or detected columns have been transformed into numeric format. } \description{ When the function is invoked without specifying the column names to be -converted, it automatically scans for columns containing exclusively missing, -numeric, and character values. Furthermore, it identifies columns where the -proportion of numeric values is at least twice the percentage of character -values and performs the conversion in them. +converted, the target columns are the ones returned by the \code{scan_data()} +function. Furthermore, it identifies columns where the proportion of numeric +values is at least twice the percentage of character values and performs the +conversion in them. } \examples{ dat <- convert_to_numeric( diff --git a/vignettes/design_principle.Rmd b/vignettes/design_principle.Rmd index 9a7d6f3f..457107ca 100644 --- a/vignettes/design_principle.Rmd +++ b/vignettes/design_principle.Rmd @@ -24,7 +24,7 @@ Data cleaning is an important phase for ensuring the efficacy of downstream anal ## Design decisions -The {cleanepi} R package is designed to offer functional programming-style data cleansing tasks. To streamline the organization of data cleaning operations, we have categorized them into distinct groups referred to as **modules**. These modules are based on overarching goals derived from commonly anticipated data cleaning procedures. Each module features a primary function along with additional helper functions tailored to accomplish specific tasks. It's important to note that, except for few cases where the outcome a helper function can impact on the cleaning task, only the main function of each module will be exported. This deliberate choice empowers users to execute individual cleaning tasks as needed, enhancing flexibility and usability. +The {cleanepi} R package is designed to offer functional programming-style data cleansing tasks. To streamline the organization of data cleaning operations, we have categorized them into distinct groups referred to as **modules**. These modules are based on overarching goals derived from commonly anticipated data cleaning procedures. Each module features a primary function along with additional helper functions tailored to accomplish specific tasks. It's important to note that, except for few cases where the outcome from a helper function can impact on the cleaning task, only the main function of each module will be exported. This deliberate choice empowers users to execute individual cleaning tasks as needed, enhancing flexibility and usability. 
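As an illustrative sketch of the two intended entry points (the example file and `find_duplicates()` appear elsewhere in the package; the `clean_data()` call is left schematic because its arguments depend on the operations requested):

```r
library(cleanepi)
data <- readRDS(system.file("extdata", "messy_data.RDS", package = "cleanepi"))

# run a single module on its own, e.g. duplicate detection
dups <- find_duplicates(data = data)

# or let the wrapper orchestrate several cleaning operations in one call
# cleaned <- clean_data(data = data, ...)
```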
![cleanepi design diagram](../man/figures/cleanepi_design_diagram.drawio.png) @@ -70,7 +70,7 @@ In addition to the target dataset, the core function `clean_data()` accepts a `l ## Output -Both the primary functions of the modules and the core function `clean_data()` return an object of type `data.frame` or `linelist`, depending on the type of the input dataset. The report generated from all cleaning tasks is attached to this object as an attribute, which can be accessed using the `attr()` function in base R. +Both the primary functions of the modules and the core function `clean_data()` return an object of type `data.frame` or `linelist`, depending on the type of the input dataset. The report generated from all cleaning tasks is attached to this object as an attribute, which can be accessed using either the `attr()` or `attributes()` functions in base R. ### Modules in {cleanepi} @@ -115,13 +115,12 @@ This module aims at eliminating irrelevant and redundant rows and columns, inclu This module is designed to identify and eliminate duplicated rows. -- **Main function:** `find_duplicates(), remove_duplicates()` +- **Main functions:** `find_duplicates(), remove_duplicates()` - **Input:** Accepts a `data.frame` or `linelist` object, along with optional parameters: * Vector of target columns (default is to consider all columns; possible to use `linelist_tags` to consider tagged variables only when the input is a linelist object). - **Output:** Returns the input object after applying the specified operations. - **Report:** - * A list with one or two table(s) showcasing found duplicates and removed duplicates (if `remove = TRUE`). - * A table detailing the removed duplicates. + * A list with one or two table(s) showcasing found duplicates and removed duplicates. - **Mode:** * explicit @@ -134,7 +133,7 @@ This module aims to standardize and unify the representation of missing values w - **Main function:** `replace_missing_values()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: * A `vector` of column names (if not provided, the operation is performed across all columns) - * A string or a vector of strings representing the current missing values (default value is cleanepi::common_na_strings) + * A string or a vector of strings representing the current missing values (default value is `cleanepi::common_na_strings`) - **Output:** Returns the input object with all missing values represented as `NA`. - **Report:** Generates a three-column table featuring index, column, and value for each missing item in the dataset. - **Mode:** @@ -144,7 +143,7 @@ By utilizing the `replace_missing_char()` function, users can ensure consistency **5. Standardization of date values** -This module is dedicated to convert date values in character columns into `Date` value in `ISO8601` format, and ensuring that all dates fall within the given timeframe. +This module is dedicated to convert date values in character columns into `ISO8601` `Date` format, and ensuring that all dates fall within the expected user-provided timeframe. - **Main function:** `standardize_dates()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: @@ -152,11 +151,11 @@ This module is dedicated to convert date values in character columns into `Date` * Tolerance threshold that defines the % of missing (out of range values converted to `NA`) values to be allowed in a converted column. 
When % missing values exceeds or is equal to it, the original values are returned (default value is 40%) * format (default value is NULL) * timeframe (default value is null) -- **Output:** Returns the input object with standardized date values in the format of *%Y-%m-%d*. +- **Output:** Returns the input object with standardized date values formatted as *%Y-%m-%d*. - **Report:** * A two-column table listing the columns where date values were standardized. * A three-column table featuring index, column name, and values that fall outside the specified timeframe. - * A data frame featuring date values that can comply with more than one specified format. Users can update the standardized date values with the correct when it's appropriated. + * A data frame featuring date values that can comply with more than one specified format. Users can update the standardized date values with the correct ones when it's appropriated. - **Mode:** * explicit @@ -166,22 +165,27 @@ By employing the `standardize_dates()` function, users can ensure uniformity and This module is tailored to verify whether the values in the column uniquely identifying subjects adhere to a consistent format. It also offers a functionality that allow users to correct the inconsistent subject ids. -- **Main function:** `check_subject_ids(), correct_subject_ids()` +- **Main function:** `check_subject_ids()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: * The name of the ID column * Strings for prefix, suffix, and numerical range within the IDs - * A logical that determines whether to delete the wrong ids or not. -- **Output:** Returns the input object with standardized subject IDs. -- **Report:** Generates a two-column table featuring index and value of each subject ID that deviates from the expected format. + * A numeric representing the expected number of characters in the subject ids. +- **Output:** Returns the input object attached with an attribute called `incorrect_subject_id`. +- **Report:** Generates a two-column table featuring the index and value of each subject ID that deviates from the expected format. - **Mode:** * explicit +The `correct_subject_ids()` function can be used to correct the identified incorrect subject ids. In addition to the input data, it expects a data frame with two columns `from` and `to` containing the wrong and the correct ids respectively. + By utilizing the functions in this module, users can ensure uniformity in the format of subject ids, facilitating accurate tracking and analysis of individual subjects within the dataset. **7. Dictionary based substitution ** This module facilitates dictionary-based substitution, which involves replacing existing values with predefined ones. It replaces entries in a specific columns to certain values, such as substituting 1 with “male” and 2 with “female” in a gender column. It also interoperates seamlessly with the `get_meta_data()` function from {readepi} R package. +Note that the `clean_using_dictionary()` function will return a warning when it detects unexpected values in the target columns from the data dictionary. +These unexpected values can be added to the data dictionary using the `add_to_dictionary()` function. + - **Main function:** `clean_using_dictionary()` - **Input:** Accepts a `data.frame` or `linelist` object, along with a data dictionary featuring the following column names: _options_, _values_, and _order_. - **Output:** Returns the input object where the specified options are replaced by their corresponding values. 
@@ -191,16 +195,13 @@ This module facilitates dictionary-based substitution, which involves replacing By leveraging the `clean_using_dictionary()` function, users can streamline and standardize the values within specific columns based on predefined mappings, enhancing consistency and accuracy in the dataset. -Note that the `clean_using_dictionary()` function will return a warning when it detects unexpected values in the target columns from the data dictionary. -The unexpected values can be added to the data dictionary using the `add_to_dictionary()` function. - **8. Conversion of values when necessary ** -This module is designed to convert numbers written in letters to numerical values, ensuring interoperability with the `{numberize}` package. +This module is designed to convert numbers written in letters to numerical values, ensuring interoperability with the [{numberize}](https://epiverse-trace.github.io/numberize/) package. - **Main function:** `convert_to_numeric()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: - * A vector of column names to be converted into numeric, and + * A vector of column names to be converted into numeric. When not provided, the target columns are those returned by the `scan_data()` function * A string that denotes the language used in the text. The current version supports `English, French and Spanish`. - **Output:** Returns the input object with values in the target columns converted into numeric format. - **Report:** Generates a three-column table with index, column, and value for each unrecognized value in the dataset (strings that could not be converted into numeric). @@ -208,8 +209,6 @@ This module is designed to convert numbers written in letters to numerical value * explicit By employing the `convert_to_numeric()` function, users can seamlessly transform numeric representations written in letters into numerical values, ensuring compatibility with the {numberize} package and promoting accuracy in numerical analysis. - -Note that `convert_to_numeric()` will issue a warning for unexpected values and return them in the report. **9. Verification of the sequence of date-events ** @@ -218,7 +217,7 @@ This module provides functions to verify whether the sequence of date events ali - **Main function:** `check_date_sequence()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: * A vector of date column names to be considered -- **Output:** Returns the input object with incorrect rows removed if specified. +- **Output:** Returns the input object where the incorrect rows have been removed. - **Report:** Generates a table listing the incorrect rows from the specified columns. - **Mode:** * explicit @@ -227,7 +226,7 @@ By using the `check_date_sequence()` function, users can systematically validate **10. Transformation of selected columns** -This module is dedicated to performing various specialized operations related to epidemiological data analytics, and it currently includes the following functions: +This module is dedicated to performing various specialized operations related to epidemiological data analytics. 
In the current version of the package, this module includes the following functions: - **Main function:** `timespan()` - **Input:** Accepts a `data.frame` or `linelist` object, along with: From 46fb2cac10d7557a06862ba6de4bb13abaa29d95 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Thu, 8 Aug 2024 22:57:19 +0000 Subject: [PATCH 15/22] allow for double counting of numeric values as date when appropriate --- R/clean_data_helpers.R | 68 +++++++++++++++++++++++++--------- man/scan_data.Rd | 6 +++ man/scan_in_character.Rd | 4 +- vignettes/design_principle.Rmd | 7 +++- 4 files changed, 65 insertions(+), 20 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 84304c46..6e5d62af 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -9,6 +9,14 @@ #' columns representing their column names, proportion of missing, numeric, #' date, character, and logical values. #' +#' @details +#' When a numeric value is found in a character column which also contains Date +#' values, the numeric ones which are potentially of type Date (a numeric, which +# after conversion to Date, fall within the interval +# [50 years back from today's date, today's date]) will add to the date count +#' as well as to the numeric count. For this reason, the sum across rows in the +#' output object could be greater than 1. +#' #' @export #' #' @examples @@ -49,10 +57,12 @@ scan_data <- function(data) { # unclass the data to prevent from warnings when dealing with linelist, and # scan through the character columns data <- as.data.frame(data)[, target_columns] - scan_result <- vapply(data, scan_in_character, numeric(5L)) + scan_result <- vapply(seq_len(ncol(data)), function(col_index) { + scan_in_character(data[[col_index]], names(data)[[col_index]]) + }, numeric(5L)) scan_result <- as.data.frame(t(scan_result)) names(scan_result) <- c("missing", "numeric", "date", "character", "logical") - scan_result <- cbind(Field_names = rownames(scan_result), scan_result) + scan_result <- cbind(Field_names = names(data), scan_result) rownames(scan_result) <- NULL return(scan_result) } @@ -60,34 +70,44 @@ scan_data <- function(data) { #' Scan through a character column #' #' @param x The input character vector +#' @param x_name The name of the corresponding column #' #' @return A numeric vector with the proportion of the different types of data #' that were detected within the input vector. #' @keywords internal #' -scan_in_character <- function(x) { +scan_in_character <- function(x, x_name) { # There might be, within a character column, values of type: # character, numeric, date (date or date-time), NA, and logical # In this function, we check the presence of these different types within a # character column. # save the variable length - initial_length <- length(x) + # the character count is decreased by the number of occurrence a different + # data type is found. + initial_length <- character_count <- length(x) # get the count of missing data (NA) na_count <- sum(is.na(x)) x <- x[!is.na(x)] + character_count <- character_count - na_count # We will check if there is any Date values within the variable by parsing the # values, looking for the ones that fit any of the predefined format. - # When there is one or more Date values, we will convert the remaining - # values into numeric and determine if any of them is a Date (a numeric, which + # When there is one or more Date values, the remaining values are + # converted into numeric. The first numeric count is recorded at this point. 
+ # If any of these numeric values is a Date (a numeric, which # after conversion to Date, fall within the interval - # [50 years back from today's date, today's date]). That way the Date count is - # the count of date identified from the parsing + the count of Dates within - # the numeric values. + # [50 years back from today's date, today's date]), it will added to the date + # count. That way the Date count is the count of date identified from the + # parsing + the count of Dates within the numeric values. + # + # NOTE: This is what justifies that the sum across rows in the output object + # could be > 1. + # # When there is no Date values identified from the parsing, the variable - # is converted into numeric. The numeric count is the sum of numeric values. + # is converted into numeric. The final numeric count is the sum of all the + # identified numeric values. # The logical count is the number of TRUE and FALSE written in both lower # and upper cases within the variable. # The remaining values will be considered of type character. @@ -111,31 +131,43 @@ scan_in_character <- function(x) { # get the date count date_count <- date_count + sum(!is.na(are_date)) + character_count <- character_count - date_count # convert to numeric and check for the presence of Date among the numeric non_date <- x[is.na(are_date)] are_numeric <- suppressWarnings(as.numeric(non_date)) + character_count <- character_count - sum(!is.na(are_numeric)) + numeric_count <- numeric_count + sum(!is.na(are_numeric)) if (sum(!is.na(are_numeric)) > 0L) { y <- lubridate::as_date( are_numeric[!is.na(are_numeric)], origin = oldest_date ) - # second count of date values coming from date within numeric - date_count <- date_count + sum(!is.na(y)) - numeric_count <- sum(is.na(y)) + # send a warning to inform about the presence of numeric values that could + # potentially be Date + if (sum(!is.na(y)) > 0L) { + # second count of date values coming from date within numeric + date_count <- date_count + sum(!is.na(y)) + warning( + sprintf( + "'%s' contains %d numeric values that are potentially of type Date." + , x_name, + sum(!is.na(y)) + ), + call. = FALSE + ) + } } } else { are_numeric <- suppressWarnings(as.numeric(x)) - numeric_count <- sum(!is.na(are_numeric)) + numeric_count <- numeric_count + sum(!is.na(are_numeric)) + character_count <- character_count - numeric_count } # get logical count logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE" logical_count <- sum(logicals) - - # get the character count - character_count <- initial_length - - (na_count + logical_count + numeric_count + date_count) + character_count <- character_count - logical_count # transform into proportions props <- round( diff --git a/man/scan_data.Rd b/man/scan_data.Rd index a9d511d9..a9bd239b 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -21,6 +21,12 @@ date, character, and logical values. Scan through all character columns of a data frame to determine the proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values. } +\details{ +When a numeric value is found in a character column which also contains Date +values, the numeric ones which are potentially of type Date (a numeric, which +as well as to the numeric count. For this reason, the sum across rows in the +output object could be greater than 1. 
+} \examples{ # scan through a data frame of characters scan_result <- scan_data( diff --git a/man/scan_in_character.Rd b/man/scan_in_character.Rd index 83559951..7d2a8a7a 100644 --- a/man/scan_in_character.Rd +++ b/man/scan_in_character.Rd @@ -4,10 +4,12 @@ \alias{scan_in_character} \title{Scan through a character column} \usage{ -scan_in_character(x) +scan_in_character(x, x_name) } \arguments{ \item{x}{The input character vector} + +\item{x_name}{The name of the corresponding column} } \value{ A numeric vector with the proportion of the different types of data diff --git a/vignettes/design_principle.Rmd b/vignettes/design_principle.Rmd index 457107ca..f6a82ad8 100644 --- a/vignettes/design_principle.Rmd +++ b/vignettes/design_principle.Rmd @@ -37,7 +37,12 @@ In addition, this package also has two surrogate functions: * numbers written in letters. When the input data contains character columns, the function returns a data frame with the same number of row as the character columns and six columns representing their column names, proportion of missing, numeric, date, character, and logical values. We transpose the result relative to the input dataset (columns in the input are returned as row) to avoid horizontal scrolling in the case of datasets with a large number of character columns. -The sum of the proportion across all columns is expected to be 1 as it represent the ratio between the count of each data type and the total number of observations in the data. +The sum of the proportion across all columns is not always equal to 1 for the reason below: + * When a numeric value is found in a character column which also contains Date + values, the numeric ones which are potentially of type Date (a numeric, which + after conversion to Date, fall within the interval + [50 years back from today's date, today's date]) will add to the date count + as well as to the numeric count. > There is no ambiguity for the columns of type Date, logical, and numeric. Values in such columns are expected to be of the same type. Hence, the function will not be applied on columns other than character columns. Consequently, it invisibly returns `NA` when applied on a dataset with no character columns, after printing out a message about the absence of character columns from the input dataset. From 1a4e1390f52ee727ad71ff9ecca908e872357c72 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Tue, 20 Aug 2024 18:17:23 +0000 Subject: [PATCH 16/22] add cli as a dependency --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 54b48d9e..1cb4ee92 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,7 @@ Depends: Imports: arsenal, checkmate, + cli, dplyr, janitor, linelist (>= 1.0.0), From 1af8dd0e34d885899600937c2be929135f942a88 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Tue, 20 Aug 2024 18:18:14 +0000 Subject: [PATCH 17/22] update scan_data() internal functions --- R/clean_data_helpers.R | 115 +++++++++++++++-------- man/scan_data.Rd | 15 ++- tests/testthat/test-clean_data_helpers.R | 20 +++- 3 files changed, 106 insertions(+), 44 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 6e5d62af..d430ab5a 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -10,12 +10,24 @@ #' date, character, and logical values. 
#' #' @details -#' When a numeric value is found in a character column which also contains Date -#' values, the numeric ones which are potentially of type Date (a numeric, which -# after conversion to Date, fall within the interval -# [50 years back from today's date, today's date]) will add to the date count -#' as well as to the numeric count. For this reason, the sum across rows in the -#' output object could be greater than 1. +#' How does it work? +#' The character columns are identified first. When there is no character column +#' the function returns a message. +#' For every character column, we look for the presence of date values. +#' When Date values are found, the first count of dates is recorded. These Date +#' values will be in turn converted to numeric. If any numeric value is detected +#' among them, the first count of numeric values is recorded. +#' The remaining values are then converted to numeric. The second numeric count +#' will be recorded from this. The detected numeric values will also be +#' converted into Date to identify the ones which are potentially of type Date +#' (a numeric, which after conversion to Date, fall within the interval +# [50 years back from today's date, today's date]). Those that turns out to be +# Date values are counted in the second count of dates. +# For this reason, the sum across rows in the output object could be greater +# than 1. +# In the absence of Date values, the entire column is converted into numeric to +# record the numeric count. +# The logical and character counts will subsequently be evaluated. #' #' @export #' @@ -56,7 +68,7 @@ scan_data <- function(data) { # unclass the data to prevent from warnings when dealing with linelist, and # scan through the character columns - data <- as.data.frame(data)[, target_columns] + data <- as.data.frame(data)[, target_columns, drop = FALSE] scan_result <- vapply(seq_len(ncol(data)), function(col_index) { scan_in_character(data[[col_index]], names(data)[[col_index]]) }, numeric(5L)) @@ -94,13 +106,18 @@ scan_in_character <- function(x, x_name) { # We will check if there is any Date values within the variable by parsing the # values, looking for the ones that fit any of the predefined format. - # When there is one or more Date values, the remaining values are - # converted into numeric. The first numeric count is recorded at this point. + # When there is one or more Date values, they will be converted into + # numeric. The first numeric count is recorded at this point. The rest of the + # values are converted into numeric, and the second count of numeric is + # recorded. They will in turn be converted into date. # If any of these numeric values is a Date (a numeric, which # after conversion to Date, fall within the interval - # [50 years back from today's date, today's date]), it will added to the date - # count. That way the Date count is the count of date identified from the - # parsing + the count of Dates within the numeric values. + # [50 years back from today's date, today's date]), it will add to the second + # date count. + # That way the Date count is the count of date identified from the + # parsing + the count of Dates within the numeric values. Similarly, the + # numeric count is the count of numeric values within dates values and count + # among the non-date values. # # NOTE: This is what justifies that the sum across rows in the output object # could be > 1. 
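To make the double counting concrete, here is a sketch based on the small column used in the package tests; `"20210702"` is read both as the number 20210702 and as the date 2021-07-02, so it contributes to both the numeric and the date proportions:

```r
dat <- data.frame(col1 = c("20210702", "2021/07/03", "3", "test"))
scan_data(data = dat)
# the proportions of (missing, numeric, date, character, logical) for `col1`
# are expected to be 0, 0.5, 0.5, 0.25 and 0, which sum to more than 1; the
# column is also flagged as containing values that can be either numeric or date
```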
@@ -123,42 +140,62 @@ scan_in_character <- function(x, x_name) { ) ) - # getting the date and numeric count as describe above - date_count <- numeric_count <- 0L + # when date values are identified, check if they are at the same time numeric + # and get the count of ambiguous + # convert the rest to numeric and check if they can also translate to data and + # get the second count of ambiguous + date_count <- ambiguous_count <- numeric_count <- 0L if (sum(!is.na(are_date)) > 0L) { - # Setting the first date to 50 years before the current date - oldest_date <- seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")[[2L]] - - # get the date count + # get the date count and the indices of the date values date_count <- date_count + sum(!is.na(are_date)) + are_date_idx <- which(!is.na(are_date)) + + # convert the date values into numeric and check if some of them are also + # numeric. If some are, get the first count of ambiguous and numeric values + are_numeric_in_dates <- suppressWarnings(as.numeric(x[are_date_idx])) + ambiguous_count <- ambiguous_count + sum(!is.na(are_numeric_in_dates)) + numeric_count <- ambiguous_count + + # getting out of this condition with non-date values character_count <- character_count - date_count + x <- x[-are_date_idx] - # convert to numeric and check for the presence of Date among the numeric - non_date <- x[is.na(are_date)] - are_numeric <- suppressWarnings(as.numeric(non_date)) - character_count <- character_count - sum(!is.na(are_numeric)) + # convert the remaining values into numeric. + # then check if any of them can be a date value + are_numeric <- suppressWarnings(as.numeric(x)) numeric_count <- numeric_count + sum(!is.na(are_numeric)) - if (sum(!is.na(are_numeric)) > 0L) { - y <- lubridate::as_date( - are_numeric[!is.na(are_numeric)], - origin = oldest_date - ) - # send a warning to inform about the presence of numeric values that could - # potentially be Date - if (sum(!is.na(y)) > 0L) { + are_numeric_idx <- which(!is.na(are_numeric)) + if (length(are_numeric_idx) > 0L) { + numeric_values <- as.numeric(x[are_numeric_idx]) + x <- x[-are_numeric_idx] + character_count <- character_count - length(are_numeric_idx) + + # convert the numeric values into date. + # If some are date, get the second count of ambiguous and date values + + # Setting the first date to 50 years before the current date + oldest_date <- seq.Date( + Sys.Date(), length.out = 2L, by = "-50 years" + )[[2L]] + + date_values <- lubridate::as_date(numeric_values) + valid_dates <- date_values >= oldest_date & date_values <= Sys.Date() + if (any(valid_dates)) { # second count of date values coming from date within numeric - date_count <- date_count + sum(!is.na(y)) - warning( - sprintf( - "'%s' contains %d numeric values that are potentially of type Date." - , x_name, - sum(!is.na(y)) - ), - call. 
= FALSE - ) + date_count <- date_count + sum(valid_dates) + ambiguous_count <- ambiguous_count + sum(valid_dates) } } + + # send a warning about the number of ambiguous values found on that column + if (ambiguous_count > 0) { + cli::cli_alert_warning(c( + "i" = "Found {ambiguous_count} values that can be either numeric or", + "date in column `{x_name}`" + )) + } } else { + # convert everything to numeric and get numeric count are_numeric <- suppressWarnings(as.numeric(x)) numeric_count <- numeric_count + sum(!is.na(are_numeric)) character_count <- character_count - numeric_count diff --git a/man/scan_data.Rd b/man/scan_data.Rd index a9bd239b..8305fb7a 100644 --- a/man/scan_data.Rd +++ b/man/scan_data.Rd @@ -22,10 +22,17 @@ Scan through all character columns of a data frame to determine the proportion of \code{missing}, \code{numeric}, \code{Date}, \code{character}, \code{logical}, values. } \details{ -When a numeric value is found in a character column which also contains Date -values, the numeric ones which are potentially of type Date (a numeric, which -as well as to the numeric count. For this reason, the sum across rows in the -output object could be greater than 1. +How does it work? +The character columns are identified first. When there is no character column +the function returns a message. +For every character column, we look for the presence of date values. +When Date values are found, the first count of dates is recorded. These Date +values will be in turn converted to numeric. If any numeric value is detected +among them, the first count of numeric values is recorded. +The remaining values are then converted to numeric. The second numeric count +will be recorded from this. The detected numeric values will also be +converted into Date to identify the ones which are potentially of type Date +(a numeric, which after conversion to Date, fall within the interval } \examples{ # scan through a data frame of characters diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index 880b2c9d..940413fc 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -26,9 +26,27 @@ test_that("scan_data works as expected", { # using a data with some character columns dat <- readRDS(system.file("extdata", "test_linelist.RDS", package = "cleanepi")) - scan_result <- suppressWarnings(scan_data(data = dat)) + scan_result <- scan_data(data = dat) expect_identical(ncol(scan_result), 6L) expect_identical(nrow(scan_result), 2L) expect_false(nrow(scan_result) == ncol(dat)) expect_identical(scan_result[["Field_names"]], c("id", "age_class")) + + # use data where output is easily predictable. the data contains: + # 1 character value + # 1 date value + # 1 numeric value which also corresponds to the date value above, hence the + # warning about the presence of ambiguous data + dat <- data.frame(col1 = c("20210702", "test")) + scan_result <- scan_data(data = dat) + expect_identical(as.numeric(scan_result[1L, -1L]), c(0, 0.5, 0.5, 0.5, 0)) + + # use data where output is easily predictable. 
the data contains: + # 1 character value + # 1 date value + # 1 numeric value which also corresponds to the date value above, hence the + # warning about the presence of ambiguous data + dat <- data.frame(col1 = c(c("20210702", "2021/07/03", "3"), "test")) + scan_result <- scan_data(data = dat) + expect_identical(as.numeric(scan_result[1L, -1L]), c(0, 0.5, 0.5, 0.25, 0)) }) From 356a14341ba052b0ad4b524abd2f6e0dd17091f8 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Wed, 21 Aug 2024 13:18:39 +0000 Subject: [PATCH 18/22] fix linters --- R/clean_data_helpers.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index d430ab5a..25ba2d0f 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -189,10 +189,10 @@ scan_in_character <- function(x, x_name) { # send a warning about the number of ambiguous values found on that column if (ambiguous_count > 0) { - cli::cli_alert_warning(c( - "i" = "Found {ambiguous_count} values that can be either numeric or", - "date in column `{x_name}`" - )) + cli::cli_alert_warning( + "Found {ambiguous_count} values that can be either numeric or date in", + "column `{x_name}`" + ) } } else { # convert everything to numeric and get numeric count From a372271cff2d3aff08a20bf90e2d5be761759470 Mon Sep 17 00:00:00 2001 From: Karim-Mane Date: Wed, 21 Aug 2024 15:21:47 +0000 Subject: [PATCH 19/22] remove the else statement --- R/clean_data_helpers.R | 13 +++++++------ tests/testthat/test-clean_data_helpers.R | 4 ++-- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R index 25ba2d0f..e6122b20 100644 --- a/R/clean_data_helpers.R +++ b/R/clean_data_helpers.R @@ -181,7 +181,8 @@ scan_in_character <- function(x, x_name) { date_values <- lubridate::as_date(numeric_values) valid_dates <- date_values >= oldest_date & date_values <= Sys.Date() if (any(valid_dates)) { - # second count of date values coming from date within numeric + # get the second count of date and ambiguous values coming from date + # within numeric date_count <- date_count + sum(valid_dates) ambiguous_count <- ambiguous_count + sum(valid_dates) } @@ -194,13 +195,13 @@ scan_in_character <- function(x, x_name) { "column `{x_name}`" ) } - } else { - # convert everything to numeric and get numeric count - are_numeric <- suppressWarnings(as.numeric(x)) - numeric_count <- numeric_count + sum(!is.na(are_numeric)) - character_count <- character_count - numeric_count } + # convert everything to numeric and get the numeric count + are_numeric <- suppressWarnings(as.numeric(x)) + numeric_count <- numeric_count + sum(!is.na(are_numeric)) + character_count <- character_count - sum(!is.na(are_numeric)) + # get logical count logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE" logical_count <- sum(logicals) diff --git a/tests/testthat/test-clean_data_helpers.R b/tests/testthat/test-clean_data_helpers.R index 940413fc..51c5a919 100644 --- a/tests/testthat/test-clean_data_helpers.R +++ b/tests/testthat/test-clean_data_helpers.R @@ -43,8 +43,8 @@ test_that("scan_data works as expected", { # use data where output is easily predictable. 
the data contains:
   # 1 character value
-  # 1 date value
-  # 1 numeric value which also corresponds to the date value above, hence the
+  # 2 date values (one of them `20210702` is also numeric)
+  # 2 numeric values, one of which is also a date value, hence the
   # warning about the presence of ambiguous data
   dat <- data.frame(col1 = c(c("20210702", "2021/07/03", "3"), "test"))
   scan_result <- scan_data(data = dat)

From fa1d9d7b81c4bf306343f9fb8d08e2659b26c580 Mon Sep 17 00:00:00 2001
From: Karim-Mane
Date: Wed, 21 Aug 2024 15:31:34 +0000
Subject: [PATCH 20/22] update how oldest_date is calculated

---
 R/clean_data_helpers.R | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R
index e6122b20..77ea3cad 100644
--- a/R/clean_data_helpers.R
+++ b/R/clean_data_helpers.R
@@ -173,11 +173,10 @@ scan_in_character <- function(x, x_name) {
       # convert the numeric values into date.
       # If some are date, get the second count of ambiguous and date values
 
-      # Setting the first date to 50 years before the current date
-      oldest_date <- seq.Date(
-        Sys.Date(), length.out = 2L, by = "-50 years"
-      )[[2L]]
+      # Set the first date to 50 years before the current date
+      oldest_date <- Sys.Date() - lubridate::years(50)
 
+      # identify potential date values and increment date and ambiguous counts
       date_values <- lubridate::as_date(numeric_values)
       valid_dates <- date_values >= oldest_date & date_values <= Sys.Date()
       if (any(valid_dates)) {

From 0be056a864eefda1c1c5ec898d2b8a01d0fbd6af Mon Sep 17 00:00:00 2001
From: Karim-Mane
Date: Wed, 21 Aug 2024 15:37:57 +0000
Subject: [PATCH 21/22] update counting approach

---
 R/clean_data_helpers.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R
index 77ea3cad..d040a105 100644
--- a/R/clean_data_helpers.R
+++ b/R/clean_data_helpers.R
@@ -147,13 +147,13 @@ scan_in_character <- function(x, x_name) {
   date_count <- ambiguous_count <- numeric_count <- 0L
   if (sum(!is.na(are_date)) > 0L) {
     # get the date count and the indices of the date values
-    date_count <- date_count + sum(!is.na(are_date))
+    date_count <- sum(!is.na(are_date))
     are_date_idx <- which(!is.na(are_date))
 
     # convert the date values into numeric and check if some of them are also
     # numeric. If some are, get the first count of ambiguous and numeric values
     are_numeric_in_dates <- suppressWarnings(as.numeric(x[are_date_idx]))
-    ambiguous_count <- ambiguous_count + sum(!is.na(are_numeric_in_dates))
+    ambiguous_count <- sum(!is.na(are_numeric_in_dates))
     numeric_count <- ambiguous_count

From 939c9e10cdb4f1b228c3b50ba7be6b8d65b0b11d Mon Sep 17 00:00:00 2001
From: Karim-Mane
Date: Mon, 2 Sep 2024 14:52:04 +0000
Subject: [PATCH 22/22] activate evaluation of remove_constants() code chunk

---
 vignettes/cleanepi.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vignettes/cleanepi.Rmd b/vignettes/cleanepi.Rmd
index 7187ea8c..829fad5a 100644
--- a/vignettes/cleanepi.Rmd
+++ b/vignettes/cleanepi.Rmd
@@ -177,7 +177,7 @@ The `remove_constants()` function can be used to remove such “noise”. The fu
 1. **data**: the input data frame or linelist,
 2. **cutoff**: a numeric, between `0` and `1`, to be used when removing empty rows and columns. When provided, only rows and columns where the percent of missing data is greater than this cut-off will be removed. Rows and columns with 100% missing values will be removed by default. 
-```{r echo=FALSE, eval=TRUE} +```{r eval=TRUE} # IMPORT THE INPUT DATA data <- readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")) @@ -390,7 +390,7 @@ report$multi_format_dates %>% ## Standardizing subject IDs -### Detecting and remove incorrect subject ids +### Detecting and removing incorrect subject ids The `check_subject_ids()` function is designed to identify rows from the input dataset where the ids don't comply with the expected subject ids format. It expects the following parameters: