update the scan_date() function - only scan through character columns

epiverse-trace · Aug 5, 2024 · 83a9150 · 83a9150
1 parent 34ad20f
commit 83a9150
Show file tree

Hide file tree

Showing 10 changed files with 151 additions and 295 deletions.
diff --git a/R/clean_data_helpers.R b/R/clean_data_helpers.R
@@ -1,222 +1,147 @@
-
-#' Calculate the percentage of missing and other data type values in a vector
-#' containing different data types such as numeric, Date, character,
-#' logical, date-time, factor.
-#'
-#' @param x A vector of ones or a combination of various data types.
-#' @param type A character with the the vector type.
-#'
-#' @returns A vector of 5 elements representing the percentage of missing,
-#' numeric, date, character, and logical values found in the input vector.
-#'
-#' @keywords internal
-#'
-scan_columns <- function(x, type) {
-  res <- switch(type,
-                double = scan_in_double(x),
-                integer = scan_in_integer(x),
-                logical = scan_in_logical(x),
-                character = scan_in_character(x))
-  return(res)
-}
-
-#' Scan a data frame to determine the percentage of `missing`, `numeric`,
-#'    `Date`, `character`, `logical`, `date-time`, and `factor` values in every
-#'    column.
+#' Scan through all character columns of a data frame to determine the
+#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values.
 #'
 #' @param data A data frame or linelist
 #'
-#' @returns A data frame or linelist with the same number of rows as the number
-#'    of columns of the input data, and 8 column representing the field names,
-#'    the percentage of missing, numeric, date, character, logical, date-time,
-#'    and factor values in each column.
+#' @returns A data frame with the same number of rows as the number of character
+#'    columns of the input data, and six (06) columns representing the field
+#'    names, the percentage of missing, numeric, date, character, and logical,
+#'    values in each column.
 #'
 #' @export
 #'
 #' @examples
+#' # scan through a data frame of characters
 #' scan_result <- scan_data(
 #'   data = readRDS(system.file("extdata", "messy_data.RDS",
 #'                              package = "cleanepi"))
 #' )
 #'
-#' @details
-#' For columns of type character, the detected numeric values could actually be
-#' of type Date or date-time. This is because R coerces some Date values into
-#' numeric when the date is imported from an MS Excel file.
+#' # scan through a data frame with two character columns
+#' scan_result <- scan_data(
+#'   data = readRDS(system.file("extdata", "test_linelist.RDS",
+#'                              package = "cleanepi"))
+#'   )
+#'
+#' # scan through a data frame with no character columns
+#' data(iris)
+#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris),
+#'                            replace = TRUE))
+#' iris[["lgl"]]  <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE)
+#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"),
+#'                                    to = as.Date("2024-08-30"),
+#'                                    length.out = nrow(iris)))
+#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]])
+#' scan_result        <- scan_data(data = iris)
 #'
 scan_data <- function(data) {
-  types       <- vapply(data, typeof, character(1L))
+  # scan through all columns of the data and the identify character columns
+  types <- vapply(data, typeof, character(1L))
+  target_columns <- types[types == "character"]
+
+  # send an message if there is no character column found within the input data
+  if (length(target_columns) == 0L) {
+    message("No character column found in the provided data.")
+    return(invisible(NA))
+  }
+
+  # scan through the character columns
+  data <- data[, names(target_columns)]
   scan_result <- vapply(seq_len(ncol(data)), function(col_index) {
-    scan_columns(data[[col_index]], types[[col_index]])
-  }, numeric(7L))
+    scan_in_character(data[[col_index]])
+  }, numeric(5L))
   scan_result        <- as.data.frame(t(scan_result))
-  names(scan_result) <- c("missing", "numeric", "date", "character",
-                          "logical", "date-time", "factor")
+  names(scan_result) <- c("missing", "numeric", "date", "character", "logical")
   scan_result        <- cbind(Field_names = names(data), scan_result)
   return(scan_result)
 }
 
-#' Scan through a double column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_double <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # doubles are either numeric (attributes = NULL), or Date (has a 'class'
-  # attributes = Date), or date-time (has a 'class' attributes = POSIXt)
-  if ("class" %in% names(attributes(x))) {
-    if ("Date" %in% attributes(x)[["class"]]) {
-      are_date <- round((length(x) / n_rows), 6L)
-    } else if ("POSIXt" %in% attributes(x)[["class"]]) {
-      are_date_time <- round((length(x) / n_rows), 6L)
-    }
-  } else {
-    are_numeric <- round((length(x) / n_rows), 6L)
-  }
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
-#' Scan through an integer column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_integer <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # integers are either numeric (attributes = NULL), or factors (has a 'class'
-  # and 'levels' attributes)
-  if (is.null(attributes(x))) {
-    are_numeric <- round((length(x) / n_rows), 6L)
-  } else if (identical(names(attributes(x)), c("levels", "class"))) {
-    are_factor <- round((length(x) / n_rows), 6L)
-  }
-
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
-#' Scan through a logical column
-#'
-#' @param x The input vector
-#'
-#' @return A numeric vector with the proportion of the different types of data
-#'    that were detected within the input vector.
-#' @keywords internal
-#'
-scan_in_logical <- function(x) {
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
-
-  # logical are simply logical. We will only determine the %NA and %logical
-  # save the variable length
-  n_rows <- length(x)
-
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
-  x      <- x[!is.na(x)]
-
-  # get the proportion of logical
-  are_logical <- round((length(x) / n_rows), 6L)
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
-}
-
 #' Scan through a character column
 #'
-#' @param x The input vector
+#' @param x The input character vector
 #'
 #' @return A numeric vector with the proportion of the different types of data
 #'    that were detected within the input vector.
 #' @keywords internal
 #'
 scan_in_character <- function(x) {
   # There might be, within a character column, values of type:
-  # character, numeric, date, date-time, NA, and logical
+  # character, numeric, date (date or date-time), NA, and logical
   # In this function, we check the presence of these different types within a
   # character column.
-  # Note that numeric values can actually be of 'Date' or 'date-time' type.
-  # Given that any numeric can be converted into Date, we will not check for
-  # Date or date-time values within the numeric.
-
-  are_factor <- are_date <- are_date_time <- are_character <- are_numeric <-
-    are_logical <- are_na <- 0.0
 
   # save the variable length
-  n_rows <- length(x)
+  initial_length <- length(x)
 
-  # get the proportion of NA
-  are_na <- round((sum(is.na(x)) / n_rows), 6L)
+  # get the count of missing data (NA)
+  na_count <- sum(is.na(x))
   x      <- x[!is.na(x)]
 
-  # get double values and evaluate the proportion numeric values
-  doubles <- x[!is.na(suppressWarnings(as.double(x)))]
-  if (length(doubles) > 0L) {
-    are_numeric <- round((length(doubles) / n_rows), 6L)
-  }
-
-  # get character values and check for the presence of Date and date-time
-  characters <- x[is.na(suppressWarnings(as.double(x)))]
-  if (length(characters) > 0L &&
-      !is.null(lubridate::guess_formats(characters,
-                                        c("ymd", "ydm", "dmy", "mdy", "myd",
-                                          "dym", "Ymd", "Ydm", "dmY", "mdY",
-                                          "mYd", "dYm")))) {
-    # get the proportion of date values
-    tmp    <- suppressWarnings(
-      as.Date(
-        lubridate::parse_date_time(
-          characters,
-          orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
-                     "dmY", "mdY", "mYd", "dYm")
-        )
+  # We will check if there is any Date values within the variable by parsing the
+  # values, looking for the ones that fit any of the predefined format.
+  #     When there is one or more Date values, we will convert the variable into
+  # numeric and determine if any of them is a Date (a numeric, which after
+  # conversion to Date, fall within the interval
+  # [50 years back from today's date, today's date]). That way the Date count is
+  # the count of date identified from the parsing + the count of Dates within
+  # the numeric values.
+  #     When there is no Date values identified from the parsing, the variable
+  # is converted into numeric. The numeric count is the sum of numeric values.
+  #     The logical count is the number of TRUE and FALSE written in both lower
+  # and upper case within the variable
+  #     The remaining values will be of type character.
+
+  # parsing the vector, looking for date values
+  tmp <- suppressWarnings(
+    as.Date(
+      lubridate::parse_date_time(
+        x,
+        orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm",
+                   "dmY", "mdY", "mYd", "dYm")
       )
     )
-    are_date   <- round((sum(!is.na(tmp)) / n_rows), 6L)
-    x          <- x[is.na(tmp)]
-    characters <- characters[is.na(tmp)]
+  )
+
+  # getting the date and numeric count as describe above
+  date_count <- numeric_count <- 0L
+  if (sum(!is.na(tmp)) > 0L) {
+    # Setting the first date to 50 years before the current date
+    target_interval <- sort(
+      seq.Date(Sys.Date(), length.out = 2L, by = "-50 years")
+    )
+
+    # get the date count
+    date_count <- date_count + sum(!is.na(tmp))
+
+    # convert to numeric and check for the presence of Date among the numeric
+    tmp2 <- x[is.na(tmp)]
+    tmp3 <- suppressWarnings(as.numeric(tmp2))
+    if (sum(!is.na(tmp3)) > 0L) {
+      y <- lubridate::as_date(
+        tmp3[!is.na(tmp3)],
+        origin = target_interval[[1L]]
+      )
+      # second count of date values coming from date within numeric
+      date_count <- date_count + sum(!is.na(y))
+      numeric_count <- sum(is.na(y))
+    }
+  } else {
+    tmp <- suppressWarnings(as.numeric(x))
+    numeric_count <- sum(!is.na(tmp))
   }
 
-  # get the proportion of logical values
-  logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE"
-  are_logical <- round((sum(logicals) / n_rows), 6L)
+  # get logical count
+  logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE"
+  logical_count <- sum(logicals)
 
-  # get the proportion of character values
-  are_character <- round((1.0 - (are_na + are_numeric +
-                                   are_date + are_logical)), 6L)
+  # get the character count
+  character_count <- initial_length -
+    (na_count + logical_count + numeric_count + date_count)
 
-  # return the output
-  return(
-    c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time,
-      are_factor)
-  )
+  # transform into proportions
+  props <- round(
+    c(na_count, numeric_count, date_count, character_count, logical_count) /
+      initial_length, 4L)
+
+  return(props)
 }
diff --git a/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/printing-rmd/skeleton/skeleton.Rmd
@@ -59,7 +59,7 @@ p.compact {
 ```{r cleanepi-source-data, eval=TRUE, echo=FALSE}
 # EXTRACT THE REPORT SECTIONS
 scanning_result <- params[["scanning_result"]]
-is_data_scanned <- !is.null(scanning_result)
+is_data_scanned <- !is.na(scanning_result)
 standardized_column_names <- params[["colnames"]]
 are_column_standardised   <- !is.null(standardized_column_names)
 out_of_range_dates           <- params[["out_of_range_dates"]]

diff --git a/man/scan_columns.Rd b/man/scan_columns.Rd