-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update the scan_date() function - only scan through character columns
- Loading branch information
1 parent
34ad20f
commit 83a9150
Showing
10 changed files
with
151 additions
and
295 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,222 +1,147 @@ | ||
|
||
#' Calculate the percentage of missing and other data type values in a vector | ||
#' containing different data types such as numeric, Date, character, | ||
#' logical, date-time, factor. | ||
#' | ||
#' @param x A vector of ones or a combination of various data types. | ||
#' @param type A character with the the vector type. | ||
#' | ||
#' @returns A vector of 5 elements representing the percentage of missing, | ||
#' numeric, date, character, and logical values found in the input vector. | ||
#' | ||
#' @keywords internal | ||
#' | ||
scan_columns <- function(x, type) { | ||
res <- switch(type, | ||
double = scan_in_double(x), | ||
integer = scan_in_integer(x), | ||
logical = scan_in_logical(x), | ||
character = scan_in_character(x)) | ||
return(res) | ||
} | ||
|
||
#' Scan a data frame to determine the percentage of `missing`, `numeric`, | ||
#' `Date`, `character`, `logical`, `date-time`, and `factor` values in every | ||
#' column. | ||
#' Scan through all character columns of a data frame to determine the | ||
#' proportion of `missing`, `numeric`, `Date`, `character`, `logical`, values. | ||
#' | ||
#' @param data A data frame or linelist | ||
#' | ||
#' @returns A data frame or linelist with the same number of rows as the number | ||
#' of columns of the input data, and 8 column representing the field names, | ||
#' the percentage of missing, numeric, date, character, logical, date-time, | ||
#' and factor values in each column. | ||
#' @returns A data frame with the same number of rows as the number of character | ||
#' columns of the input data, and six (06) columns representing the field | ||
#' names, the percentage of missing, numeric, date, character, and logical, | ||
#' values in each column. | ||
#' | ||
#' @export | ||
#' | ||
#' @examples | ||
#' # scan through a data frame of characters | ||
#' scan_result <- scan_data( | ||
#' data = readRDS(system.file("extdata", "messy_data.RDS", | ||
#' package = "cleanepi")) | ||
#' ) | ||
#' | ||
#' @details | ||
#' For columns of type character, the detected numeric values could actually be | ||
#' of type Date or date-time. This is because R coerces some Date values into | ||
#' numeric when the date is imported from an MS Excel file. | ||
#' # scan through a data frame with two character columns | ||
#' scan_result <- scan_data( | ||
#' data = readRDS(system.file("extdata", "test_linelist.RDS", | ||
#' package = "cleanepi")) | ||
#' ) | ||
#' | ||
#' # scan through a data frame with no character columns | ||
#' data(iris) | ||
#' iris[["fct"]] <- as.factor(sample(c("gray", "orange"), nrow(iris), | ||
#' replace = TRUE)) | ||
#' iris[["lgl"]] <- sample(c(TRUE, FALSE), nrow(iris), replace = TRUE) | ||
#' iris[["date"]] <- as.Date(seq.Date(from = as.Date("2024-01-01"), | ||
#' to = as.Date("2024-08-30"), | ||
#' length.out = nrow(iris))) | ||
#' iris[["posit_ct"]] <- as.POSIXct(iris[["date"]]) | ||
#' scan_result <- scan_data(data = iris) | ||
#' | ||
scan_data <- function(data) { | ||
types <- vapply(data, typeof, character(1L)) | ||
# scan through all columns of the data and the identify character columns | ||
types <- vapply(data, typeof, character(1L)) | ||
target_columns <- types[types == "character"] | ||
|
||
# send an message if there is no character column found within the input data | ||
if (length(target_columns) == 0L) { | ||
message("No character column found in the provided data.") | ||
return(invisible(NA)) | ||
} | ||
|
||
# scan through the character columns | ||
data <- data[, names(target_columns)] | ||
scan_result <- vapply(seq_len(ncol(data)), function(col_index) { | ||
scan_columns(data[[col_index]], types[[col_index]]) | ||
}, numeric(7L)) | ||
scan_in_character(data[[col_index]]) | ||
}, numeric(5L)) | ||
scan_result <- as.data.frame(t(scan_result)) | ||
names(scan_result) <- c("missing", "numeric", "date", "character", | ||
"logical", "date-time", "factor") | ||
names(scan_result) <- c("missing", "numeric", "date", "character", "logical") | ||
scan_result <- cbind(Field_names = names(data), scan_result) | ||
return(scan_result) | ||
} | ||
|
||
#' Scan through a double column | ||
#' | ||
#' @param x The input vector | ||
#' | ||
#' @return A numeric vector with the proportion of the different types of data | ||
#' that were detected within the input vector. | ||
#' @keywords internal | ||
#' | ||
scan_in_double <- function(x) { | ||
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- | ||
are_logical <- are_na <- 0.0 | ||
# save the variable length | ||
n_rows <- length(x) | ||
|
||
# get the proportion of NA | ||
are_na <- round((sum(is.na(x)) / n_rows), 6L) | ||
x <- x[!is.na(x)] | ||
|
||
# doubles are either numeric (attributes = NULL), or Date (has a 'class' | ||
# attributes = Date), or date-time (has a 'class' attributes = POSIXt) | ||
if ("class" %in% names(attributes(x))) { | ||
if ("Date" %in% attributes(x)[["class"]]) { | ||
are_date <- round((length(x) / n_rows), 6L) | ||
} else if ("POSIXt" %in% attributes(x)[["class"]]) { | ||
are_date_time <- round((length(x) / n_rows), 6L) | ||
} | ||
} else { | ||
are_numeric <- round((length(x) / n_rows), 6L) | ||
} | ||
return( | ||
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, | ||
are_factor) | ||
) | ||
} | ||
|
||
#' Scan through an integer column | ||
#' | ||
#' @param x The input vector | ||
#' | ||
#' @return A numeric vector with the proportion of the different types of data | ||
#' that were detected within the input vector. | ||
#' @keywords internal | ||
#' | ||
scan_in_integer <- function(x) { | ||
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- | ||
are_logical <- are_na <- 0.0 | ||
# save the variable length | ||
n_rows <- length(x) | ||
|
||
# get the proportion of NA | ||
are_na <- round((sum(is.na(x)) / n_rows), 6L) | ||
x <- x[!is.na(x)] | ||
|
||
# integers are either numeric (attributes = NULL), or factors (has a 'class' | ||
# and 'levels' attributes) | ||
if (is.null(attributes(x))) { | ||
are_numeric <- round((length(x) / n_rows), 6L) | ||
} else if (identical(names(attributes(x)), c("levels", "class"))) { | ||
are_factor <- round((length(x) / n_rows), 6L) | ||
} | ||
|
||
return( | ||
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, | ||
are_factor) | ||
) | ||
} | ||
|
||
#' Scan through a logical column | ||
#' | ||
#' @param x The input vector | ||
#' | ||
#' @return A numeric vector with the proportion of the different types of data | ||
#' that were detected within the input vector. | ||
#' @keywords internal | ||
#' | ||
scan_in_logical <- function(x) { | ||
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- | ||
are_logical <- are_na <- 0.0 | ||
|
||
# logical are simply logical. We will only determine the %NA and %logical | ||
# save the variable length | ||
n_rows <- length(x) | ||
|
||
# get the proportion of NA | ||
are_na <- round((sum(is.na(x)) / n_rows), 6L) | ||
x <- x[!is.na(x)] | ||
|
||
# get the proportion of logical | ||
are_logical <- round((length(x) / n_rows), 6L) | ||
return( | ||
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, | ||
are_factor) | ||
) | ||
} | ||
|
||
#' Scan through a character column | ||
#' | ||
#' @param x The input vector | ||
#' @param x The input character vector | ||
#' | ||
#' @return A numeric vector with the proportion of the different types of data | ||
#' that were detected within the input vector. | ||
#' @keywords internal | ||
#' | ||
scan_in_character <- function(x) { | ||
# There might be, within a character column, values of type: | ||
# character, numeric, date, date-time, NA, and logical | ||
# character, numeric, date (date or date-time), NA, and logical | ||
# In this function, we check the presence of these different types within a | ||
# character column. | ||
# Note that numeric values can actually be of 'Date' or 'date-time' type. | ||
# Given that any numeric can be converted into Date, we will not check for | ||
# Date or date-time values within the numeric. | ||
|
||
are_factor <- are_date <- are_date_time <- are_character <- are_numeric <- | ||
are_logical <- are_na <- 0.0 | ||
|
||
# save the variable length | ||
n_rows <- length(x) | ||
initial_length <- length(x) | ||
|
||
# get the proportion of NA | ||
are_na <- round((sum(is.na(x)) / n_rows), 6L) | ||
# get the count of missing data (NA) | ||
na_count <- sum(is.na(x)) | ||
x <- x[!is.na(x)] | ||
|
||
# get double values and evaluate the proportion numeric values | ||
doubles <- x[!is.na(suppressWarnings(as.double(x)))] | ||
if (length(doubles) > 0L) { | ||
are_numeric <- round((length(doubles) / n_rows), 6L) | ||
} | ||
|
||
# get character values and check for the presence of Date and date-time | ||
characters <- x[is.na(suppressWarnings(as.double(x)))] | ||
if (length(characters) > 0L && | ||
!is.null(lubridate::guess_formats(characters, | ||
c("ymd", "ydm", "dmy", "mdy", "myd", | ||
"dym", "Ymd", "Ydm", "dmY", "mdY", | ||
"mYd", "dYm")))) { | ||
# get the proportion of date values | ||
tmp <- suppressWarnings( | ||
as.Date( | ||
lubridate::parse_date_time( | ||
characters, | ||
orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", | ||
"dmY", "mdY", "mYd", "dYm") | ||
) | ||
# We will check if there is any Date values within the variable by parsing the | ||
# values, looking for the ones that fit any of the predefined format. | ||
# When there is one or more Date values, we will convert the variable into | ||
# numeric and determine if any of them is a Date (a numeric, which after | ||
# conversion to Date, fall within the interval | ||
# [50 years back from today's date, today's date]). That way the Date count is | ||
# the count of date identified from the parsing + the count of Dates within | ||
# the numeric values. | ||
# When there is no Date values identified from the parsing, the variable | ||
# is converted into numeric. The numeric count is the sum of numeric values. | ||
# The logical count is the number of TRUE and FALSE written in both lower | ||
# and upper case within the variable | ||
# The remaining values will be of type character. | ||
|
||
# parsing the vector, looking for date values | ||
tmp <- suppressWarnings( | ||
as.Date( | ||
lubridate::parse_date_time( | ||
x, | ||
orders = c("ymd", "ydm", "dmy", "mdy", "myd", "dym", "Ymd", "Ydm", | ||
"dmY", "mdY", "mYd", "dYm") | ||
) | ||
) | ||
are_date <- round((sum(!is.na(tmp)) / n_rows), 6L) | ||
x <- x[is.na(tmp)] | ||
characters <- characters[is.na(tmp)] | ||
) | ||
|
||
# getting the date and numeric count as describe above | ||
date_count <- numeric_count <- 0L | ||
if (sum(!is.na(tmp)) > 0L) { | ||
# Setting the first date to 50 years before the current date | ||
target_interval <- sort( | ||
seq.Date(Sys.Date(), length.out = 2L, by = "-50 years") | ||
) | ||
|
||
# get the date count | ||
date_count <- date_count + sum(!is.na(tmp)) | ||
|
||
# convert to numeric and check for the presence of Date among the numeric | ||
tmp2 <- x[is.na(tmp)] | ||
tmp3 <- suppressWarnings(as.numeric(tmp2)) | ||
if (sum(!is.na(tmp3)) > 0L) { | ||
y <- lubridate::as_date( | ||
tmp3[!is.na(tmp3)], | ||
origin = target_interval[[1L]] | ||
) | ||
# second count of date values coming from date within numeric | ||
date_count <- date_count + sum(!is.na(y)) | ||
numeric_count <- sum(is.na(y)) | ||
} | ||
} else { | ||
tmp <- suppressWarnings(as.numeric(x)) | ||
numeric_count <- sum(!is.na(tmp)) | ||
} | ||
|
||
# get the proportion of logical values | ||
logicals <- toupper(characters) == "TRUE" | toupper(characters) == "FALSE" | ||
are_logical <- round((sum(logicals) / n_rows), 6L) | ||
# get logical count | ||
logicals <- toupper(x) == "TRUE" | toupper(x) == "FALSE" | ||
logical_count <- sum(logicals) | ||
|
||
# get the proportion of character values | ||
are_character <- round((1.0 - (are_na + are_numeric + | ||
are_date + are_logical)), 6L) | ||
# get the character count | ||
character_count <- initial_length - | ||
(na_count + logical_count + numeric_count + date_count) | ||
|
||
# return the output | ||
return( | ||
c(are_na, are_numeric, are_date, are_character, are_logical, are_date_time, | ||
are_factor) | ||
) | ||
# transform into proportions | ||
props <- round( | ||
c(na_count, numeric_count, date_count, character_count, logical_count) / | ||
initial_length, 4L) | ||
|
||
return(props) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.