Skip to content

Commit

Permalink
Add function 'verify.data.frame.columns'
Browse files Browse the repository at this point in the history
Add the function 'verify.data.frame.columns' to the miscellaneous utilities to check the existence and well-typedness of columns in a dataframe.

This works towards fixing #208.

Signed-off-by: Maximilian Löffler <s8maloef@stud.uni-saarland.de>
  • Loading branch information
maxloeffler committed Dec 24, 2022
1 parent 9c85fcd commit d1d9a03
Show file tree
Hide file tree
Showing 2 changed files with 137 additions and 0 deletions.
67 changes: 67 additions & 0 deletions tests/test-misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,73 @@ test_that("Match argument or take default.", {
expect_equal(actual.result, expected.result, info = "Multiple choices with ignored default, two choices")
})

##
## Check presence and datatype of column.
##

test_that("Check presence and datatype of column.", {

    user.names = c("John", "Peter", "Maria", "Susanne")

    ## contains NaN to verify functionality does not break
    age = c(42, 50, NaN, 66)

    ## contains NA to verify functionality does not break
    is.male = c(TRUE, TRUE, FALSE, NA)

    ## construct simple testing dataframe
    ## (deliberately not named 'data.frame' to avoid shadowing the base constructor)
    test.data = data.frame(user.names, age, is.male)

    ## Note on the expectations below:
    ## - 'expect_error' takes its human-readable description via 'info'; it has no 'message' parameter.
    ## - 'expect_no_error' interprets 'message' as a regular expression that restricts which errors
    ##   are considered, so the description is given as a comment instead.

    ## 1) Check base functionality (benign use-case):
    ##    All columns present and well-typed.
    expect_no_error(verify.data.frame.columns(
        test.data, c("user.names", "age", "is.male"), c("character", "numeric", "logical")))

    ## 2) Base test with reordered columns:
    ##    Order of columns does not matter.
    expect_no_error(verify.data.frame.columns(
        test.data, c("is.male", "age", "user.names"), c("logical", "numeric", "character")))

    ## 3) Specify less columns than present (Allow optional columns):
    ##    Optional columns are allowed.
    expect_no_error(verify.data.frame.columns(
        test.data, c("user.names", "age"), c("character", "numeric")))

    ## 4) Unequal amount of column names and datatypes
    expect_error(verify.data.frame.columns(
        test.data, c("user.names", "age", "is.male"), c("character", "numeric")),
        info = "More column names specified than datatypes.")
    expect_error(verify.data.frame.columns(
        test.data, c("user.names", "age"), c("character", "numeric", "logical")),
        info = "More datatypes specified than column names.")

    ## 5) Datatypes do not match column names
    expect_error(verify.data.frame.columns(
        test.data, c("user.names", "age", "is.male"), c("logical", "character", "numeric")),
        info = "Column names do not match datatypes.")

    ## 6) Invalid column / Column not present in dataframe (Typo)
    expect_error(verify.data.frame.columns(
        test.data, c("user.name"), c("character")),
        info = "Column is not present in the dataframe.")

    ## 7) No datatypes specified and column names are present:
    ##    Only the presence of the columns is checked.
    expect_no_error(verify.data.frame.columns(
        test.data, c("user.names", "age", "is.male")))

    ## 8) No datatypes specified but column names are not present (Typo)
    expect_error(verify.data.frame.columns(
        test.data, c("user.name")),
        info = "Column is not present in the dataframe (no datatypes specified).")

    ## 9) Too many column names and no datatypes specified
    expect_error(verify.data.frame.columns(
        test.data, c("user.names", "age", "is.male", "job.orientation")),
        info = "More column names specified than present in the dataframe.")

})

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Date handling -----------------------------------------------------------

Expand Down
70 changes: 70 additions & 0 deletions util-misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,76 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE
}
}

#' Check if a dataframe matches a given structure. This includes the dataframe to contain columns
#' which must match the column names in \code{columns} and the datatypes in \code{data.types}.
#'
#' Columns of the dataframe that are not mentioned in \code{columns} are ignored. The verification
#' stops with an error (which is also logged) on the first violation that is found.
#'
#' @param data the dataframe under investigation for structural conformity
#' @param columns a character vector containing the column names the data frame should include
#' @param data.types an ordered vector containing the data types corresponding to the columns.
#'                   This vector must be of same length of the vector of \code{columns}.
#'                   A data type matches if it is among the classes of the column (as given by
#'                   \code{base::class}). The special value \code{"list()"} demands that the column
#'                   is a list (checked via \code{base::is.list}). If \code{NULL}, only the presence
#'                   of the columns is checked.
#'                   [default: NULL]
#'
#' @return \code{NULL} (invisibly) if all checks pass
verify.data.frame.columns = function(data, columns, data.types = NULL) {

    ## every column of the data frame must be one to one mapped to a datatype expected in the column
    ## therefore if there aren't as many datatypes provided in \code{data.types} as column names have
    ## been provided in \code{columns} we can stop here already.
    if (!is.null(data.types) && length(columns) != length(data.types)) {
        error.message = "If specified, the length of the two given vectors columns and data.types must match."
        logging::logerror(error.message)
        stop(error.message)
    }

    ## obtain vector of all column names included in the dataframe to ease further checks.
    data.frame.columns = colnames(data)

    ## iterate over all columns in \code{columns}
    for (i in seq_along(columns)) {

        ## obtain the column.
        column = columns[i]

        ## stop verification process early if column is not present in the dataframe.
        if (!(column %in% data.frame.columns)) {
            error.message = sprintf("Column '%s' is missing from the dataframe", column)
            logging::logerror(error.message)
            stop(error.message)
        }

        if (!is.null(data.types)) {

            ## obtain the datatype that should be present in the dataframe column
            ## which is currently under investigation.
            expected.type = data.types[i]

            ## obtain the classes of the current column in the dataframe. This is done before the
            ## case distinction below, as both branches need it for their error messages. As a
            ## column may have several classes, they are collapsed into one string for reporting
            ## to obtain exactly one error message.
            received.type = class(data[[column]])
            received.type.string = paste(received.type, collapse = ", ")

            ## necessary case distinction for special case list where calling \code{base::class}
            ## removes information about the listing property
            if (expected.type == "list()") {

                ## column is not a list
                if (!is.list(data[[column]])) {
                    error.message = sprintf("Column '%s' is expected to be a list but it is '%s'",
                                            column, received.type.string)
                    logging::logerror(error.message)
                    stop(error.message)
                }

            } else {

                ## stop verification process early if column type in the dataframe is not matching
                ## the expected datatype.
                if (!(expected.type %in% received.type)) {
                    error.message = sprintf("Column '%s' has type '%s' in dataframe, expected '%s'",
                                            column, received.type.string, expected.type)
                    logging::logerror(error.message)
                    stop(error.message)
                }
            }
        }
    }
}


## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Empty dataframe creation-------------------------------------------------
Expand Down

0 comments on commit d1d9a03

Please sign in to comment.