Skip to content

Commit

Permalink
Merge pull request #231 from MaLoefUDS/dev
Browse files Browse the repository at this point in the history
Add input validation for setter and read functions

Reviewed-by: Thomas Bock <bockthom@cs.uni-saarland.de>
Reviewed-by: Christian Hechtl <hechtl@cs.uni-saarland.de>
  • Loading branch information
hechtlC authored Jan 18, 2023
2 parents 9c85fcd + 38c2077 commit 3c452ca
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 20 deletions.
14 changes: 14 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,20 @@

# coronet – Changelog

## 4.3

### Added

- Add function `verify.data.frame.columns` to check that a dataframe includes all required columns, optionally with a specified datatype (PR #231, d1d9a039f50480ec5b442dc7e8b518648d1f9d9d).

### Changed/Improved

- Include structural verification to almost all functions that read dataframes from files or set a dataframe (setter-functions) (PR #231, b7a95881da72ccaa548c6cd5d94bd558a25caa6f).

### Fixed

- Fix check for empty input-files in utility read functions. Compared to unpresent files, empty files do not throw an error when reading them, a check for `nrow(commit.data) < 1` is therefore required (PR #231, ecfa643cbc15975c3062af95c50ead02730b580f).

## 4.2

### Added
Expand Down
76 changes: 76 additions & 0 deletions tests/test-misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
## Copyright 2017 by Felix Prasse <prassefe@fim.uni-passau.de>
## Copyright 2017-2018 by Claus Hunsen <hunsen@fim.uni-passau.de>
## Copyright 2017-2018 by Thomas Bock <bockthom@fim.uni-passau.de>
## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
## All Rights Reserved.


Expand Down Expand Up @@ -107,6 +108,81 @@ test_that("Match argument or take default.", {
expect_equal(actual.result, expected.result, info = "Multiple choices with ignored default, two choices")
})

##
## Check presence and datatype of data frame columns.
##

test_that("Check presence and datatype of data frame columns.", {

user.names = c("John", "Peter", "Maria", "Susanne")

## contains NaN to verify functionality does not break
age = c(42, 50, NaN, 66)

## contains NA to verify functionality does not break
is.male = c(TRUE, TRUE, FALSE, NA)

## construct simple testing dataframe
data.frame = data.frame(user.names, age, is.male)

## 1) Check base functionality (benign use-case)
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age", "is.male"), c("character", "numeric", "logical")),
NA,
message = "All columns present and well-typed.")
## Expect no error

## 2) Base test with reordered columns
expect_error(verify.data.frame.columns(
data.frame, c("is.male", "age", "user.names"), c("logical", "numeric", "character")),
NA,
message = "Order of columns does not matter.")
## Expect no error

## 3) Specify less columns than present (Allow optional columns)
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age"), c("character", "numeric")),
NA,
message = "Optional columns are allowed.")
## Expect no error

## 4) Unequal amount of column names and datatypes
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age", "is.male"), c("character", "numeric")),
message = "More column names specified than datatypes.")
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age"), c("character", "numeric", "logical")),
message = "More column names specified than datatypes.")

## 5) Datatypes do not match column names
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age", "is.male"), c("logical", "character", "numeric")),
message = "Column names do not match datatypes.")

## 6) Invalid column / Column not present in dataframe (Typo)
expect_error(verify.data.frame.columns(
data.frame, c("user.name"), c("character")),
message = "Column name 'user.name' should not be in dataframe.")

## 7) No datatypes specified and column names are present
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age", "is.male")),
NA,
message = "Column names do not match datatypes.")
## Expect no error

## 8) No datatypes specified and column names are not specified correctly (Typo)
expect_error(verify.data.frame.columns(
data.frame, c("user.name")),
message = "Column name 'user.name' should not be in dataframe.")

## 9) Too many column names and no datatypes specified
expect_error(verify.data.frame.columns(
data.frame, c("user.names", "age", "is.male", "job.orientation")),
message = "More column names specififed than present in the dataframe.")

})

## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Date handling -----------------------------------------------------------

Expand Down
5 changes: 3 additions & 2 deletions tests/test-read.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
## Copyright 2021 by Johannes Hostert <s8johost@stud.uni-saarland.de>
## Copyright 2021 by Mirabdulla Yusifli <s8miyusi@stud.uni-saarland.de>
## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
## All Rights Reserved.


Expand Down Expand Up @@ -129,7 +130,7 @@ test_that("Read the raw commit data with the file artifact.", {
artifact = c("test.c", "test.c", "test2.c", "test3.c", UNTRACKED.FILE, UNTRACKED.FILE, "test2.c"),
artifact.type = c("File", "File", "File", "File", UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE,
UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "File"),
artifact.diff.size = c(1, 1, 1, 1, 0, 0, 1))
artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1)))

## check the results
expect_identical(commit.data.read, commit.data.expected, info = "Raw commit data.")
Expand Down Expand Up @@ -243,7 +244,7 @@ test_that("Read the author data.", {

## build the expected data.frame
author.data.expected = data.frame(
author.id = as.integer(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)),
author.id = as.character(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)),
author.name = c("Thomas", "Olaf", "Björn", "udo", "Fritz fritz@example.org", "georg", "Hans", "Karl", "Max"),
author.email = c("thomas@example.org", "olaf@example.org", "bjoern@example.org", "udo@example.org",
"asd@sample.org", "heinz@example.org", "hans1@example.org", "karl@example.org", "max@example.org"),
Expand Down
35 changes: 35 additions & 0 deletions util-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
## Copyright 2021 by Johannes Hostert <s8johost@stud.uni-saarland.de>
## Copyright 2021 by Mirabdulla Yusifli <s8miyusi@stud.uni-saarland.de>
## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
## All Rights Reserved.


Expand Down Expand Up @@ -1064,6 +1065,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(commit.data)) {
commit.data = create.empty.commits.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(commit.data, COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)
}

## store commit data
Expand Down Expand Up @@ -1145,6 +1149,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(data)) {
data = create.empty.commit.message.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)
}

## set the actual data
Expand Down Expand Up @@ -1214,6 +1221,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(data)) {
data = create.empty.synchronicity.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)
}

## set the actual data
Expand Down Expand Up @@ -1287,6 +1297,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(data)) {
data = create.empty.pasta.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, PASTA.LIST.COLUMNS, PASTA.LIST.DATA.TYPES)
}

## set the actual data
Expand Down Expand Up @@ -1368,6 +1381,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(data)) {
data = create.empty.gender.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES)
}

## set the actual data
Expand Down Expand Up @@ -1444,6 +1460,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(mail.data)) {
mail.data = create.empty.mails.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(mail.data, MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)
}

## store mail data
Expand Down Expand Up @@ -1502,6 +1521,14 @@ ProjectData = R6::R6Class("ProjectData",
set.authors = function(data) {
logging::loginfo("Setting author data.")
private$authors = data

if (is.null(data)) {
data = create.empty.authors.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)
}

## add gender data if wanted
if (private$project.conf$get.value("gender")) {

Expand Down Expand Up @@ -1606,6 +1633,9 @@ ProjectData = R6::R6Class("ProjectData",

if (is.null(data)) {
data = create.empty.issues.list()
} else {
## check that dataframe is of correct shape
verify.data.frame.columns(data, ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)
}

private$issues.unfiltered = data
Expand Down Expand Up @@ -2129,6 +2159,11 @@ ProjectData = R6::R6Class("ProjectData",
#'
#' @param custom.event.timestamps the list of timestamps to set
set.custom.event.timestamps = function(custom.event.timestamps) {
if (!is.list(custom.event.timestamps)) {
error.message = sprintf("set.custom.event.timestamps: Input is expected to be a list.")
logging::logerror(error.message)
stop(error.message)
}
if(length(custom.event.timestamps) != 0){
private$custom.event.timestamps = custom.event.timestamps[
order(unlist(get.date.from.string(custom.event.timestamps)))
Expand Down
74 changes: 74 additions & 0 deletions util-misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
## Copyright 2018-2019 by Jakob Kronawitter <kronawij@fim.uni-passau.de>
## Copyright 2021 by Niklas Schneider <s8nlschn@stud.uni-saarland.de>
## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
## All Rights Reserved.


Expand Down Expand Up @@ -139,6 +140,79 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE
}
}

#' Check if a dataframe matches a given structure. This includes the dataframe to contain columns
#' which must match the column names in \code{columns} and the datatypes in \code{data.types}.
#'
#' @param data the data frame under investigation for structural conformity
#' @param columns a character vector containing the column names the data frame should include
#' @param data.types an ordered vector containing the data types corresponding to the columns.
#' If this parameter is \code{NULL} only the existence of \code{columns} is checked
#' without regarding column types. Otherwise this vector must be of the
#' same length as the vector of \code{columns}
#' [default: NULL]
verify.data.frame.columns = function(data, columns, data.types = NULL) {

## every column of the data frame must be one to one mapped to a datatype expected in the column
## therefore if there aren't as many datatypes provided in \code{data.types} as column names have
## been provided in \code{columns} we can stop here already
if (!is.null(data.types) && length(columns) != length(data.types)) {
error.message = sprintf(paste("If specified, the length of the two given vectors",
"'columns' and 'data.types' must match."))
logging::logerror(error.message)
stop(error.message)
}

## obtain vector of all column names included in the data frame to ease further checks
data.frame.columns = colnames(data)

## iterate over all columns in \code{columns}
for (i in seq_along(columns)) {

## obtain the column.
column = columns[i]

## stop verification process early if column is not present in the data frame
if (!(column %in% data.frame.columns)) {
error.message = sprintf("Column '%s' is missing from the dataframe", column)
logging::logerror(error.message)
stop(error.message)
}

if (!is.null(data.types)) {

## obtain the datatype that should be present in the data frame column
## which is currently under investigation
expected.type = data.types[i]

## necessary case distinction for special case list where calling \code{base::class}
## removes the information whether or not \code{data[[column]]} is a list
if (expected.type == "list()") {

## column is not a list
if (!is.list(data[[column]])) {
error.message = sprintf("Column '%s' is expected to be a list but is '%s'",
column, class(received.type))
logging::logerror(error.message)
stop(error.message)
}

} else {
## obtain the datatype that elements of the current column hold in the data frame
received.type = class(data[[column]])

## stop verification process early if column type in the data frame is not matching
## the expected datatype
if (!(expected.type %in% received.type)) {
error.message = sprintf("Column '%s' has type '%s' in dataframe, expected '%s'",
column, received.type, expected.type)
logging::logerror(error.message)
stop(error.message)
}
}
}
}
}


## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
## Empty dataframe creation-------------------------------------------------
Expand Down
Loading

0 comments on commit 3c452ca

Please sign in to comment.