Merge pull request #231 from MaLoefUDS/dev

Add input validation for setter and read functions Reviewed-by: Thomas Bock <bockthom@cs.uni-saarland.de> Reviewed-by: Christian Hechtl <hechtl@cs.uni-saarland.de>
se-sic · Jan 18, 2023 · 3c452ca · 3c452ca
2 parents 9c85fcd + 38c2077
commit 3c452ca
Show file tree

Hide file tree

Showing 6 changed files with 267 additions and 20 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,20 @@
 
 # coronet – Changelog
 
+## 4.3
+
+### Added
+
+- Add function `verify.data.frame.columns` to check that a dataframe includes all required columns, optionally with a specified datatype (PR #231, d1d9a039f50480ec5b442dc7e8b518648d1f9d9d).
+
+### Changed/Improved
+
+- Include structural verification to almost all functions that read dataframes from files or set a dataframe (setter-functions) (PR #231, b7a95881da72ccaa548c6cd5d94bd558a25caa6f).
+
+### Fixed
+
+- Fix check for empty input-files in utility read functions. Compared to unpresent files, empty files do not throw an error when reading them, a check for `nrow(commit.data) < 1` is therefore required (PR #231, ecfa643cbc15975c3062af95c50ead02730b580f).
+
 ## 4.2
 
 ### Added

diff --git a/tests/test-misc.R b/tests/test-misc.R
@@ -14,6 +14,7 @@
 ## Copyright 2017 by Felix Prasse <prassefe@fim.uni-passau.de>
 ## Copyright 2017-2018 by Claus Hunsen <hunsen@fim.uni-passau.de>
 ## Copyright 2017-2018 by Thomas Bock <bockthom@fim.uni-passau.de>
+## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
 ## All Rights Reserved.
 
 
@@ -107,6 +108,81 @@ test_that("Match argument or take default.", {
     expect_equal(actual.result, expected.result, info = "Multiple choices with ignored default, two choices")
 })
 
+##
+## Check presence and datatype of data frame columns.
+##
+
+test_that("Check presence and datatype of data frame columns.", {
+
+    user.names = c("John", "Peter", "Maria", "Susanne")
+
+    ## contains NaN to verify functionality does not break
+    age = c(42, 50, NaN, 66)
+
+    ## contains NA to verify functionality does not break
+    is.male = c(TRUE, TRUE, FALSE, NA)
+
+    ## construct simple testing dataframe
+    data.frame = data.frame(user.names, age, is.male)
+
+    ## 1) Check base functionality (benign use-case)
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age", "is.male"), c("character", "numeric", "logical")),
+        NA,
+        message = "All columns present and well-typed.")
+    ## Expect no error
+
+    ## 2) Base test with reordered columns
+    expect_error(verify.data.frame.columns(
+        data.frame, c("is.male", "age", "user.names"), c("logical", "numeric", "character")),
+        NA,
+        message = "Order of columns does not matter.")
+    ## Expect no error
+
+    ## 3) Specify less columns than present (Allow optional columns)
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age"), c("character", "numeric")),
+        NA,
+        message = "Optional columns are allowed.")
+    ## Expect no error
+
+    ## 4) Unequal amount of column names and datatypes
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age", "is.male"), c("character", "numeric")),
+        message = "More column names specified than datatypes.")
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age"), c("character", "numeric", "logical")),
+        message = "More column names specified than datatypes.")
+
+    ## 5) Datatypes do not match column names
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age", "is.male"), c("logical", "character", "numeric")),
+        message = "Column names do not match datatypes.")
+
+    ## 6) Invalid column / Column not present in dataframe (Typo)
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.name"), c("character")),
+        message = "Column name 'user.name' should not be in dataframe.")
+
+    ## 7) No datatypes specified and column names are present
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age", "is.male")),
+        NA,
+        message = "Column names do not match datatypes.")
+    ## Expect no error
+
+    ## 8) No datatypes specified and column names are not specified correctly (Typo)
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.name")),
+        message = "Column name 'user.name' should not be in dataframe.")
+
+    ## 9) Too many column names and no datatypes specified
+    expect_error(verify.data.frame.columns(
+        data.frame, c("user.names", "age", "is.male", "job.orientation")),
+        message = "More column names specififed than present in the dataframe.")
+
+})
+
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Date handling -----------------------------------------------------------
 

diff --git a/tests/test-read.R b/tests/test-read.R
@@ -21,6 +21,7 @@
 ## Copyright 2021 by Johannes Hostert <s8johost@stud.uni-saarland.de>
 ## Copyright 2021 by Mirabdulla Yusifli <s8miyusi@stud.uni-saarland.de>
 ## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
+## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
 ## All Rights Reserved.
 
 
@@ -129,7 +130,7 @@ test_that("Read the raw commit data with the file artifact.", {
                                       artifact = c("test.c", "test.c", "test2.c", "test3.c", UNTRACKED.FILE, UNTRACKED.FILE, "test2.c"),
                                       artifact.type = c("File", "File", "File", "File", UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE,
                                                         UNTRACKED.FILE.EMPTY.ARTIFACT.TYPE, "File"),
-                                      artifact.diff.size = c(1, 1, 1, 1, 0, 0, 1))
+                                      artifact.diff.size = as.integer(c(1, 1, 1, 1, 0, 0, 1)))
 
     ## check the results
     expect_identical(commit.data.read, commit.data.expected, info = "Raw commit data.")
@@ -243,7 +244,7 @@ test_that("Read the author data.", {
 
     ## build the expected data.frame
     author.data.expected = data.frame(
-        author.id = as.integer(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)),
+        author.id = as.character(c(4936, 4937, 4938, 4939, 4940, 4941, 4942, 4943, 4944)),
         author.name = c("Thomas", "Olaf", "Björn", "udo", "Fritz fritz@example.org", "georg", "Hans", "Karl", "Max"),
         author.email = c("thomas@example.org", "olaf@example.org", "bjoern@example.org", "udo@example.org",
                          "asd@sample.org", "heinz@example.org", "hans1@example.org", "karl@example.org", "max@example.org"),

diff --git a/util-data.R b/util-data.R
@@ -25,6 +25,7 @@
 ## Copyright 2021 by Johannes Hostert <s8johost@stud.uni-saarland.de>
 ## Copyright 2021 by Mirabdulla Yusifli <s8miyusi@stud.uni-saarland.de>
 ## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
+## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
 ## All Rights Reserved.
 
 
@@ -1064,6 +1065,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(commit.data)) {
                 commit.data = create.empty.commits.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(commit.data, COMMITS.LIST.COLUMNS, COMMITS.LIST.DATA.TYPES)
             }
 
             ## store commit data
@@ -1145,6 +1149,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(data)) {
                 data = create.empty.commit.message.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, COMMIT.MESSAGE.LIST.COLUMNS, COMMIT.MESSAGE.LIST.DATA.TYPES)
             }
 
             ## set the actual data
@@ -1214,6 +1221,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(data)) {
                 data = create.empty.synchronicity.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, SYNCHRONICITY.LIST.COLUMNS, SYNCHRONICITY.LIST.DATA.TYPES)
             }
 
             ## set the actual data
@@ -1287,6 +1297,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(data)) {
                 data = create.empty.pasta.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, PASTA.LIST.COLUMNS, PASTA.LIST.DATA.TYPES)
             }
 
             ## set the actual data
@@ -1368,6 +1381,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(data)) {
                 data = create.empty.gender.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES)
             }
 
             ## set the actual data
@@ -1444,6 +1460,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(mail.data)) {
                 mail.data = create.empty.mails.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(mail.data, MAILS.LIST.COLUMNS, MAILS.LIST.DATA.TYPES)
             }
 
             ## store mail data
@@ -1502,6 +1521,14 @@ ProjectData = R6::R6Class("ProjectData",
         set.authors = function(data) {
             logging::loginfo("Setting author data.")
             private$authors = data
+
+            if (is.null(data)) {
+                data = create.empty.authors.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, AUTHORS.LIST.COLUMNS, AUTHORS.LIST.DATA.TYPES)
+            }
+
             ## add gender data if wanted
             if (private$project.conf$get.value("gender")) {
 
@@ -1606,6 +1633,9 @@ ProjectData = R6::R6Class("ProjectData",
 
             if (is.null(data)) {
                 data = create.empty.issues.list()
+            } else {
+                ## check that dataframe is of correct shape
+                verify.data.frame.columns(data, ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES)
             }
 
             private$issues.unfiltered = data
@@ -2129,6 +2159,11 @@ ProjectData = R6::R6Class("ProjectData",
         #'
         #' @param  custom.event.timestamps the list of timestamps to set
         set.custom.event.timestamps = function(custom.event.timestamps) {
+            if (!is.list(custom.event.timestamps)) {
+                error.message = sprintf("set.custom.event.timestamps: Input is expected to be a list.")
+                logging::logerror(error.message)
+                stop(error.message)
+            }
             if(length(custom.event.timestamps) != 0){
                 private$custom.event.timestamps = custom.event.timestamps[
                     order(unlist(get.date.from.string(custom.event.timestamps)))

diff --git a/util-misc.R b/util-misc.R
@@ -20,6 +20,7 @@
 ## Copyright 2018-2019 by Jakob Kronawitter <kronawij@fim.uni-passau.de>
 ## Copyright 2021 by Niklas Schneider <s8nlschn@stud.uni-saarland.de>
 ## Copyright 2022 by Jonathan Baumann <joba00002@stud.uni-saarland.de>
+## Copyright 2022-2023 by Maximilian Löffler <s8maloef@stud.uni-saarland.de>
 ## All Rights Reserved.
 
 
@@ -139,6 +140,79 @@ match.arg.or.default = function(arg, choices, default = NULL, several.ok = FALSE
     }
 }
 
+#' Check if a dataframe matches a given structure. This includes the dataframe to contain columns
+#' which must match the column names in \code{columns} and the datatypes in \code{data.types}.
+#'
+#' @param data the data frame under investigation for structural conformity
+#' @param columns a character vector containing the column names the data frame should include
+#' @param data.types an ordered vector containing the data types corresponding to the columns.
+#'                   If this parameter is \code{NULL} only the existence of \code{columns} is checked
+#'                   without regarding column types. Otherwise this vector must be of the
+#'                   same length as the vector of \code{columns}
+#'                   [default: NULL]
+verify.data.frame.columns = function(data, columns, data.types = NULL) {
+
+    ## every column of the data frame must be one to one mapped to a datatype expected in the column
+    ## therefore if there aren't as many datatypes provided in \code{data.types} as column names have
+    ## been provided in \code{columns} we can stop here already
+    if (!is.null(data.types) && length(columns) != length(data.types)) {
+        error.message = sprintf(paste("If specified, the length of the two given vectors",
+                                "'columns' and 'data.types' must match."))
+        logging::logerror(error.message)
+        stop(error.message)
+    }
+
+    ## obtain vector of all column names included in the data frame to ease further checks
+    data.frame.columns = colnames(data)
+
+    ## iterate over all columns in \code{columns}
+    for (i in seq_along(columns)) {
+
+        ## obtain the column.
+        column = columns[i]
+
+        ## stop verification process early if column is not present in the data frame
+        if (!(column %in% data.frame.columns)) {
+            error.message = sprintf("Column '%s' is missing from the dataframe", column)
+            logging::logerror(error.message)
+            stop(error.message)
+        }
+
+        if (!is.null(data.types)) {
+
+            ## obtain the datatype that should be present in the data frame column
+            ## which is currently under investigation
+            expected.type = data.types[i]
+
+            ## necessary case distinction for special case list where calling \code{base::class}
+            ## removes the information whether or not \code{data[[column]]} is a list
+            if (expected.type == "list()") {
+
+                ## column is not a list
+                if (!is.list(data[[column]])) {
+                    error.message = sprintf("Column '%s' is expected to be a list but is '%s'",
+                                            column, class(received.type))
+                    logging::logerror(error.message)
+                    stop(error.message)
+                }
+
+            } else {
+                ## obtain the datatype that elements of the current column hold in the data frame
+                received.type = class(data[[column]])
+
+                ## stop verification process early if column type in the data frame is not matching
+                ## the expected datatype
+                if (!(expected.type %in% received.type)) {
+                    error.message = sprintf("Column '%s' has type '%s' in dataframe, expected '%s'",
+                                            column, received.type, expected.type)
+                    logging::logerror(error.message)
+                    stop(error.message)
+                }
+            }
+        }
+    }
+}
+
 
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Empty dataframe creation-------------------------------------------------