se-sic · bockthom · Dec 21, 2021 · Oct 26, 2021 · Oct 26, 2021 · Oct 26, 2021
diff --git a/util-read.R b/util-read.R
@@ -21,6 +21,7 @@
 ## Copyright 2018-2019 by Anselm Fehnker <fehnker@fim.uni-passau.de>
 ## Copyright 2020-2021 by Niklas Schneider <s8nlschn@stud.uni-saarland.de>
 ## Copyright 2021 by Johannes Hostert <s8johost@stud.uni-saarland.de>
+## Copyright 2021 by Mirabdulla Yusifli <s8miyusi@stud.uni-saarland.de>
 ## All Rights Reserved.
 
 ## Note:
@@ -387,6 +388,64 @@ read.issues = function(data.path, issues.sources = c("jira", "github")) {
 }
 
 
+## * Gender data ------------------------------------------------------------
+
+## Names of the columns of a data frame that holds gender data, i.e., the name
+## of an author and the gender that is assigned to this author (see function
+## \code{read.gender}).
+GENDER.LIST.COLUMNS = c("author.name", "gender")
+
+## Data type of each column named in the constant 'GENDER.LIST.COLUMNS',
+## listed in the same order as the column names, so both constants need to be
+## adjusted together.
+GENDER.LIST.DATA.TYPES = c("character", "character")
+
+#' Read and parse the gender data from the file 'gender' located in \code{data.path}.
+#' Each non-empty line of the file has the form: author.name;gender
+#' The parsed form is a data frame with the columns 'author.name' and 'gender'.
+#'
+#' @param data.path the path to the directory containing the 'gender' file
+#'
+#' @return the read and parsed gender data as a data frame of character columns; an empty
+#'         data frame (see \code{create.empty.gender.list}) if no gender data can be read
+read.gender = function(data.path) {
+    ## constant for separating key (author name) and value (gender)
+    SEPARATOR = ";"
+
+    ## get file name of gender data
+    filepath = file.path(data.path, "gender")
+
+    ## read data from disk [file can be missing or empty]
+    lines = suppressWarnings(try(readLines(filepath), silent = TRUE))
+
+    ## drop empty lines as they do not carry any data
+    if (!inherits(lines, "try-error")) {
+        lines = lines[nzchar(lines)]
+    }
+
+    ## handle the case that the file cannot be read or does not contain any data
+    if (inherits(lines, "try-error") || length(lines) == 0) {
+        logging::logwarn("There are no gender data available for the current environment.")
+        logging::logwarn("Datapath: %s", data.path)
+        return(create.empty.gender.list())
+    }
+
+    ## split each line literally at the separator (no regular-expression matching):
+    ## 1) the first field is the key
+    ## 2) the second field is the value (NA if the line does not contain a value)
+    fields = strsplit(lines, SEPARATOR, fixed = TRUE)
+
+    ## transform data to a data frame with character columns, independent of the R version
+    result.df = data.frame(
+        author.name = vapply(fields, function(field) field[1], character(1)),
+        gender = vapply(fields, function(field) field[2], character(1)),
+        stringsAsFactors = FALSE
+    )
+
+    logging::logdebug("read.gender: finished.")
+    return(result.df)
+}
+
 #' Create an empty dataframe which has the same shape as a dataframe containing issues. The dataframe has the column
 #' names and column datatypes defined in \code{ISSUES.LIST.COLUMNS} and \code{ISSUES.LIST.DATA.TYPES}, respectively.
 #'
@@ -395,6 +454,14 @@ create.empty.issues.list = function() {
     return (create.empty.data.frame(ISSUES.LIST.COLUMNS, ISSUES.LIST.DATA.TYPES))
 }
 
+#' Build an empty dataframe that is shaped like a dataframe of gender data: its columns are
+#' named by \code{GENDER.LIST.COLUMNS} and typed by \code{GENDER.LIST.DATA.TYPES}.
+#'
+#' @return the empty gender dataframe
+create.empty.gender.list = function() {
+    empty.gender.data = create.empty.data.frame(GENDER.LIST.COLUMNS, GENDER.LIST.DATA.TYPES)
+    return(empty.gender.data)
+}
 
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Additional data sources -------------------------------------------------