Skip to content

Commit

Permalink
Merge pull request #136 from RSGInc/132-clean-up-styling-in-code
Browse files Browse the repository at this point in the history
132 clean up styling in code
  • Loading branch information
erika-redding authored Feb 29, 2024
2 parents 9fc8f6f + de0c1a4 commit f4666a3
Show file tree
Hide file tree
Showing 69 changed files with 2,215 additions and 2,199 deletions.
6 changes: 3 additions & 3 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#' Test household travel survey dataset
#'
#'
#' @format ## `test_data`
#' A list containing the following datasets:
#'
#'
#' @source A list of export tables compiled from internal RSG database on 11/22/2023
"test_data"
#' Test household dataset
Expand Down Expand Up @@ -152,4 +152,4 @@
#' \item{value}{The numeric value of the variable}
#' \item{label}{What the numeric value of the variable represents}
#' }
"value_labels"
"value_labels"
87 changes: 47 additions & 40 deletions R/factorize_column.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,81 +34,88 @@
#' var_str = "income_detailed",
#' vals_df = value_labels,
#' extra_labels = "Missing",
#' value_label_colname = 'label'
#' value_label_colname = "label"
#' )
#'
#' @export factorize_column
factorize_column = function(
x,
var_str,
vals_df,
variable_colname = 'variable',
value_colname = 'value',
value_label_colname = 'value_label',
value_order_colname = 'value',
extra_labels = NULL,
add_na = TRUE
) {
x,
var_str,
vals_df,
variable_colname = "variable",
value_colname = "value",
value_label_colname = "value_label",
value_order_colname = "value",
extra_labels = NULL,
add_na = TRUE) {
vals_df = data.table::data.table(vals_df)

# sort the vals_df to ensure the ordered factor is ordered correctly
vals_df = vals_df[with(vals_df,
order(get(variable_colname),
get(value_order_colname),
get(value_colname),
get(value_label_colname))), ]
vals_df = vals_df[with(
vals_df,
order(
get(variable_colname),
get(value_order_colname),
get(value_colname),
get(value_label_colname)
)
), ]

# select levels and labels for this variable
levels = vals_df[get(variable_colname) == var_str, get(value_colname)]
labels = vals_df[get(variable_colname) == var_str, get(value_label_colname)]
levels = vals_df[get(variable_colname) == var_str, get(value_colname)]
labels = vals_df[get(variable_colname) == var_str, get(value_label_colname)]

# add extra labels, if provided
if ( !missing(extra_labels) ) {
levels = c(levels, vals_df[get(variable_colname) %in% extra_labels, get(value_colname)])
labels = c(labels, vals_df[get(variable_colname) %in% extra_labels, get(value_label_colname)])
if (!missing(extra_labels)) {
levels = c(levels, vals_df[get(variable_colname) %in% extra_labels, get(value_colname)])
labels = c(labels, vals_df[get(variable_colname) %in% extra_labels, get(value_label_colname)])
}

# only apply labels if there are labels specific to this variable
if (var_str %in% unique(vals_df[, get(variable_colname)])) {

#perform QC checks before factorizing
# perform QC checks before factorizing

# check for duplicate labels (gets a warning)
if ( any(duplicated(labels)) ) {
warning('Duplicated labels in variable "', var_str, '". Labels: ',
paste(labels, collapse = '; '))
}
if (any(duplicated(labels))) {
warning(
'Duplicated labels in variable "', var_str, '". Labels: ',
paste(labels, collapse = "; ")
)
}

# check for duplicate values (or "levels") (gets a warning)
if ( any(duplicated(levels)) ) {
warning('Duplicated values/levels in variable "', var_str, '". Values: ',
paste(levels, collapse = '; '))
}
if (any(duplicated(levels))) {
warning(
'Duplicated values/levels in variable "', var_str, '". Values: ',
paste(levels, collapse = "; ")
)
}

# check for missing levels (except NA) (gets a warning)
if ( any(!(unique(x) %in% c(levels, NA))) ) {
if (any(!(unique(x) %in% c(levels, NA)))) {
missing_levels = unique(x)[!unique(x) %in% c(levels, NA)]

levels = c(levels, missing_levels)
labels = c(labels, extra_labels)

warning('Missing labels in variable "', var_str, '". Values missing labels: ',
paste(missing_levels, collapse = '; '))
}
warning(
'Missing labels in variable "', var_str, '". Values missing labels: ',
paste(missing_levels, collapse = "; ")
)
}

# TODO add checks after making variable a factor to ensure it's right?

y = factor(x, levels = levels, labels = labels, ordered = TRUE)

# if NAs are desired.
if (add_na) {y = addNA(y)}
#if (add_na) {forcats::fct_explicit_na(y, na_level = "(Missing - NA)")}

if (add_na) {
y = addNA(y)
}
# if (add_na) {forcats::fct_explicit_na(y, na_level = "(Missing - NA)")}
} else {

# if there are no labels in the codebook for this variable, it is returned unchanged.
y = x
}
return(invisible(y))
}

39 changes: 19 additions & 20 deletions R/factorize_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,44 +23,43 @@
#' hh_labeled = factorize_df(
#' df = hh,
#' vals_df = value_labels,
#' value_label_colname = 'label',
#' value_label_colname = "label",
#' extra_labels = c("Missing")
#' )
#'
#' @export factorize_df
factorize_df <- function(df, vals_df, verbose = TRUE, ...) {

if ( 'data.table' %in% class(df) ){
factorize_df = function(df, vals_df, verbose = TRUE, ...) {
if ("data.table" %in% class(df)) {
df_is_dt = TRUE
df_labeled <- as.data.frame(df) # If df is a data.table, R crashes in Rstudio
df_labeled = as.data.frame(df) # If df is a data.table, R crashes in Rstudio
# on.exit(expr=data.table(df_labeled))
} else {
df_is_dt = FALSE
}

for (i in 1:ncol(df)) {
var_str = names(df)[i]

var_str <- names(df)[i]

df_labeled[[var_str]] <- factorize_column(
x=df[[var_str]],
var_str=var_str,
vals_df=vals_df,
...)
}
df_labeled[[var_str]] = factorize_column(
x = df[[var_str]],
var_str = var_str,
vals_df = vals_df,
...
)
}

if ( verbose ){
if (verbose) {
# print which vars are labeled and unlabeled
labeled_vars_in_df <- sort(colnames(df)[ (colnames(df) %in% vals_df$variable) ])
unlabeled_vars_in_df <- sort(colnames(df)[!(colnames(df) %in% vals_df$variable) ])
labeled_vars_in_df = sort(colnames(df)[(colnames(df) %in% vals_df$variable)])
unlabeled_vars_in_df = sort(colnames(df)[!(colnames(df) %in% vals_df$variable)])

message("\n Labeled vars: ")
message(paste(sprintf("- %s", labeled_vars_in_df), collapse= '\n'), '\n')
message(paste(sprintf("- %s", labeled_vars_in_df), collapse = "\n"), "\n")
message("Unlabeled vars: ")
message(paste(sprintf("- %s",unlabeled_vars_in_df), collapse='\n'))
message(paste(sprintf("- %s", unlabeled_vars_in_df), collapse = "\n"))
}
if ( df_is_dt ){
df_labeled = data.table::data.table(df_labeled)
if (df_is_dt) {
df_labeled = data.table::data.table(df_labeled)
}
return(df_labeled)
}
77 changes: 38 additions & 39 deletions R/get_distance_meters.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,45 +8,44 @@
#' @param radius Radius of the sphere to use for haversine calculation (Defaults to meters)
#' @return A vector of distances in meters
#' @export get_distance_meters
#'
#' @examples
#'
#' @examples
#' get_distance_meters(c(38.8734, -75.2394), c(40.3497, -76.2314))
#'
#'
get_distance_meters =
function(
location_1,
location_2,
radius = 6378137) {

# convert to matrix if not already a matrix
location_1 = matrix(location_1, ncol = 2)
location_2 = matrix(location_2, ncol = 2)

# do some checks on inputs
# longitudes should be numeric and between -180 & 180
stopifnot(is.numeric(location_1[, 1]))
stopifnot(is.numeric(location_2[, 1]))
stopifnot(location_1[!is.na(location_1[, 1]), 1] %between% c(-180, 180))
stopifnot(location_2[!is.na(location_2[, 1]), 1] %between% c(-180, 180))

# latitudes should be numeric and between -90 & 90
stopifnot(is.numeric(location_1[, 2]))
stopifnot(is.numeric(location_2[, 2]))
stopifnot(location_1[!is.na(location_1[, 2]), 2] %between% c(-90, 90))
stopifnot(location_2[!is.na(location_2[, 2]), 2] %between% c(-90, 90))

lon_1 = location_1[, 1] * pi / 180 # converts to radian
lon_2 = location_2[, 1] * pi / 180

lat_1 = location_1[, 2] * pi / 180
lat_2 = location_2[, 2] * pi / 180

dLat = lat_2 - lat_1
dLon = lon_2 - lon_1

a = sin(dLat / 2) ^ 2 + cos(lat_1) * cos(lat_2) * sin(dLon / 2) ^ 2
a = pmin(a, 1)
dist = 2 * atan2(sqrt(a), sqrt(1 - a)) * radius

return(dist)
}
location_1,
location_2,
radius = 6378137) {
# convert to matrix if not already a matrix
location_1 = matrix(location_1, ncol = 2)
location_2 = matrix(location_2, ncol = 2)

# do some checks on inputs
# longitudes should be numeric and between -180 & 180
stopifnot(is.numeric(location_1[, 1]))
stopifnot(is.numeric(location_2[, 1]))
stopifnot(location_1[!is.na(location_1[, 1]), 1] %between% c(-180, 180))
stopifnot(location_2[!is.na(location_2[, 1]), 1] %between% c(-180, 180))

# latitudes should be numeric and between -90 & 90
stopifnot(is.numeric(location_1[, 2]))
stopifnot(is.numeric(location_2[, 2]))
stopifnot(location_1[!is.na(location_1[, 2]), 2] %between% c(-90, 90))
stopifnot(location_2[!is.na(location_2[, 2]), 2] %between% c(-90, 90))

lon_1 = location_1[, 1] * pi / 180 # converts to radian
lon_2 = location_2[, 1] * pi / 180

lat_1 = location_1[, 2] * pi / 180
lat_2 = location_2[, 2] * pi / 180

dLat = lat_2 - lat_1
dLon = lon_2 - lon_1

a = sin(dLat / 2)^2 + cos(lat_1) * cos(lat_2) * sin(dLon / 2)^2
a = pmin(a, 1)
dist = 2 * atan2(sqrt(a), sqrt(1 - a)) * radius

return(dist)
}
Loading

0 comments on commit f4666a3

Please sign in to comment.