Skip to content

Commit

Permalink
resolve conflicts Merge remote-tracking branch 'origin/main' into 49-…
Browse files Browse the repository at this point in the history
…create-unit-tests
  • Loading branch information
erika-redding committed Jan 18, 2024
2 parents f6c2959 + d2ece0c commit c4404a8
Show file tree
Hide file tree
Showing 22 changed files with 778 additions and 623 deletions.
42 changes: 25 additions & 17 deletions R/hts_prep_byvar.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,25 +32,25 @@ hts_prep_byvar = function(summarize_by = NULL,
variables_dt = variables_list,
hts_data,
...) {

# For each variables in trip table:
byvar_dt_ls = list()

for (b in seq_along(summarize_by)) {
byvar = summarize_by[[b]]

byvar_loc = hts_find_var(byvar, variables_dt = variables_dt)

byvar_dt_v = data.table::copy(hts_data[[byvar_loc]])

# Is this a shared variable?
byvar_is_shared = variables_dt[shared_name == byvar, is_checkbox][1] == 1

# Is this a numeric variable?
byvar_is_numeric = variables_dt[shared_name == summarize_by[[b]], data_type][[1]] == "numeric"

if (byvar_is_shared) {

byvar_dt_v =
hts_melt_vars(
shared_name = summarize_by[[b]],
Expand All @@ -62,30 +62,38 @@ hts_prep_byvar = function(summarize_by = NULL,
checkbox_label_sep = ":",
to_single_row = TRUE
)

}

if (byvar_is_numeric) {
byvar_dt_v = hts_bin_var(prepped_dt = byvar_dt_v,
numvar = byvar,
nbins = 7)
}

if (!byvar_is_shared) {
byvar_cols = c(hts_get_keycols(byvar_dt_v), byvar)

byvar_dt_v = byvar_dt_v[, byvar_cols, with=FALSE]

}


# Set keys for merging
# keycols = hts_get_keycols(byvar_dt_v)
#
# setkeyv(byvar_dt_v, keycols)

byvar_dt_ls[[b]] = byvar_dt_v

}


byvar_dt = Reduce(function(x, y)
merge(x, y, all.x = FALSE, all.y = FALSE),
merge(x, y, all.x = FALSE, all.y = FALSE, by = intersect(names(x),
names(y))
),
byvar_dt_ls)

return(byvar_dt[])
}

Expand Down
69 changes: 67 additions & 2 deletions R/hts_prep_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#' @param remove_outliers Whether to remove outliers for numeric variable. Default
#' is TRUE.
#' @param threshold Threshold to define outliers. Default is 0.975.
#' @param strataname Name of strata name to bring in. Default is NULL.
#'
#' @return List containing the categorical and numeric datasets of the summary
#' variables and key columns, and either whether the summarize variable is shared
Expand All @@ -37,14 +38,51 @@
#' 'day' = day,
#' 'trip' = trip,
#' 'vehicle' = vehicle))
#'
#'
#' hts_prep_data(summarize_var = 'employment',
#' summarize_by = c('age', 'race'),
#' variables_dt = variable_list,
#' data = list('hh' = hh,
#' 'person' = person,
#' 'day' = day,
#' 'trip' = trip,
#' 'vehicle' = vehicle))
hts_prep_data = function(summarize_var = NULL,
summarize_by = NULL,
variables_dt = variable_list,
data = hts_data,
weighted = TRUE,
remove_outliers = TRUE,
threshold = 0.975) {

threshold = 0.975,
remove_missing = TRUE,
missing_value = 995,
not_imputable = -1,
strataname = NULL) {
# tictoc::tic("Total Time")
# Message:
msg_pt1 = paste0("Creating a summary of ",
hts_find_var(summarize_var, variables_dt = variables_dt), " ", summarize_var)


if (!is.null(summarize_by)){

byvarlocs = lapply(summarize_by, hts_find_var)

for(b in 1:length(byvarlocs)) {
byvarlocs[b] = paste0(byvarlocs[[b]], " ", summarize_by[[b]])
}

byvarlocs = unlist(byvarlocs)

msg_pt2 = ifelse(length(summarize_by) > 0,
paste0("broken down by ",
paste0(byvarlocs, collapse = " and ")),
"")
} else {
msg_pt2 = NULL
}
message(paste0(msg_pt1, " ", msg_pt2))
# TODO: Could we put id and weight cols in a snippet or some such?
# Or in a settings/options for these functions?

Expand Down Expand Up @@ -172,7 +210,34 @@ hts_prep_data = function(summarize_var = NULL,


}
if (remove_missing){

hts_data = hts_remove_missing_data(hts_data = data,
variables_dt = variables_dt,
summarize_var = summarize_var,
summarize_by = summarize_by,
missing_value = missing_value,
not_imputable = not_imputable)
}

if (!is.null(strataname)) {

if(!is.null(cat_res)){

cat_res = hts_cbind_var(lhs_table = cat_res,
rhs_var = strataname,
variable_list = variables_dt)

}

if(!is.null(num_res)){

num_res = hts_cbind_var(lhs_table = num_res,
rhs_var = strataname,
variable_list = variables_dt)

}
}

prepped_dt_ls = list("cat" = cat_res,
"num" = num_res,
Expand Down
24 changes: 14 additions & 10 deletions R/hts_prep_triprate.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
#' is NULL.
#' @param variables_dt List of variable locations and descriptions in data.table
#' format.
#' @param tripdat Dataset of trips in data.table format.
#' @param daydat Dataset of days in data.table format.
#' @param trip_name Name of the trip dataset in hts_data.
#' @param day_name Name of the day dataset in hts_data.
#' @param remove_outliers Boolean whether or not to remove outliers from dataset.
#' Default is TRUE.
#' @param threshold Threshold to define outliers. Default is 0.975.
Expand All @@ -23,31 +23,34 @@
#' require(data.table)
#' require(stringr)
#' hts_prep_triprate(variables_dt = variable_list,
#' tripdat = trip,
#' daydat = day,
#' trip_name = 'trip',
#' day_name = 'day',
#' hts_data = list('hh' = hh,
#' 'person' = person,
#' 'day' = day,
#' 'trip' = trip,
#' 'vehicle' = vehicle))
#' hts_prep_triprate(summarize_by = 'age',
#' variables_dt = variable_list,
#' tripdat = trip,
#' daydat = day,
#' trip_name = 'trip',
#' day_name = 'day',
#' hts_data = list('hh' = hh,
#' 'person' = person,
#' 'day' = day,
#' 'trip' = trip,
#' 'vehicle' = vehicle))
hts_prep_triprate = function(summarize_by = NULL,
variables_dt = variable_list,
tripdat = trip,
daydat = day,
trip_name = 'trip',
day_name = 'day',
remove_outliers = TRUE,
threshold = 0.975,
weighted = TRUE,
hts_data) {

tripdat = hts_data[[trip_name]]
daydat = hts_data[[day_name]]

tripratekeys = c("hh_id", "person_id", "day_id")
trip_subset_cols = hts_get_keycols(tripdat)
day_subset_cols = hts_get_keycols(daydat)
Expand All @@ -70,8 +73,9 @@ hts_prep_triprate = function(summarize_by = NULL,
by = tripratekeys]
}

# FIXME: rename triprate_binned to num_trips?
if (!weighted) {
triprate_dt = tripdat[, .(triprate_binned = .N),
triprate_dt = tripdat[, .(num_trips = .N),
by = tripratekeys]
}

Expand Down Expand Up @@ -119,7 +123,7 @@ hts_prep_triprate = function(summarize_by = NULL,
}

if (!weighted) {
triprate_dt = triprate_dt[, .(triprate_binned = .N),
triprate_dt = triprate_dt[, .(num_trips = .N),
by = triprate_cols_all]
}

Expand Down
61 changes: 32 additions & 29 deletions R/hts_remove_missing_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,51 +33,54 @@ hts_remove_missing_data = function(hts_data,
summarize_by = NULL,
missing_value = 995,
not_imputable = -1){

summarize_var_loc = hts_find_var(summarize_var)

#get variable or first occurrence for checkbox
summarize_var_name = variables_dt[shared_name == summarize_var, variable][1]

summarize_var_tbl = hts_data[[summarize_var_loc]][
!get(summarize_var_name) %in% c(missing_value, not_imputable) |
is.na(get(summarize_var_name))]

summarize_var_id = hts_get_keycols(summarize_var_tbl,
ids = TRUE,
weights = FALSE,
priority = TRUE)

hts_data = hts_filter_data(
hts_data = hts_data,
ids = summarize_var_tbl[, get(summarize_var_id)],
id_type = stringr::str_remove(summarize_var_id, '_id')
)

if (!is.null(summarize_by)){

summarize_by_loc = hts_find_var(summarize_by)

#get variable or first occurrence for checkbox
summarize_by_name = variables_dt[shared_name == summarize_by, variable][1]

summarize_by_tbl = hts_data[[summarize_by_loc]][
!get(summarize_by_name) %in% c(missing_value, not_imputable) |
is.na(get(summarize_by_name))]

summarize_by_id = hts_get_keycols(summarize_by_tbl,
ids = TRUE,
weights = FALSE,
priority = TRUE)

hts_data = hts_filter_data(
hts_data = hts_data,
ids = summarize_by_tbl[, get(summarize_by_id)],
id_type = stringr::str_remove(summarize_by_id, '_id')
)



for (i in 1:length(summarize_by)){

summarize_by_loc = hts_find_var(summarize_by[i])

#get variable or first occurrence for checkbox
summarize_by_name = variables_dt[shared_name == summarize_by[i], variable][1]

summarize_by_tbl = hts_data[[summarize_by_loc]][
!get(summarize_by_name) %in% c(missing_value, not_imputable) |
is.na(get(summarize_by_name))]

summarize_by_id = hts_get_keycols(summarize_by_tbl,
ids = TRUE,
weights = FALSE,
priority = TRUE)

hts_data = hts_filter_data(
hts_data = hts_data,
ids = summarize_by_tbl[, get(summarize_by_id)],
id_type = stringr::str_remove(summarize_by_id, '_id')
)
}
}

return(hts_data)

}
Loading

0 comments on commit c4404a8

Please sign in to comment.