diff --git a/NAMESPACE b/NAMESPACE index 2728918..8ee2f50 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,7 +8,6 @@ export(hts_bin_var) export(hts_cbind_var) export(hts_filter_data) export(hts_find_var) -export(hts_get_keycols) export(hts_get_ns) export(hts_melt_vars) export(hts_prep_byvar) diff --git a/R/get_distance_meters.R b/R/get_distance_meters.R index 869fc70..6696e8e 100644 --- a/R/get_distance_meters.R +++ b/R/get_distance_meters.R @@ -9,7 +9,7 @@ #' @return A vector of distances in meters #' @export get_distance_meters #' -#' @example +#' @examples #' get_distance_meters(c(38.8734, -75.2394), c(40.3497, -76.2314)) #' get_distance_meters = diff --git a/R/hts_cbind_var.R b/R/hts_cbind_var.R index 279660a..b88772b 100644 --- a/R/hts_cbind_var.R +++ b/R/hts_cbind_var.R @@ -2,11 +2,14 @@ #' #' @param lhs_table Table to bind a column to in data.table format #' @param rhs_var Variable to bind to the lhs_table. +#' @param hts_data List of household, person, vehicle, day, and trip tables in +#' data.table format. #' @param variable_list A variable list with descriptions and table locations #' of variables. +#' @param cbind_ids list of unique identifiers for each table in hts_data +#' @param cbind_wts list of weight for each table in hts_data #' @param return_weight_cols If true binds weight variable along with rhs_var #' to lhs_table. Default is FALSE. -#' @param ... Additional arguments passed to \code{link{hts_get_keycols}} #' #' @return Inputted table with inputted variable binded. #' @export @@ -14,20 +17,28 @@ #' @examples #' #' require(data.table) -#' hts_cbind_var(lhs_table = trip, rhs_var = 'speed_mph', variable_list = variable_list) -#' hts_cbind_var(lhs_table = trip, rhs_var = 'speed_mph', -#' variable_list = variable_list, return_weight_cols = TRUE) +#' hts_cbind_var(lhs_table = trip, +#' rhs_var = 'speed_mph', +#' hts_data = test_data, +#' variable_list = variable_list) +#' hts_cbind_var(lhs_table = trip, +#' rhs_var = 'speed_mph', +#' hts_data = test_data, +#' variable_list = variable_list, +#' return_weight_cols = TRUE) #' hts_cbind_var = function(lhs_table, rhs_var, + hts_data, variable_list = variable_list, return_weight_cols = FALSE, - ...) { + cbind_ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), + cbind_wts = c('hh_weight', 'person_weight', 'day_weight', 'trip_weight', 'hh_weight')) { var_location = hts_find_var(rhs_var, variables_dt = variable_list) - rhs_table =data.table::copy(get(as.character(var_location))) + rhs_table = hts_data[[var_location]] # If joining trip to vehicle or vice versa, need vehicle ID: if ("trip_id" %in% names(lhs_table) & @@ -45,9 +56,11 @@ hts_cbind_var = function(lhs_table, } # Subset table to ID columns, weight columns (if desired), rhs_var: - selected_cols = c(hts_get_keycols(rhs_table, - ids = TRUE, - weights = return_weight_cols), rhs_var) + selected_cols = c(intersect( + names(rhs_table), + c(cbind_ids, cbind_wts) + ), + rhs_var) rhs_table = rhs_table[, selected_cols, with = FALSE] diff --git a/R/hts_get_keycols.R b/R/hts_get_keycols.R deleted file mode 100644 index 28dc368..0000000 --- a/R/hts_get_keycols.R +++ /dev/null @@ -1,78 +0,0 @@ -#' Find key columns in table -#' -#' @param dt Dataset to find key columns of in data.table format -#' @param ids Boolean whether to return id columns. Default is TRUE. -#' @param weights Boolean whether to return weight columns. Default is TRUE. -#' @param priority Boolean whether to only return highest level weight/id. -#' Default is FALSE. -#' -#' @return List of names of key columns in the dataset. -#' @export -#' -#' @examples -#' -#' require(data.table) -#' hts_get_keycols(dt = trip) -#' hts_get_keycols(dt = trip, priority = TRUE) -#' -hts_get_keycols = function(dt, - ids = TRUE, - weights = TRUE, - priority = FALSE){ - - idcols = c('trip_id', 'day_id', 'person_id','hh_id', 'vehicle_id') - wtcols = c('trip_weight', 'day_weight', 'person_weight', 'hh_weight') - - idnames = c() - wtnames = c() - - #get id(s) - if(ids){ - - # priority will select the highest level weight/id only - if (priority){ - - for (name in idcols){ - - if (name %in% names(dt)){ - - idnames = name - break - } - - } - - } else { - - idnames = c(names(dt)[names(dt) %in% idcols]) - - } - } - - #get weight(s) - if(weights){ - - if (priority){ - - for (name in wtcols){ - - if (name %in% names(dt)){ - - wtnames = name - break - } - - } - - } else { - - wtnames = c(names(dt)[names(dt) %in% wtcols]) - - } - } - - names = c(idnames, wtnames) - - return(names) - -} diff --git a/R/hts_get_ns.R b/R/hts_get_ns.R index 56cc53e..8557b87 100644 --- a/R/hts_get_ns.R +++ b/R/hts_get_ns.R @@ -2,6 +2,8 @@ #' #' @param prepped_dt Dataset to pull counts from. #' @param weighted Boolean whether to pull weighted estimates. +#' @param ids list of possible ids to return counts for +#' @param wt_col weight column to return sum of #' #' @return List of unweighted counts, weighted counts, and highest level unit. #' @export @@ -9,77 +11,33 @@ #' @examples #' #' require(data.table) -#' hts_get_ns(prepped_dt = day, weighted = TRUE) +# hts_get_ns(prepped_dt = day, +# weighted = TRUE, +# wt_col = 'day_weight') #' hts_get_ns = function(prepped_dt, - weighted + weighted, + ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), + wt_col ) { - #get unweighted counts - n_idcols = hts_get_keycols(prepped_dt, - ids = TRUE, - weights = FALSE) + # Get ids that are in prepped_dt + present_ids = intersect(names(prepped_dt), ids) - ndt_ids = prepped_dt[, n_idcols, with=FALSE] + ndt_ids = prepped_dt[, present_ids, with=FALSE] ns_unwtd = lapply(ndt_ids, function(x) uniqueN(x)) - - - n_names = - paste0(stringr::str_to_title( - stringr::str_replace( - pattern = "hh", - replacement = "household", - string = stringr::str_remove(string = n_idcols, pattern = "_id") - ) - ), - "s") + + n_names = paste('Count of unique', present_ids) names(ns_unwtd) = n_names - #get units - id_names = c('trip_id', 'day_id', 'person_id', 'vehicle_id', 'hh_id') - - unit = NULL - - for (name in id_names) { - - - if (name %in% names(prepped_dt)){ - - id = name - - unit = paste0(stringr::str_remove(string = id, pattern = "_id"), - "s") - - break - - } - - } - #get weighted counts if (weighted){ - n_wtcols = hts_get_keycols(prepped_dt, - ids = FALSE, - weights = TRUE) - - ndt_wts = prepped_dt[, n_wtcols, with=FALSE] - - ns_wtd = lapply(ndt_wts, function(x) sum(x)) - - n_names = - paste0(stringr::str_to_title( - stringr::str_replace( - pattern = "hh", - replacement = "household", - string = stringr::str_remove(string = n_wtcols, pattern = "_weight") - ) - ), - "s") - - names(ns_wtd) = n_names + ns_wtd = prepped_dt[, sum(get(wt_col))] + + names(ns_wtd) = paste('Sum of', wt_col) } else { @@ -89,11 +47,9 @@ hts_get_ns = function(prepped_dt, ns = list( 'unwtd' = ns_unwtd, - 'wtd' = ns_wtd, - 'units' = unit + 'wtd' = ns_wtd ) return(ns) - } diff --git a/R/hts_melt_vars.R b/R/hts_melt_vars.R index b44f9a3..5b310c2 100644 --- a/R/hts_melt_vars.R +++ b/R/hts_melt_vars.R @@ -8,8 +8,9 @@ #' Defaults to NULL. #' @param variables_dt List of variable locations and descriptions in data.table #' format. -#' @param hts_data List containing household, person, day, trip, and vehicle +#' @param data List containing household, person, day, trip, and vehicle #' datasets in data.table format. +#' @param ids unique identifiers appearing in wide_dt #' @param remove_missing Boolean to remove rows with missing values. Defaults to #' TRUE. #' @param missing_values Missing values to remove. Defaults to 'Missing Response' @@ -32,7 +33,7 @@ #' require(stringr) #' hts_melt_vars(shared_name = 'race', #' wide_dt = person, -#' hts_data = list('hh' = hh, +#' data = list('hh' = hh, #' 'person' = person, #' 'day' = day, #' 'trip' = trip, @@ -42,7 +43,8 @@ hts_melt_vars = function(shared_name = NULL, wide_dt = NULL, shared_name_vars = NULL, variables_dt = variable_list, - hts_data = hts_data, + data = hts_data, + ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), remove_missing = TRUE, missing_values = c("Missing Response", "995"), checkbox_label_sep = ":", @@ -54,7 +56,7 @@ hts_melt_vars = function(shared_name = NULL, var_location = hts_find_var(shared_name, variables_dt = variables_dt) # Select table where this variable lives: - wide_dt = hts_data[[var_location]] + wide_dt = data[[var_location]] } @@ -67,7 +69,7 @@ hts_melt_vars = function(shared_name = NULL, melted_dt = data.table::melt( wide_dt, - id.vars = hts_get_keycols(wide_dt), + id.vars = intersect(names(wide_dt),ids), measure.vars = shared_name_vars, variable.name = "variable", value.name = "value" @@ -96,7 +98,7 @@ hts_melt_vars = function(shared_name = NULL, # two or more checked: melted_dt[, num_checked := sum(value), - by = c(hts_get_keycols(wide_dt))] + by = intersect(names(melted_dt), ids)] # make factor levels melted_dt$description = factor(melted_dt$description, levels = unique(melted_dt$description)) diff --git a/R/hts_prep_byvar.R b/R/hts_prep_byvar.R index 0d3f55d..8697c7b 100644 --- a/R/hts_prep_byvar.R +++ b/R/hts_prep_byvar.R @@ -5,6 +5,8 @@ #' format. #' @param hts_data List containing household, person, day, trip, and vehicle #' datasets in data.table format. +#' @param byvar_ids unique identifiers for each table in hts_data +#' @param byvar_wts weight column for each table in hts_data #' @param ... Additional parameters to pass to \code{link{hts_melt_vars}} #' #' @return Data table containing the variable to be summarized and other key @@ -31,6 +33,8 @@ hts_prep_byvar = function(summarize_by = NULL, variables_dt = variables_list, hts_data, + byvar_ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), + byvar_wts = c('hh_weight', 'person_weight', 'day_weight', 'trip_weight', 'hh_weight'), ...) { # For each variables in trip table: @@ -55,6 +59,7 @@ hts_prep_byvar = function(summarize_by = NULL, hts_melt_vars( shared_name = summarize_by[[b]], wide_dt = byvar_dt_v, + ids = byvar_ids, shared_name_vars = NULL, variables_dt = variables_dt, to_single_row = TRUE, @@ -69,18 +74,23 @@ hts_prep_byvar = function(summarize_by = NULL, nbins = 7) } + if (!is.null(byvar_wts)){ + + table_idx = which(names(hts_data) == byvar_loc) + wtname = byvar_wts[table_idx] + + } else { + wtname = NULL + } + + if (!byvar_is_shared) { - byvar_cols = c(hts_get_keycols(byvar_dt_v), byvar) + byvar_cols = c(intersect(c(byvar_ids, wtname), names(byvar_dt_v)), byvar) byvar_dt_v = byvar_dt_v[, byvar_cols, with=FALSE] } - # Set keys for merging - # keycols = hts_get_keycols(byvar_dt_v) - # - # setkeyv(byvar_dt_v, keycols) - byvar_dt_ls[[b]] = byvar_dt_v } diff --git a/R/hts_prep_data.R b/R/hts_prep_data.R index 67aa2a0..b60a887 100644 --- a/R/hts_prep_data.R +++ b/R/hts_prep_data.R @@ -7,7 +7,9 @@ #' format. #' @param data List of household, person, vehicle, day, and trip tables in #' data.table format. +#' @param id_cols name of unique identifier for each table in hts_data #' @param weighted Whether the data is weighted. Default is TRUE. +#' @param wt_cols weight name for each table in hts_data #' @param remove_outliers Whether to remove outliers for numeric variable. Default #' is TRUE. #' @param threshold Threshold to define outliers. Default is 0.975. @@ -57,7 +59,9 @@ hts_prep_data = function(summarize_var = NULL, summarize_by = NULL, variables_dt = variable_list, data = hts_data, + id_cols = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), weighted = TRUE, + wt_cols = c('hh_weight', 'person_weight', 'day_weight', 'trip_weight', 'hh_weight'), remove_outliers = TRUE, threshold = 0.975, remove_missing = TRUE, @@ -91,19 +95,32 @@ hts_prep_data = function(summarize_var = NULL, message(paste0(msg_pt1, " ", msg_pt2)) # TODO: Could we put id and weight cols in a snippet or some such? # Or in a settings/options for these functions? - + + if (remove_missing){ + + data = hts_remove_missing_data(hts_data = data, + variables_dt = variables_dt, + summarize_var = summarize_var, + summarize_by = summarize_by, + ids = id_cols, + missing_values = missing_values, + not_imputable = not_imputable) + } + # Find location of summary variable: var_location = hts_find_var(summarize_var, variables_dt = variables_dt) - + + tbl_idx = which(names(data) == var_location) + # Select table where this variable lives: var_dt = data[[var_location]] - + # Is this a shared variable? var_is_shared = variables_dt[shared_name == summarize_var, is_checkbox][1] == 1 - + # If yes, expand summarize_var: if (var_is_shared) { - + summarize_var = variables_dt[shared_name == summarize_var, variable] for(i in 1:length(summarize_var)){ @@ -118,86 +135,121 @@ hts_prep_data = function(summarize_var = NULL, } + } - # Subset table to these column(s): - subset_cols = c(hts_get_keycols(var_dt), summarize_var) + # only keep ids that are in var_dt + id_cols = intersect(id_cols, names(var_dt)) + # Subset table to these column(s): + wtname = wt_cols[tbl_idx] + + if (weighted){ + + subset_cols = c(id_cols, summarize_var, wtname) + + } else { + + subset_cols = c(id_cols, summarize_var) + + } + var_dt = var_dt[, subset_cols, with=FALSE] - + # If shared variable, melt var_dt: if (var_is_shared) { - + shared_name = variables_dt[variable == summarize_var[[1]], shared_name] + + if (weighted){ + + var_dt = hts_melt_vars( + shared_name = shared_name, + wide_dt = var_dt, + variables_dt = variables_dt, + shared_name_vars = summarize_var, + ids = c(id_cols, wtname), + remove_missing = TRUE, + checkbox_label_sep = ":", + missing_values = c("Missing Response", "995"), + to_single_row = FALSE + ) + + } else { + + var_dt = hts_melt_vars( + shared_name = shared_name, + wide_dt = var_dt, + variables_dt = variables_dt, + shared_name_vars = summarize_var, + ids = id_cols, + remove_missing = TRUE, + checkbox_label_sep = ":", + missing_values = c("Missing Response", "995"), + to_single_row = FALSE + ) + + } - var_dt = hts_melt_vars( - shared_name = shared_name, - wide_dt = var_dt, - variables_dt = variables_dt, - shared_name_vars = summarize_var, - remove_missing = TRUE, - checkbox_label_sep = ":", - missing_values = missing_values, - to_single_row = FALSE - ) summarize_var = shared_name - + setnames(var_dt, shared_name, 'shared_name') - + # make factor levels var_dt$shared_name = factor(var_dt$shared_name, levels = unique(var_dt$shared_name)) - + setnames(var_dt, 'shared_name', shared_name) - + } - + # Identify, then bin, if summarize_var is numeric: v_class = variables_dt[shared_name == summarize_var, data_type][[1]] - + if (!v_class %in% c("integer", "numeric")) { var_dt_num = NULL var_dt_cat = var_dt - + } - + if (v_class %in% c("integer", "numeric")) { - + # remove outliers if (remove_outliers){ - + out = hts_remove_outliers(var_dt, numvar = summarize_var, threshold = threshold) - + var_dt = out[['dt']] - + outlier_table = out[['outlier_description']] - + } - + # save a copy of the un-binned data: var_dt_num = data.table::copy(var_dt) - - + + # bin the data for categorical summaries: var_dt_cat = hts_bin_var(prepped_dt = var_dt, numvar = summarize_var, nbins = 7) - + } - + # Summarize-by variables: if (length(summarize_by) == 0) { - + num_res = var_dt_num cat_res = var_dt_cat - + } - + if (length(summarize_by) > 0) { + for (i in 1:length(summarize_by)){ var = summarize_by[i] @@ -224,44 +276,42 @@ hts_prep_data = function(summarize_var = NULL, } } + byvar_dt = hts_prep_byvar(summarize_by, variables_dt = variables_dt, - hts_data = data) - + hts_data = data, + byvar_ids = id_cols, + byvar_wts = wt_cols) + # Merge by var and summarize var: allow_cartesian_setting = FALSE - + if (var_is_shared == TRUE) { allow_cartesian_setting = TRUE } - + cat_res = merge(var_dt_cat, byvar_dt, all.x = FALSE, all.y = FALSE, allow.cartesian = allow_cartesian_setting) - + + setcolorder(cat_res, intersect(c(id_cols, wt_cols, summarize_var, summarize_by), names(cat_res))) + if (v_class %in% c("integer", "numeric")) { num_res = merge(var_dt_num, byvar_dt, all.x = FALSE, all.y = FALSE, allow.cartesian = allow_cartesian_setting) - + + setcolorder(num_res, intersect(c(id_cols, wt_cols, summarize_var, summarize_by), names(cat_res))) + } - + if (!v_class %in% c("integer", "numeric")) { num_res = NULL } - - - } - if (remove_missing){ - hts_data = hts_remove_missing_data(hts_data = data, - variables_dt = variables_dt, - summarize_var = summarize_var, - summarize_by = summarize_by, - missing_values = missing_values, - not_imputable = not_imputable) + } if (!is.null(strataname)) { @@ -270,7 +320,10 @@ hts_prep_data = function(summarize_var = NULL, cat_res = hts_cbind_var(lhs_table = cat_res, rhs_var = strataname, - variable_list = variables_dt) + hts_data = data, + variable_list = variables_dt, + cbind_ids = id_cols, + cbind_wts = wt_cols) } @@ -278,15 +331,18 @@ hts_prep_data = function(summarize_var = NULL, num_res = hts_cbind_var(lhs_table = num_res, rhs_var = strataname, - variable_list = variables_dt) + hts_data = data, + variable_list = variables_dt, + cbind_ids = id_cols, + cbind_wts = wt_cols) } } - + prepped_dt_ls = list("cat" = cat_res, "num" = num_res, "var_is_shared" = var_is_shared) - + # Append outliers: if (v_class %in% c("integer", 'numeric') & remove_outliers) { prepped_dt_ls = list( @@ -294,10 +350,9 @@ hts_prep_data = function(summarize_var = NULL, "num" = num_res, "outliers" = outlier_table) } - - + + return(prepped_dt_ls) - } diff --git a/R/hts_prep_triprate.R b/R/hts_prep_triprate.R index 338c78c..f26efd1 100644 --- a/R/hts_prep_triprate.R +++ b/R/hts_prep_triprate.R @@ -6,6 +6,8 @@ #' format. #' @param trip_name Name of the trip dataset in hts_data. #' @param day_name Name of the day dataset in hts_data. +#' @param ids name of unique identifier in each table in hts_data +#' @param wts name of weight column in each table in hts_data #' @param remove_outliers Boolean whether or not to remove outliers from dataset. #' Default is TRUE. #' @param threshold Threshold to define outliers. Default is 0.975. @@ -43,6 +45,8 @@ hts_prep_triprate = function(summarize_by = NULL, variables_dt = variable_list, trip_name = 'trip', day_name = 'day', + ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), + wts = c('hh_weight', 'person_weight', 'day_weight', 'trip_weight', 'hh_weight'), remove_outliers = TRUE, threshold = 0.975, weighted = TRUE, @@ -51,12 +55,23 @@ hts_prep_triprate = function(summarize_by = NULL, tripdat = hts_data[[trip_name]] daydat = hts_data[[day_name]] - tripratekeys = c("hh_id", "person_id", "day_id") - trip_subset_cols = hts_get_keycols(tripdat) - day_subset_cols = hts_get_keycols(daydat) + trip_index = which(names(hts_data) == trip_name) + day_index = which(names(hts_data) == day_name) - if (weighted & (!"trip_weight" %in% trip_subset_cols | - !"day_weight" %in% day_subset_cols)) { + # Get ids + trip_id = ids[trip_index] + day_id = ids[day_index] + + # Get weights + trip_wt = wts[trip_index] + day_wt = wts[day_index] + + tripratekeys = intersect(names(tripdat), ids[-trip_index]) + trip_subset_cols = intersect(names(tripdat), c(ids, wts)) + day_subset_cols = intersect(names(daydat), c(ids, wts)) + + if (weighted & (!trip_wt %in% trip_subset_cols | + !day_wt %in% day_subset_cols)) { stop("Trip/Day weight not found - are these data weighted?") } @@ -69,7 +84,7 @@ hts_prep_triprate = function(summarize_by = NULL, if (length(summarize_by) == 0) { if (weighted) { - triprate_dt = tripdat[, .(num_trips = sum(trip_weight)), + triprate_dt = tripdat[, .(num_trips = sum(get(trip_wt))), by = tripratekeys] } @@ -95,7 +110,7 @@ hts_prep_triprate = function(summarize_by = NULL, # calculate trip rate triprate_dt[, trip_rate := - ifelse(num_trips == 0, 0, num_trips / day_weight)] + ifelse(num_trips == 0, 0, num_trips / get(day_wt))] triprate_dt[, num_trips := NULL] @@ -105,20 +120,23 @@ hts_prep_triprate = function(summarize_by = NULL, if (length(summarize_by) > 0) { - byvar_dt = hts_prep_byvar(summarize_by, variables_dt = variables_dt, hts_data = hts_data) + byvar_dt = hts_prep_byvar(summarize_by, + variables_dt = variables_dt, + hts_data = hts_data, + byvar_ids = ids) merge_cols = names(byvar_dt)[names(byvar_dt) %in% names(trip_control)] triprate_dt = merge(trip_control, byvar_dt, by = merge_cols) - triprate_cols = hts_get_keycols(triprate_dt) + triprate_cols = intersect(names(triprate_dt), c(ids, wts)) - triprate_cols = triprate_cols[!triprate_cols %in% c("trip_id", "trip_weight")] + triprate_cols = triprate_cols[!triprate_cols %in% c(trip_id, trip_wt)] triprate_cols_all = c(triprate_cols, summarize_by) if (weighted) { - triprate_dt = triprate_dt[, .(num_trips = sum(trip_weight)), + triprate_dt = triprate_dt[, .(num_trips = sum(get(trip_wt))), by = triprate_cols_all] } @@ -134,7 +152,7 @@ hts_prep_triprate = function(summarize_by = NULL, # If one of the by-variables is in trip table, need to expand to # include all levels of the variable for every trip, and fill with zeros: - if ("trip_id" %in% names(byvar_dt)) { + if (trip_id %in% names(byvar_dt)) { # fill in with zeros for zero trips for a given level of xt_var using dcast: dcast_formula = paste0(paste0(triprate_cols, collapse = " + "), @@ -178,7 +196,7 @@ hts_prep_triprate = function(summarize_by = NULL, # calculate trip rate triprate_dt[, trip_rate := - ifelse(num_trips == 0, 0, num_trips / day_weight)] + ifelse(num_trips == 0, 0, num_trips / get(day_wt))] triprate_dt[, num_trips := NULL] diff --git a/R/hts_remove_missing_data.R b/R/hts_remove_missing_data.R index 99ccc91..351ae9b 100644 --- a/R/hts_remove_missing_data.R +++ b/R/hts_remove_missing_data.R @@ -6,6 +6,7 @@ #' of variables. #' @param summarize_var Variable to be summarized that has it's missing data #' removed. +#' @param ids names of unique identifiers for each table in hts_data #' @param summarize_by Variable being summarized by that has it's missing data #' removed. Default is NULL. #' @param missing_values Missing values that will be removed. Defaults are 995 and @@ -31,6 +32,7 @@ hts_remove_missing_data = function(hts_data, variables_dt, summarize_var, + ids = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), summarize_by = NULL, missing_values = c("Missing Response", "995"), not_imputable = -1){ @@ -44,10 +46,16 @@ hts_remove_missing_data = function(hts_data, !get(summarize_var_name) %in% c(missing_values, not_imputable) | is.na(get(summarize_var_name))] - summarize_var_id = hts_get_keycols(summarize_var_tbl, - ids = TRUE, - weights = FALSE, - priority = TRUE) + # get ids that are in this table + ids_in_table = intersect(ids, names(summarize_var_tbl)) + + # get id with the most unique counts to filter on + max_index = which.max( + sapply(summarize_var_tbl[, ids_in_table, with = FALSE], function(x) length(unique(x))) + ) + + summarize_var_id = ids_in_table[max_index] + hts_data = hts_filter_data( hts_data = hts_data, @@ -69,10 +77,18 @@ hts_remove_missing_data = function(hts_data, !get(summarize_by_name) %in% c(missing_values, not_imputable) | is.na(get(summarize_by_name))] - summarize_by_id = hts_get_keycols(summarize_by_tbl, - ids = TRUE, - weights = FALSE, - priority = TRUE) + # get id with the most unique counts to filter on + + + # get ids that are in this table + ids_in_table = intersect(ids, names(summarize_by_tbl)) + + max_index = which.max( + sapply(summarize_by_tbl[, ids_in_table, with = FALSE], function(x) length(unique(x))) + ) + + summarize_by_id = ids_in_table[max_index] + hts_data = hts_filter_data( hts_data = hts_data, @@ -85,3 +101,6 @@ hts_remove_missing_data = function(hts_data, return(hts_data) } + +## quiets concerns of R CMD check +utils::globalVariables(c("ids", "ids_in_table")) \ No newline at end of file diff --git a/R/hts_summary.R b/R/hts_summary.R index 656d47f..5f54b8e 100644 --- a/R/hts_summary.R +++ b/R/hts_summary.R @@ -10,6 +10,7 @@ #' variable being summarized is categorical), 'checkbox' (when the variable being #' summarized is derived from a multiple response, aka select-all-that-apply question) #' or 'numeric', when the variable being summarized is numeric. +#' @param id_cols names of possible ids in prepped_dt to return unique counts of #' @param weighted Whether the data is weighted. Default is TRUE. #' @param se Whether to calculate standard error. Default is FALSE. Will be set #' to FALSE if weighted is FALSE. @@ -74,6 +75,7 @@ hts_summary = function( summarize_var, summarize_by = NULL, summarize_vartype = 'categorical', + id_cols = c('hh_id', 'person_id', 'day_id', 'trip_id', 'vehicle_id'), weighted = TRUE, se = FALSE, wtname = NULL, @@ -113,7 +115,9 @@ hts_summary = function( cat_ns = hts_get_ns( prepped_dt = prepped_dt, - weighted = weighted + weighted = weighted, + ids = id_cols, + wt_col = wtname ) # something here to check if the number of unique values is more than 20 diff --git a/man/get_distance_meters.Rd b/man/get_distance_meters.Rd index 9e5c950..7df05d4 100644 --- a/man/get_distance_meters.Rd +++ b/man/get_distance_meters.Rd @@ -20,3 +20,7 @@ A vector of distances in meters Function to get haversine distance in meters between two points Based on calculation from the geosphere package } +\examples{ +get_distance_meters(c(38.8734, -75.2394), c(40.3497, -76.2314)) + +} diff --git a/man/hts_cbind_var.Rd b/man/hts_cbind_var.Rd index 94352eb..69d8277 100644 --- a/man/hts_cbind_var.Rd +++ b/man/hts_cbind_var.Rd @@ -7,9 +7,11 @@ hts_cbind_var( lhs_table, rhs_var, + hts_data, variable_list = variable_list, return_weight_cols = FALSE, - ... + cbind_ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), + cbind_wts = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight") ) } \arguments{ @@ -17,13 +19,18 @@ hts_cbind_var( \item{rhs_var}{Variable to bind to the lhs_table.} +\item{hts_data}{List of household, person, vehicle, day, and trip tables in +data.table format.} + \item{variable_list}{A variable list with descriptions and table locations of variables.} \item{return_weight_cols}{If true binds weight variable along with rhs_var to lhs_table. Default is FALSE.} -\item{...}{Additional arguments passed to \code{link{hts_get_keycols}}} +\item{cbind_ids}{list of unique identifiers for each table in hts_data} + +\item{cbind_wts}{list of weight for each table in hts_data} } \value{ Inputted table with inputted variable binded. @@ -34,8 +41,14 @@ Bind a column from one table to another \examples{ require(data.table) -hts_cbind_var(lhs_table = trip, rhs_var = 'speed_mph', variable_list = variable_list) -hts_cbind_var(lhs_table = trip, rhs_var = 'speed_mph', -variable_list = variable_list, return_weight_cols = TRUE) +hts_cbind_var(lhs_table = trip, + rhs_var = 'speed_mph', + hts_data = test_data, + variable_list = variable_list) +hts_cbind_var(lhs_table = trip, + rhs_var = 'speed_mph', + hts_data = test_data, +variable_list = variable_list, + return_weight_cols = TRUE) } diff --git a/man/hts_get_keycols.Rd b/man/hts_get_keycols.Rd deleted file mode 100644 index 4c5e62d..0000000 --- a/man/hts_get_keycols.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/hts_get_keycols.R -\name{hts_get_keycols} -\alias{hts_get_keycols} -\title{Find key columns in table} -\usage{ -hts_get_keycols(dt, ids = TRUE, weights = TRUE, priority = FALSE) -} -\arguments{ -\item{dt}{Dataset to find key columns of in data.table format} - -\item{ids}{Boolean whether to return id columns. Default is TRUE.} - -\item{weights}{Boolean whether to return weight columns. Default is TRUE.} - -\item{priority}{Boolean whether to only return highest level weight/id. -Default is FALSE.} -} -\value{ -List of names of key columns in the dataset. -} -\description{ -Find key columns in table -} -\examples{ - -require(data.table) -hts_get_keycols(dt = trip) -hts_get_keycols(dt = trip, priority = TRUE) - -} diff --git a/man/hts_get_ns.Rd b/man/hts_get_ns.Rd index 0f08a79..c9294d1 100644 --- a/man/hts_get_ns.Rd +++ b/man/hts_get_ns.Rd @@ -4,12 +4,21 @@ \alias{hts_get_ns} \title{Get counts from dataset} \usage{ -hts_get_ns(prepped_dt, weighted) +hts_get_ns( + prepped_dt, + weighted, + ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), + wt_col +) } \arguments{ \item{prepped_dt}{Dataset to pull counts from.} \item{weighted}{Boolean whether to pull weighted estimates.} + +\item{ids}{list of possible ids to return counts for} + +\item{wt_col}{weight column to return sum of} } \value{ List of unweighted counts, weighted counts, and highest level unit. @@ -20,6 +29,5 @@ Get counts from dataset \examples{ require(data.table) -hts_get_ns(prepped_dt = day, weighted = TRUE) } diff --git a/man/hts_melt_vars.Rd b/man/hts_melt_vars.Rd index bb55f8d..86d4ba2 100644 --- a/man/hts_melt_vars.Rd +++ b/man/hts_melt_vars.Rd @@ -9,7 +9,8 @@ hts_melt_vars( wide_dt = NULL, shared_name_vars = NULL, variables_dt = variable_list, - hts_data = hts_data, + data = hts_data, + ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), remove_missing = TRUE, missing_values = c("Missing Response", "995"), checkbox_label_sep = ":", @@ -29,9 +30,11 @@ Defaults to NULL.} \item{variables_dt}{List of variable locations and descriptions in data.table format.} -\item{hts_data}{List containing household, person, day, trip, and vehicle +\item{data}{List containing household, person, day, trip, and vehicle datasets in data.table format.} +\item{ids}{unique identifiers appearing in wide_dt} + \item{remove_missing}{Boolean to remove rows with missing values. Defaults to TRUE.} @@ -58,7 +61,7 @@ require(data.table) require(stringr) hts_melt_vars(shared_name = 'race', wide_dt = person, - hts_data = list('hh' = hh, + data = list('hh' = hh, 'person' = person, 'day' = day, 'trip' = trip, diff --git a/man/hts_prep_byvar.Rd b/man/hts_prep_byvar.Rd index 593c851..d1bcf2c 100644 --- a/man/hts_prep_byvar.Rd +++ b/man/hts_prep_byvar.Rd @@ -8,6 +8,8 @@ hts_prep_byvar( summarize_by = NULL, variables_dt = variables_list, hts_data, + byvar_ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), + byvar_wts = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), ... ) } @@ -20,6 +22,10 @@ format.} \item{hts_data}{List containing household, person, day, trip, and vehicle datasets in data.table format.} +\item{byvar_ids}{unique identifiers for each table in hts_data} + +\item{byvar_wts}{weight column for each table in hts_data} + \item{...}{Additional parameters to pass to \code{link{hts_melt_vars}}} } \value{ diff --git a/man/hts_prep_data.Rd b/man/hts_prep_data.Rd index 7ba59f1..1f3e6ed 100644 --- a/man/hts_prep_data.Rd +++ b/man/hts_prep_data.Rd @@ -9,7 +9,9 @@ hts_prep_data( summarize_by = NULL, variables_dt = variable_list, data = hts_data, + id_cols = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), weighted = TRUE, + wt_cols = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), remove_outliers = TRUE, threshold = 0.975, remove_missing = TRUE, @@ -30,8 +32,12 @@ format.} \item{data}{List of household, person, vehicle, day, and trip tables in data.table format.} +\item{id_cols}{name of unique identifier for each table in hts_data} + \item{weighted}{Whether the data is weighted. Default is TRUE.} +\item{wt_cols}{weight name for each table in hts_data} + \item{remove_outliers}{Whether to remove outliers for numeric variable. Default is TRUE.} diff --git a/man/hts_prep_triprate.Rd b/man/hts_prep_triprate.Rd index a8e24ce..2ec7a46 100644 --- a/man/hts_prep_triprate.Rd +++ b/man/hts_prep_triprate.Rd @@ -9,6 +9,8 @@ hts_prep_triprate( variables_dt = variable_list, trip_name = "trip", day_name = "day", + ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), + wts = c("hh_weight", "person_weight", "day_weight", "trip_weight", "hh_weight"), remove_outliers = TRUE, threshold = 0.975, weighted = TRUE, @@ -26,6 +28,10 @@ format.} \item{day_name}{Name of the day dataset in hts_data.} +\item{ids}{name of unique identifier in each table in hts_data} + +\item{wts}{name of weight column in each table in hts_data} + \item{remove_outliers}{Boolean whether or not to remove outliers from dataset. Default is TRUE.} diff --git a/man/hts_remove_missing_data.Rd b/man/hts_remove_missing_data.Rd index a81c5d0..4287af9 100644 --- a/man/hts_remove_missing_data.Rd +++ b/man/hts_remove_missing_data.Rd @@ -8,6 +8,7 @@ hts_remove_missing_data( hts_data, variables_dt, summarize_var, + ids = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), summarize_by = NULL, missing_values = c("Missing Response", "995"), not_imputable = -1 @@ -23,6 +24,8 @@ of variables.} \item{summarize_var}{Variable to be summarized that has it's missing data removed.} +\item{ids}{names of unique identifiers for each table in hts_data} + \item{summarize_by}{Variable being summarized by that has it's missing data removed. Default is NULL.} diff --git a/man/hts_summary.Rd b/man/hts_summary.Rd index 295d6e4..8cc2f2a 100644 --- a/man/hts_summary.Rd +++ b/man/hts_summary.Rd @@ -9,6 +9,7 @@ hts_summary( summarize_var, summarize_by = NULL, summarize_vartype = "categorical", + id_cols = c("hh_id", "person_id", "day_id", "trip_id", "vehicle_id"), weighted = TRUE, se = FALSE, wtname = NULL, @@ -32,6 +33,8 @@ variable being summarized is categorical), 'checkbox' (when the variable being summarized is derived from a multiple response, aka select-all-that-apply question) or 'numeric', when the variable being summarized is numeric.} +\item{id_cols}{names of possible ids in prepped_dt to return unique counts of} + \item{weighted}{Whether the data is weighted. Default is TRUE.} \item{se}{Whether to calculate standard error. Default is FALSE. Will be set diff --git a/tests/testthat/test_hts_cbind_var.R b/tests/testthat/test_hts_cbind_var.R index c175c9d..7fd21ff 100644 --- a/tests/testthat/test_hts_cbind_var.R +++ b/tests/testthat/test_hts_cbind_var.R @@ -15,8 +15,11 @@ test_that("hts_cbind_var should bind a column to another table", { # Create a sample variable_list variable_list = variable_list + # Create a sample hts_data + data("test_data") + # Call the function - result = hts_cbind_var(lhs_table, rhs_var = 'speed_mph', variable_list = variable_list) + result = hts_cbind_var(lhs_table, rhs_var = 'speed_mph', hts_data = test_data, variable_list = variable_list) # Check if the result is a data.table expect_is(result, "data.table", diff --git a/tests/testthat/test_hts_get_keycols.R b/tests/testthat/test_hts_get_keycols.R deleted file mode 100644 index aa403b4..0000000 --- a/tests/testthat/test_hts_get_keycols.R +++ /dev/null @@ -1,25 +0,0 @@ - -context("Test suite for hts_get_keycols function") - -# Load necessary libraries and setup environment -library(testthat) -library(data.table) - -sample_dt = test_data$person - - -test_that("hts_get_keycols should return key columns based on options", { - - results_all = hts_get_keycols(sample_dt, ids=TRUE, weights = TRUE) - - expect_is(results_all, "character", info = "hts_get_keycols should return a character vector") - - results_ids = hts_get_keycols(sample_dt, ids=TRUE, weights = FALSE) - - expect_is(results_ids, "character", - info = "Only id columns should be included in the result when weights = FALSE") - - # need to add two more test to check weight only cols and priority option - - # expect_equal() -}) diff --git a/tests/testthat/test_hts_get_ns.R b/tests/testthat/test_hts_get_ns.R index a611d94..c770080 100644 --- a/tests/testthat/test_hts_get_ns.R +++ b/tests/testthat/test_hts_get_ns.R @@ -7,7 +7,7 @@ library(data.table) sample_dt = test_data$person -test_that("hts_get_ns should return counts and units", { +test_that("hts_get_ns should return counts", { results = hts_get_ns(sample_dt, weighted = FALSE) @@ -17,7 +17,5 @@ test_that("hts_get_ns should return counts and units", { expect_null(results$wtd, info = "'wtd' component should be NULL for unweighted counts") - expect_is(results$units, "character", info = "units component should be a character") - # expect_equal() }) diff --git a/vignettes/getting_started.Rmd b/vignettes/getting_started.Rmd index e7297b4..503c7b2 100644 --- a/vignettes/getting_started.Rmd +++ b/vignettes/getting_started.Rmd @@ -120,11 +120,7 @@ data("variable_list") DT = hts_prep_data(summarize_var = 'speed_mph', variables_dt = variable_list, - data = list('hh' = hh, - 'person' = person, - 'day' = day, - 'trip' = trip, - 'vehicle' = vehicle)) + data = test_data) ``` @@ -203,11 +199,7 @@ If we want summarize a variable by another variable (e.g., mode type by a person DT = hts_prep_data(summarize_var = 'mode_type', summarize_by = 'race', variables_dt = variable_list, - data = list('hh' = hh, - 'person' = person, - 'day' = day, - 'trip' = trip, - 'vehicle' = vehicle)) + data = test_data) mode_by_race_summary = hts_summary(prepped_dt = DT$cat, summarize_var = 'mode_type',