Skip to content

Commit

Permalink
reworked ModelCompareMultivariateVAR class to work with ModelBuildMul…
Browse files Browse the repository at this point in the history
…tivariateVAR class. Also supported seasonality.
  • Loading branch information
ngupta23 committed Mar 29, 2020
1 parent 4d36653 commit dc49bc2
Show file tree
Hide file tree
Showing 22 changed files with 683 additions and 303 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: tswgewrapped
Title: Helpful wrappers for 'tswge' and 'vars' packages
Version: 1.8.8
Version: 1.8.9
Authors@R: c(person("David", "Josephs", email = "josephsd@smu.edu", role = c("aut", "cre")),
person("Nikhil", "Gupta", email = "guptan@smu.edu", role = c("aut")))
Description: This package provides several helpful wrappers for the already useful tswge package. In the future, this package intends to move away from the tswge backend, to be faster, with more readable source code.
Expand Down
27 changes: 21 additions & 6 deletions R/ModelBuildMultivariateVAR.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ ModelBuildMultivariateVAR = R6::R6Class(
#' @param data The dataframe containing the time series realizations (data should not contain time index)
#' @param var_interest The output variable of interest (dependent variable)
#' @param mdl_list A names list of all models (see format below)
#' @param alpha Significance level to use for filtering of variables from the recommendations
#' @param alpha Significance level to use for filtering of variables from the recommendations (Default = 0.05)
#' @param verbose How much to print during the model building and other processes (Default = 0)
#' @param ... Additional parameers to feed to VARSelect (if applicable) and VAR --> Most notably "exogen"
#' @return A new `ModelCompareMultivariateVAR` object.
Expand Down Expand Up @@ -96,15 +96,29 @@ ModelBuildMultivariateVAR = R6::R6Class(
#' (2) The names of the significant variables to use
#' (3) Lag value to use for the model
get_recommendations = function(){

# Check if any seasonality term is significant
season_data = self$summarize_build() %>%
dplyr::group_by(Model, trend_type, season) %>%
dplyr::filter(grepl("^sd[0-9]+$", OriginalVar)) %>%
dplyr::summarise(num_sig_season_factors = n()) %>%
dplyr::ungroup() %>%
dplyr::mutate(sig_season_factors = ifelse(num_sig_season_factors >= 1, TRUE, FALSE)) %>%
dplyr::select(-num_sig_season_factors)

results = self$summarize_build() %>%
dplyr::filter(!(OriginalVar %in% c("trend", "const"))) %>%
dplyr::filter(!(grepl("^s[0-9]+$", OriginalVar))) %>% # Remove seasonality factors
dplyr::filter(!(grepl("^sd[0-9]+$", OriginalVar))) %>% # Remove seasonality factors
dplyr::group_by(Model, trend_type, season) %>%
dplyr::summarise(num_sig_vars = dplyr::n(),
lag_to_use = -min(MaxLag, na.rm = TRUE),
vars_to_use = paste0(OriginalVar, collapse = ",")) %>%
dplyr::ungroup()

dplyr::ungroup() %>%
dplyr::left_join(season_data, by = c("Model", "trend_type", "season")) %>%
dplyr::mutate(season_to_use = ifelse(sig_season_factors == TRUE, season, 0)) %>%
tidyr::replace_na(list(season_to_use = 0)) %>%
dplyr::select(-sig_season_factors)

return(results)
},

Expand All @@ -125,7 +139,7 @@ ModelBuildMultivariateVAR = R6::R6Class(
cat("\n\n\n")
cat(paste("Model: ", name, "\n"))
trend_type = recommendations[[name]][['trend_type']]
season = recommendations[[name]][['season']]
season = recommendations[[name]][['season_to_use']]
if (season == 0){
season = NULL
}
Expand Down Expand Up @@ -223,7 +237,8 @@ ModelBuildMultivariateVAR = R6::R6Class(
mdl_subset = list()

for (name in mdl_subset_names){
mdl_subset[[name]] = private$models[[name]][['varfit']]
mdl_subset[[name]][['varfit']] = private$models[[name]][['varfit']]
mdl_subset[[name]][['sliding_ase']] = FALSE
}

return(mdl_subset)
Expand Down
11 changes: 6 additions & 5 deletions R/ModelCompareBase.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ ModelCompareBase = R6::R6Class(
stop("The model names above already exist in this comparison object. Please provide unique names.")
}

mdl_list = private$clean_model_input(mdl_list)
mdl_list = private$clean_model_input(mdl_list, batch_size)

if (all(is.na(private$models))){
private$models = mdl_list
Expand Down Expand Up @@ -162,8 +162,6 @@ ModelCompareBase = R6::R6Class(
private$models[[name]][['time_test_start']] = res$time_test_start
private$models[[name]][['time_test_end']] = res$time_test_end
private$models[[name]][['batch_num']] = res$batch_num
# private$models[[name]][['AICs']] = res$AICs
# private$models[[name]][['BICs']] = res$BICs
private$models[[name]][['f']] = res$f
private$models[[name]][['ll']] = res$ll
private$models[[name]][['ul']] = res$ul
Expand Down Expand Up @@ -472,6 +470,10 @@ ModelCompareBase = R6::R6Class(
private$data = data
},

get_data_subset = function(col_names){
stop("You are calling the 'get_data_subset' method in the parent class. This should be implemented in the child class.")
},

set_step_n.ahead = function(step_n.ahead){
private$step_n.ahead = step_n.ahead
},
Expand All @@ -490,7 +492,7 @@ ModelCompareBase = R6::R6Class(
return(private$verbose)
},

clean_model_input = function(mdl_list){
clean_model_input = function(mdl_list, batch_size){
# If the inputs are missing sliding_ase, make it FALSE
# Also add 'metric_has_been_computed' key
for (name in names(mdl_list)){
Expand All @@ -499,7 +501,6 @@ ModelCompareBase = R6::R6Class(
}

mdl_list[[name]][['metric_has_been_computed']] = FALSE

}

return(mdl_list)
Expand Down
139 changes: 52 additions & 87 deletions R/ModelCompareMultivariateVAR.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,19 @@ ModelCompareMultivariateVAR = R6::R6Class(
#' @description Returns the VAR model Build Summary
#' @returns A dataframe containing the following columns
#' 'Model': Name of the model
#' 'Selection': The selection criteria used for K value (AIC or BIC)
#' 'Trend': The trend argument used in the VARselect and VAR functions
#' 'Trend': The trend argument used in the VAR functions
#' 'Season' The season argument used in the VAR functions
#' 'SlidingASE': Whether Sliding ASE will be used for this model
#' 'Init_K': The K value recommended by the VARselect function
#' 'Final_K': The adjusted K value to take into account the smaller batch size (only when using sliding_ase)
summarize_build = function(){
results = dplyr::tribble(~Model, ~Selection, ~Trend, ~SlidingASE, ~Init_K, ~Final_K)
results = dplyr::tribble(~Model, ~Trend, ~Season, ~SlidingASE, ~Init_K, ~Final_K)

for (name in names(private$get_models())){
results = results %>%
dplyr::add_row(Model = name,
Selection = private$models[[name]][['select']],
Trend = private$models[[name]][['trend_type']],
Season = ifelse(is.null(private$models[[name]][['season']]), 0, private$models[[name]][['season']]),
SlidingASE = private$models[[name]][['sliding_ase']],
Init_K = private$models[[name]][['k_initial']],
Final_K = private$models[[name]][['k_final']]
Expand All @@ -102,22 +102,54 @@ ModelCompareMultivariateVAR = R6::R6Class(

set_var_interest = function(var_interest){private$var_interest = var_interest},

get_data_subset = function(col_names){
return(self$get_data() %>% dplyr::select(col_names))
},

get_len_x = function(){
return(nrow(self$get_data()))
},

clean_model_input = function(mdl_list){
clean_model_input = function(mdl_list, batch_size){

mdl_list = super$clean_model_input(mdl_list)

for (name in names(mdl_list)){
k = mdl_list[[name]]$varfit$p

mdl_list[[name]][['varfit_alldata']] = mdl_list[[name]]$varfit
mdl_list[[name]][['vars_to_use']] = colnames(mdl_list[[name]]$varfit$y)
mdl_list[[name]][['k_initial']] = k
mdl_list[[name]][['trend_type']] = mdl_list[[name]]$varfit$type
mdl_list[[name]][['season']] = mdl_list[[name]]$varfit$call$season

k_final = k
## If using sliding ASE, make sure that the batch size is large enough to support the number of lags
if (mdl_list[[name]][['sliding_ase']] == TRUE){
k_final = private$validate_k(k, batch_size,
season = mdl_list[[name]]$varfit$call$season,
col_names = colnames(mdl_list[[name]]$varfit$y))
}

mdl_list[[name]][['k_final']] = k_final

AIC = stats::AIC(mdl_list[[name]]$varfit)
BIC = stats::BIC(mdl_list[[name]]$varfit)

mdl_list[[name]][['AIC']] = AIC
mdl_list[[name]][['BIC']] = BIC

}

return(mdl_list)
},

get_sliding_ase_results = function(name, step_n.ahead){
res = sliding_ase_var(data = self$get_data(),
res = sliding_ase_var(data = private$get_data_subset(col_names = private$get_models()[[name]][['vars_to_use']]),
var_interest = self$get_var_interest(),
k = private$get_models()[[name]][['k_final']],
trend_type = private$get_models()[[name]][['trend_type']],
season = private$get_models()[[name]][['season']],
n.ahead = self$get_n.ahead(),
batch_size = private$get_models()[[name]][['batch_size']],
step_n.ahead = step_n.ahead, verbose = private$get_verbose())
Expand Down Expand Up @@ -172,96 +204,29 @@ ModelCompareMultivariateVAR = R6::R6Class(
},

build_models = function(verbose = 0){
for (name in names(private$get_models())){
cat("\n\n\n")
cat(paste("Model: ", name, "\n"))
trend_type = private$get_models()[[name]][['trend_type']]
cat(paste("Trend type: ", trend_type, "\n"))

varselect = vars::VARselect(self$get_data(),
lag.max = private$get_models()[[name]][['lag.max']],
type = trend_type,
season = NULL, exogen = NULL)

if (verbose >= 1){
cat("\nVARselect Object:\n")
print(varselect)
}

selection = private$extract_correct_varselection(varselect)

select = tolower(private$get_models()[[name]][['select']])
if (select == 'aic'){
k = selection[["AIC(n)"]]
}
else if (select == 'bic'){
k = selection[["SC(n)"]]
}
else{
stop("'select' argument must be with 'aic' or 'bic'")
}
cat(paste("Lag K to use for the VAR Model: ", k, "\n"))

k_final = k
## If using sliding ASE, make sure that the batch size is large enough to support the number of lags
if (private$get_models()[[name]][['sliding_ase']] == TRUE){
k_final = private$validate_k(k)
}

# Fit to Entire Data
# This might be needed in many places to computing it here.
varfit = vars::VAR(self$get_data(), p=k_final, type=trend_type)

if (verbose >= 1){
cat(paste0("\n\nPrinting summary of the VAR fit for the variable of interest: ", var_interest, "\n"))
print(summary(varfit$varresult[[var_interest]]))
}

## Inplace
private$models[[name]][['varselect_alldata']] = varselect
private$models[[name]][['k_initial']] = k
private$models[[name]][['k_final']] = k_final
private$models[[name]][['varfit_alldata']] = varfit

}

},

evaluate_xIC = function(){
for (name in names(private$get_models())){

varfit = private$models[[name]][['varfit_alldata']]

AIC = stats::AIC(varfit)
BIC = stats::BIC(varfit)

private$models[[name]][['AIC']] = AIC
private$models[[name]][['BIC']] = BIC
}

},

extract_correct_varselection = function(vselect){
criteria = vselect$criteria
validate_k = function(k, batch_size, season, col_names){
# https://stats.stackexchange.com/questions/234975/how-many-endogenous-variables-in-a-var-model-with-120-observations
## num_vars (in code) = K in the equation in link
## k (code) = p in the equation in link
t = batch_size - self$get_n.ahead()
num_vars = length(col_names)

if(sum(criteria == -Inf, na.rm = TRUE) > 0){
warning("VARselect produced -Inf values. These will be removed before making final 'K' selection.")

criteria[criteria == -Inf] = max(criteria) + 1
if (is.null(season)){
season = 1 # So we dont subtract anthign from the numerator
}

selection = data.frame(t(Rfast::rowMins(criteria)))
colnames(selection) = rownames(criteria)

selection = selection %>%
assertr::verify(assertr::has_all_names("AIC(n)", "HQ(n)", "SC(n)", "FPE(n)"))

return(selection)
},

validate_k = function(k){
new_k = k
num_vars = ncol(self$get_data())
if (k * num_vars >= (self$get_batch_size()-1)){
new_k = floor((self$get_batch_size()-1)/num_vars) - 1 # Being a little more conservative

# NOTE: I changed 1 to 2 since we can also have a case with trend and const instead of just constant.
if (k * (num_vars+1) + 2 > t){
new_k = floor((t-2-(season-1))/(num_vars + 1))
warning("Although the lag value k: ", k, " selected by VARselect will work for your full dataset, is too large for your batch size. Reducing k to allow Batch ASE calculations. New k: ", new_k, " If you do not want to reduce the k value, increase the batch size or make sliding_ase = FALSE for this model in the model list")
}
return((new_k))
Expand Down
2 changes: 1 addition & 1 deletion R/ModelCompareUnivariate.R
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ ModelCompareUnivariate = R6::R6Class(
return(length(self$get_data()))
},

clean_model_input = function(mdl_list){
clean_model_input = function(mdl_list, batch_size){
# If the inputs are missing p, d, q, or s values, this will add 0s to make it consistent
for (name in names(mdl_list)){
if (is.null(mdl_list[[name]][['phi']])){
Expand Down
19 changes: 7 additions & 12 deletions R/sliding_ase.R
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,12 @@ sliding_ase_univariate = function(x,
#' @param var_interest The output variable of interest (dependent variable)
#' @param k The lag value to use for the VAR model (generally determined by the VARselect function)
#' @param trend_type The trend type to use in VARselect and the VAR model. Refer to vars::VARselect and vars::VAR for valid options.
#' @param season The seasonality to use in the VAR model.
#' @param batch_size Window Size used
#' @param n.ahead last n.ahead data points in each batch will be used for prediction and ASE calculations
#' @param step_n.ahead Whether to step each batch by n.ahead values (Default = FALSE)
#' @param verbose How much to print during the model building and other processes (Default = 0)
#' @param ... Additional arguments to pass to the VAR model
#' @return Named list
#' 'ASEs' - ASE values
#' 'time_test_start' - Time Index indicating start of test time corresponding to the ASE values
Expand All @@ -150,9 +152,10 @@ sliding_ase_univariate = function(x,
#' 'time.forecasts' - Time Corresponding to each forecast, upper and lower limit values
#' @export
sliding_ase_var = function(data, var_interest,
k, trend_type = NA,
k, trend_type = NA, season = NULL,
n.ahead = NA, batch_size = NA, # Forecasting specific arguments
step_n.ahead = TRUE, verbose = 0
step_n.ahead = TRUE, verbose = 0,
...
)
{
# Sliding CV ... batches are mutually exclusive
Expand Down Expand Up @@ -198,8 +201,6 @@ sliding_ase_var = function(data, var_interest,
time_test_start = numeric(num_batches)
time_test_end = numeric(num_batches)
batch_num = numeric(num_batches)
# AICs = numeric(num_batches)
# BICs = numeric(num_batches)

for (i in 0:(num_batches-1))
{
Expand All @@ -225,7 +226,7 @@ sliding_ase_var = function(data, var_interest,
test_data = data[test_start:test_end, ]

# Fit model for the batch
varfit = vars::VAR(train_data, p=k, type=trend_type)
varfit = vars::VAR(train_data, p=k, type=trend_type, season = season, ...)

if (verbose >= 2){
cat(paste0("\nBatch: ", i+1, "\n"))
Expand All @@ -236,7 +237,7 @@ sliding_ase_var = function(data, var_interest,

# Forecast for the batch
forecasts = stats::predict(varfit, n.ahead=n.ahead)
forecasts = forecasts$fcst[[var_interest]] ## Get the forecasts only for the dependent variable
forecasts = forecasts[['fcst']][[var_interest]] ## Get the forecasts only for the dependent variable

ASEs[i+1] = mean((test_data[, var_interest] - forecasts[, 'fcst'])^2)
start = start + step_size
Expand All @@ -246,18 +247,12 @@ sliding_ase_var = function(data, var_interest,
forecasts.ul[test_start: test_end] = forecasts[, 'upper']
time.forecasts[test_start: test_end] = seq(test_start, test_end, 1)

# # Batch AIC and BIC
# AICs[i+1] = stats::AIC(varfit)
# BICs[i+1] = stats::BIC(varfit)

}

return(list(ASEs = ASEs,
time_test_start = time_test_start,
time_test_end = time_test_end,
batch_num = batch_num,
# AICs = AICs,
# BICs = BICs,
f = forecasts.f,
ll = forecasts.ll,
ul = forecasts.ul,
Expand Down
Binary file not shown.
Binary file not shown.
7 changes: 7 additions & 0 deletions inst/extdata/multivar_VAR_build_no_season_recommendations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Model,trend_type,season,num_sig_vars,lag_to_use,vars_to_use,season_to_use
AIC Both,both,0,2,3,"logGNP,logM1",0
AIC None,none,0,1,1,logGNP,0
AIC Trend,trend,0,3,1,"logGNP,rs,rl",0
BIC Both,both,0,1,1,logGNP,0
BIC None,none,0,1,1,logGNP,0
BIC Trend,trend,0,1,1,logGNP,0
File renamed without changes.
3 changes: 3 additions & 0 deletions inst/extdata/multivar_VAR_build_season_recommendations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"Model","trend_type","season","num_sig_vars","lag_to_use","vars_to_use","season_to_use"
"AIC Trend","trend",3,2,3,"logGNP,logM1",0
"BIC Trend","trend",4,2,1,"logGNP,rl",4
Loading

0 comments on commit dc49bc2

Please sign in to comment.