Merge pull request #19 from josephsdavid/additional-wrappers

reworked ModelCompareMultivariateVAR class to work with ModelBuildMul…
josephsdavid · Mar 29, 2020 · f2246b3 · f2246b3
2 parents ba651b7 + dc49bc2
commit f2246b3
Show file tree

Hide file tree

Showing 22 changed files with 683 additions and 303 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: tswgewrapped
 Title: Helpful wrappers for 'tswge' and 'vars' packages
-Version: 1.8.8
+Version: 1.8.9
 Authors@R: c(person("David", "Josephs", email = "josephsd@smu.edu", role = c("aut", "cre")),
              person("Nikhil", "Gupta", email = "guptan@smu.edu", role = c("aut")))
 Description: This package provides several helpful wrappers for the already useful tswge package. In the future, this package intends to move away from the tswge backend, to be faster, with more readable source code.

diff --git a/R/ModelBuildMultivariateVAR.R b/R/ModelBuildMultivariateVAR.R
@@ -16,7 +16,7 @@ ModelBuildMultivariateVAR = R6::R6Class(
     #' @param data The dataframe containing the time series realizations (data should not contain time index)
     #' @param var_interest The output variable of interest (dependent variable)
     #' @param mdl_list A names list of all models (see format below)
-    #' @param alpha Significance level to use for filtering of variables from the recommendations
+    #' @param alpha Significance level to use for filtering of variables from the recommendations (Default = 0.05)
     #' @param verbose How much to print during the model building and other processes (Default = 0)
     #' @param ... Additional parameers to feed to VARSelect (if applicable) and VAR --> Most notably "exogen"
     #' @return A new `ModelCompareMultivariateVAR` object.
@@ -96,15 +96,29 @@ ModelBuildMultivariateVAR = R6::R6Class(
     #' (2) The names of the significant variables to use
     #' (3) Lag value to use for the model
     get_recommendations = function(){
+
+      # Check if any seasonality term is significant
+      season_data = self$summarize_build() %>% 
+        dplyr::group_by(Model, trend_type, season) %>% 
+        dplyr::filter(grepl("^sd[0-9]+$", OriginalVar)) %>% 
+        dplyr::summarise(num_sig_season_factors = n()) %>% 
+        dplyr::ungroup() %>% 
+        dplyr::mutate(sig_season_factors = ifelse(num_sig_season_factors >= 1, TRUE, FALSE)) %>% 
+        dplyr::select(-num_sig_season_factors)
+
       results = self$summarize_build() %>% 
         dplyr::filter(!(OriginalVar %in% c("trend", "const"))) %>% 
-        dplyr::filter(!(grepl("^s[0-9]+$", OriginalVar))) %>% # Remove seasonality factors
+        dplyr::filter(!(grepl("^sd[0-9]+$", OriginalVar))) %>% # Remove seasonality factors
         dplyr::group_by(Model, trend_type, season) %>% 
         dplyr::summarise(num_sig_vars = dplyr::n(),
                          lag_to_use = -min(MaxLag, na.rm = TRUE),
                          vars_to_use =  paste0(OriginalVar, collapse = ",")) %>%
-        dplyr::ungroup()                   
-
+        dplyr::ungroup() %>% 
+        dplyr::left_join(season_data, by = c("Model", "trend_type", "season")) %>% 
+        dplyr::mutate(season_to_use = ifelse(sig_season_factors == TRUE, season, 0)) %>% 
+        tidyr::replace_na(list(season_to_use = 0)) %>% 
+        dplyr::select(-sig_season_factors)
+
       return(results)
     },
 
@@ -125,7 +139,7 @@ ModelBuildMultivariateVAR = R6::R6Class(
           cat("\n\n\n")
           cat(paste("Model: ", name, "\n"))
           trend_type = recommendations[[name]][['trend_type']]
-          season = recommendations[[name]][['season']]
+          season = recommendations[[name]][['season_to_use']]
           if (season == 0){
             season = NULL
           }
@@ -223,7 +237,8 @@ ModelBuildMultivariateVAR = R6::R6Class(
       mdl_subset = list()
 
       for (name in mdl_subset_names){
-        mdl_subset[[name]] = private$models[[name]][['varfit']]
+        mdl_subset[[name]][['varfit']] = private$models[[name]][['varfit']]
+        mdl_subset[[name]][['sliding_ase']] = FALSE
       }
 
       return(mdl_subset)

diff --git a/R/ModelCompareBase.R b/R/ModelCompareBase.R
@@ -93,7 +93,7 @@ ModelCompareBase = R6::R6Class(
         stop("The model names above already exist in this comparison object. Please provide unique names.")
       }
 
-      mdl_list = private$clean_model_input(mdl_list)
+      mdl_list = private$clean_model_input(mdl_list, batch_size)
 
       if (all(is.na(private$models))){
         private$models = mdl_list
@@ -162,8 +162,6 @@ ModelCompareBase = R6::R6Class(
           private$models[[name]][['time_test_start']] = res$time_test_start
           private$models[[name]][['time_test_end']] = res$time_test_end
           private$models[[name]][['batch_num']] = res$batch_num
-          # private$models[[name]][['AICs']] = res$AICs
-          # private$models[[name]][['BICs']] = res$BICs
           private$models[[name]][['f']] = res$f
           private$models[[name]][['ll']] = res$ll
           private$models[[name]][['ul']] = res$ul
@@ -472,6 +470,10 @@ ModelCompareBase = R6::R6Class(
       private$data = data
     },
 
+    get_data_subset = function(col_names){
+      stop("You are calling the 'get_data_subset' method in the parent class. This should be implemented in the child class.")
+    },
+
     set_step_n.ahead = function(step_n.ahead){
       private$step_n.ahead = step_n.ahead
     },
@@ -490,7 +492,7 @@ ModelCompareBase = R6::R6Class(
       return(private$verbose)
     },
 
-    clean_model_input = function(mdl_list){
+    clean_model_input = function(mdl_list, batch_size){
       # If the inputs are missing sliding_ase, make it FALSE
       # Also add 'metric_has_been_computed' key
       for (name in names(mdl_list)){
@@ -499,7 +501,6 @@ ModelCompareBase = R6::R6Class(
         }
 
         mdl_list[[name]][['metric_has_been_computed']] = FALSE
-
       }
 
       return(mdl_list)

diff --git a/R/ModelCompareMultivariateVAR.R b/R/ModelCompareMultivariateVAR.R
@@ -70,19 +70,19 @@ ModelCompareMultivariateVAR = R6::R6Class(
     #' @description Returns the VAR model Build Summary
     #' @returns A dataframe containing the following columns
     #'          'Model': Name of the model
-    #'          'Selection': The selection criteria used for K value (AIC or BIC)
-    #'          'Trend': The trend argument used in the VARselect and VAR functions
+    #'          'Trend': The trend argument used in the VAR functions
+    #'          'Season' The season argument used in the VAR functions
     #'          'SlidingASE': Whether Sliding ASE will be used for this model
     #'          'Init_K': The K value recommended by the VARselect function
     #'          'Final_K': The adjusted K value to take into account the smaller batch size (only when using sliding_ase)
     summarize_build = function(){
-      results = dplyr::tribble(~Model, ~Selection, ~Trend, ~SlidingASE, ~Init_K, ~Final_K)
+      results = dplyr::tribble(~Model, ~Trend, ~Season, ~SlidingASE, ~Init_K, ~Final_K)
 
       for (name in names(private$get_models())){
         results = results %>% 
           dplyr::add_row(Model = name,
-                         Selection = private$models[[name]][['select']],
                          Trend = private$models[[name]][['trend_type']],
+                         Season = ifelse(is.null(private$models[[name]][['season']]), 0, private$models[[name]][['season']]),
                          SlidingASE = private$models[[name]][['sliding_ase']],
                          Init_K = private$models[[name]][['k_initial']],
                          Final_K = private$models[[name]][['k_final']]
@@ -102,22 +102,54 @@ ModelCompareMultivariateVAR = R6::R6Class(
 
     set_var_interest = function(var_interest){private$var_interest = var_interest},
 
+    get_data_subset = function(col_names){
+      return(self$get_data() %>% dplyr::select(col_names))
+    },
+
     get_len_x = function(){
       return(nrow(self$get_data()))
     },
 
-    clean_model_input = function(mdl_list){
+    clean_model_input = function(mdl_list, batch_size){
 
       mdl_list = super$clean_model_input(mdl_list)
 
+      for (name in names(mdl_list)){
+        k = mdl_list[[name]]$varfit$p
+
+        mdl_list[[name]][['varfit_alldata']] = mdl_list[[name]]$varfit
+        mdl_list[[name]][['vars_to_use']] = colnames(mdl_list[[name]]$varfit$y)
+        mdl_list[[name]][['k_initial']] = k
+        mdl_list[[name]][['trend_type']] = mdl_list[[name]]$varfit$type
+        mdl_list[[name]][['season']] = mdl_list[[name]]$varfit$call$season
+
+        k_final = k
+        ## If using sliding ASE, make sure that the batch size is large enough to support the number of lags
+        if (mdl_list[[name]][['sliding_ase']] == TRUE){
+          k_final = private$validate_k(k, batch_size,
+                                       season = mdl_list[[name]]$varfit$call$season,
+                                       col_names = colnames(mdl_list[[name]]$varfit$y))
+        }
+
+        mdl_list[[name]][['k_final']] = k_final
+
+        AIC = stats::AIC(mdl_list[[name]]$varfit)
+        BIC = stats::BIC(mdl_list[[name]]$varfit)
+
+        mdl_list[[name]][['AIC']] = AIC
+        mdl_list[[name]][['BIC']] = BIC
+
+      }
+
       return(mdl_list)
     },
 
     get_sliding_ase_results = function(name, step_n.ahead){
-      res = sliding_ase_var(data = self$get_data(),
+      res = sliding_ase_var(data = private$get_data_subset(col_names = private$get_models()[[name]][['vars_to_use']]),
                             var_interest = self$get_var_interest(),
                             k = private$get_models()[[name]][['k_final']],
                             trend_type = private$get_models()[[name]][['trend_type']],
+                            season = private$get_models()[[name]][['season']],
                             n.ahead = self$get_n.ahead(),
                             batch_size = private$get_models()[[name]][['batch_size']],
                             step_n.ahead = step_n.ahead, verbose = private$get_verbose())
@@ -172,96 +204,29 @@ ModelCompareMultivariateVAR = R6::R6Class(
     },
 
     build_models  = function(verbose = 0){
-      for (name in names(private$get_models())){
-        cat("\n\n\n")
-        cat(paste("Model: ", name, "\n"))
-        trend_type = private$get_models()[[name]][['trend_type']]
-        cat(paste("Trend type: ", trend_type, "\n"))
-
-        varselect = vars::VARselect(self$get_data(),
-                                    lag.max = private$get_models()[[name]][['lag.max']],
-                                    type = trend_type,
-                                    season = NULL, exogen = NULL)
-
-        if (verbose >= 1){
-          cat("\nVARselect Object:\n")
-          print(varselect) 
-        }
-
-        selection = private$extract_correct_varselection(varselect)
-
-        select = tolower(private$get_models()[[name]][['select']])
-        if (select == 'aic'){
-          k = selection[["AIC(n)"]]
-        }
-        else if (select == 'bic'){
-          k = selection[["SC(n)"]]
-        }
-        else{
-          stop("'select' argument must be with 'aic' or 'bic'")
-        }
-        cat(paste("Lag K to use for the VAR Model: ", k, "\n")) 
-
-        k_final = k
-        ## If using sliding ASE, make sure that the batch size is large enough to support the number of lags
-        if (private$get_models()[[name]][['sliding_ase']] == TRUE){
-          k_final = private$validate_k(k)
-        }
-
-        # Fit to Entire Data
-        # This might be needed in many places to computing it here.
-        varfit = vars::VAR(self$get_data(), p=k_final, type=trend_type)
-
-        if (verbose >= 1){
-          cat(paste0("\n\nPrinting summary of the VAR fit for the variable of interest: ", var_interest, "\n"))
-          print(summary(varfit$varresult[[var_interest]]))
-        }
-
-        ## Inplace
-        private$models[[name]][['varselect_alldata']] = varselect
-        private$models[[name]][['k_initial']] = k
-        private$models[[name]][['k_final']] = k_final
-        private$models[[name]][['varfit_alldata']] = varfit
-
-      }
+
     },
 
     evaluate_xIC = function(){
-      for (name in names(private$get_models())){
-
-        varfit = private$models[[name]][['varfit_alldata']]
-
-        AIC = stats::AIC(varfit)
-        BIC = stats::BIC(varfit)
-
-        private$models[[name]][['AIC']] = AIC
-        private$models[[name]][['BIC']] = BIC
-      }  
+
     },
 
-    extract_correct_varselection = function(vselect){
-      criteria = vselect$criteria
+    validate_k = function(k, batch_size, season, col_names){
+      # https://stats.stackexchange.com/questions/234975/how-many-endogenous-variables-in-a-var-model-with-120-observations
+      ## num_vars (in code) = K in the equation in link
+      ## k (code) = p in the equation in link
+      t =  batch_size - self$get_n.ahead()
+      num_vars = length(col_names)  
 
-      if(sum(criteria == -Inf, na.rm = TRUE) > 0){
-        warning("VARselect produced -Inf values. These will be removed before making final 'K' selection.")
-
-        criteria[criteria == -Inf] = max(criteria) + 1
+      if (is.null(season)){
+        season = 1  # So we dont subtract anthign from the numerator
       }
 
-      selection = data.frame(t(Rfast::rowMins(criteria)))
-      colnames(selection) = rownames(criteria)
-
-      selection = selection %>% 
-        assertr::verify(assertr::has_all_names("AIC(n)", "HQ(n)", "SC(n)", "FPE(n)"))
-
-      return(selection)
-    },
-
-    validate_k = function(k){
       new_k = k
-      num_vars = ncol(self$get_data())
-      if (k * num_vars >= (self$get_batch_size()-1)){
-        new_k = floor((self$get_batch_size()-1)/num_vars) - 1 # Being a little more conservative 
+
+      # NOTE: I changed 1 to 2 since we can also have a case with trend and const instead of just constant.
+      if (k * (num_vars+1) + 2 > t){
+        new_k =  floor((t-2-(season-1))/(num_vars + 1))
         warning("Although the lag value k: ", k, " selected by VARselect will work for your full dataset, is too large for your batch size. Reducing k to allow Batch ASE calculations. New k: ", new_k, " If you do not want to reduce the k value, increase the batch size or make sliding_ase = FALSE for this model in the model list")
       }
       return((new_k))

diff --git a/R/ModelCompareUnivariate.R b/R/ModelCompareUnivariate.R
@@ -211,7 +211,7 @@ ModelCompareUnivariate = R6::R6Class(
       return(length(self$get_data()))
     },
 
-    clean_model_input = function(mdl_list){
+    clean_model_input = function(mdl_list, batch_size){
       # If the inputs are missing p, d, q, or s values, this will add 0s to make it consistent
       for (name in names(mdl_list)){
         if (is.null(mdl_list[[name]][['phi']])){

diff --git a/R/sliding_ase.R b/R/sliding_ase.R
@@ -133,10 +133,12 @@ sliding_ase_univariate = function(x,
 #' @param var_interest The output variable of interest (dependent variable)
 #' @param k The lag value to use for the VAR model (generally determined by the VARselect function)
 #' @param trend_type The trend type to use in VARselect and the VAR model. Refer to vars::VARselect and vars::VAR for valid options.
+#' @param season The seasonality to use in the VAR model.
 #' @param batch_size Window Size used 
 #' @param n.ahead last n.ahead data points in each batch will be used for prediction and ASE calculations
 #' @param step_n.ahead Whether to step each batch by n.ahead values (Default = FALSE)
 #' @param verbose How much to print during the model building and other processes (Default = 0)
+#' @param ... Additional arguments to pass to the VAR model
 #' @return Named list 
 #'         'ASEs' - ASE values
 #'         'time_test_start' - Time Index indicating start of test time corresponding to the ASE values
@@ -150,9 +152,10 @@ sliding_ase_univariate = function(x,
 #'         'time.forecasts' - Time Corresponding to each forecast, upper and lower limit values
 #' @export
 sliding_ase_var = function(data, var_interest,
-                           k, trend_type = NA,
+                           k, trend_type = NA, season = NULL,
                            n.ahead = NA, batch_size = NA,    # Forecasting specific arguments
-                           step_n.ahead = TRUE, verbose = 0
+                           step_n.ahead = TRUE, verbose = 0,
+                           ...
                            )                              
 {
   # Sliding CV ... batches are mutually exclusive
@@ -198,8 +201,6 @@ sliding_ase_var = function(data, var_interest,
   time_test_start = numeric(num_batches)
   time_test_end = numeric(num_batches)
   batch_num = numeric(num_batches)
-  # AICs = numeric(num_batches)
-  # BICs = numeric(num_batches)
 
   for (i in 0:(num_batches-1))
   {
@@ -225,7 +226,7 @@ sliding_ase_var = function(data, var_interest,
     test_data = data[test_start:test_end, ]
 
     # Fit model for the batch
-    varfit = vars::VAR(train_data, p=k, type=trend_type)
+    varfit = vars::VAR(train_data, p=k, type=trend_type, season = season, ...)
 
     if (verbose >= 2){
       cat(paste0("\nBatch: ", i+1, "\n"))
@@ -236,7 +237,7 @@ sliding_ase_var = function(data, var_interest,
 
     # Forecast for the batch
     forecasts = stats::predict(varfit, n.ahead=n.ahead)
-    forecasts = forecasts$fcst[[var_interest]] ## Get the forecasts only for the dependent variable
+    forecasts = forecasts[['fcst']][[var_interest]] ## Get the forecasts only for the dependent variable
 
     ASEs[i+1] = mean((test_data[, var_interest] - forecasts[, 'fcst'])^2)
     start = start + step_size
@@ -246,18 +247,12 @@ sliding_ase_var = function(data, var_interest,
     forecasts.ul[test_start: test_end] = forecasts[, 'upper']
     time.forecasts[test_start: test_end] = seq(test_start, test_end, 1)
 
-    # # Batch AIC and BIC
-    # AICs[i+1] = stats::AIC(varfit)
-    # BICs[i+1] = stats::BIC(varfit)
-
   }
 
   return(list(ASEs = ASEs,
               time_test_start = time_test_start,
               time_test_end = time_test_end,
               batch_num = batch_num,
-              # AICs = AICs,
-              # BICs = BICs,
               f = forecasts.f,
               ll = forecasts.ll,
               ul = forecasts.ul,

diff --git a/inst/extdata/multivar_VAR_build_final_model_no_season_list.rds b/inst/extdata/multivar_VAR_build_final_model_no_season_list.rds
diff --git a/inst/extdata/multivar_VAR_build_final_model_season_list.rds b/inst/extdata/multivar_VAR_build_final_model_season_list.rds
diff --git a/inst/extdata/multivar_VAR_build_no_season_recommendations.csv b/inst/extdata/multivar_VAR_build_no_season_recommendations.csv
@@ -0,0 +1,7 @@
+Model,trend_type,season,num_sig_vars,lag_to_use,vars_to_use,season_to_use
+AIC Both,both,0,2,3,"logGNP,logM1",0
+AIC None,none,0,1,1,logGNP,0
+AIC Trend,trend,0,3,1,"logGNP,rs,rl",0
+BIC Both,both,0,1,1,logGNP,0
+BIC None,none,0,1,1,logGNP,0
+BIC Trend,trend,0,1,1,logGNP,0
diff --git a/inst/extdata/multivar_VAR_summary.csv → .../multivar_VAR_build_no_season_summary.csv b/inst/extdata/multivar_VAR_summary.csv → .../multivar_VAR_build_no_season_summary.csv
diff --git a/inst/extdata/multivar_VAR_build_season_recommendations.csv b/inst/extdata/multivar_VAR_build_season_recommendations.csv
@@ -0,0 +1,3 @@
+"Model","trend_type","season","num_sig_vars","lag_to_use","vars_to_use","season_to_use"
+"AIC Trend","trend",3,2,3,"logGNP,logM1",0
+"BIC Trend","trend",4,2,1,"logGNP,rl",4