new documentation for helper functions

chancejohnstone · May 8, 2020 · 89b7124 · 89b7124
1 parent 213beee
commit 89b7124
Show file tree

Hide file tree

Showing 30 changed files with 692 additions and 217 deletions.
diff --git a/.Rhistory b/.Rhistory
@@ -1,92 +1,3 @@
-abline(a = 0, b = 1)
-plot(x = coverage[[3]], y = alpha_vec, pch = 20, ylim = c(0,1), xlim = c(0,1))
-abline(a = 0, b = 1)
-library(rfinterval)
-?rfinterval
-library(devtools)
-document()
-setwd("~/piRF")
-document()
-setwd("..")
-install("piRF")
-library(piRF)
-library(rcdk)
-library(piRF)
-#physical checmistry MoleculeNet datasets
-lipo <- read.csv("C:/Users/thechanceyman/Documents/Research/Data/Drug Design/lipophilicity/Lipophilicity.csv")
-esol <- read.csv("C:/Users/thechanceyman/Documents/Research/Data/Drug Design/ESOL/esol.csv")
-freesolv <- read.csv("C:/Users/thechanceyman/Documents/Research/Data/Drug Design/FreeSolv/freesolv.csv")
-#combine data
-all_data <- list(lipo, freesolv, esol)
-names(all_data[[1]])[2] <- "resp"
-names(all_data[[3]])[9] <- "resp"
-names(all_data[[2]])[4] <- "resp"
-#type <- c("lower", "lower", "upper")
-type <- c("two-sided", "two-sided", "two-sided")
-res <- cov_color <- test <- list()
-#par(mfrow = c(3,1))
-for(i in 1:length(all_data)){
-bit_mat <- convert_smiles(all_data[[i]]$smiles)
-#data frame form
-data <- cbind(all_data[[i]]$resp, bit_mat)
-names(data)[1] <- "resp"
-set.seed(2020)
-n <- nrow(data)
-ratio <- .975
-samp <- sample(1:n, ratio*n)
-train <- data[samp,]
-test[[i]] <- data[-samp,]
-res[[i]] <- rfint(resp~., train_data = train, test_data = test[[i]], concise = FALSE, interval_type = type[i])
-cov_color[[i]] <- (test[[i]]$resp <= res[[i]]$int[[1]][,2])*(test[[i]]$resp>= res[[i]]$int[[1]][,1])
-}
-for(i in 1:length(all_data)){
-#changing to color
-range_pred <- range(res[[i]]$int)
-range_true <- range(test[[i]]$resp)
-lim <- .5
-cov_color[[i]][cov_color[[i]] == 1] <- "black"
-cov_color[[i]][cov_color[[i]] == 0] <- "red"
-p_color <- rep("black", times = nrow(test[[i]]))
-p_color[res[[i]]$int[[1]][,1] > lim] <- "skyblue"
-#segments
-#plotting intervals vs true
-plot(x = res[[i]]$preds[[1]], y = test[[i]]$resp, pch = 20,
-ylab = "true", xlab = "predicted", ylim = range_true, xlim = range_pred)
-abline(a = 0, b = 1)
-segments(x0 = res[[i]]$int[[1]][,1], x1 = res[[i]]$int[[1]][,2],
-y1 = test[[i]]$resp, y0 = test[[i]]$resp,
-col = cov_color[[i]], lwd = .5)
-}
-#points(x = res$preds[[1]], y = test$lipo, pch = 20, col = p_color)
-#one sided for freesolv data
-alpha_vec <- seq(.01,.99,length.out = 33)
-int <- rep(0, times = 2*length(alpha_vec))
-dim(int) <- c(length(alpha_vec), 2)
-track <- 1
-k <- 5
-for(alpha in alpha_vec){
-test2 <- data[k,]
-train2 <- data[-k,]
-int[track,] <- unlist(rfint(resp~., train_data = train2, test_data = test2,
-interval_type = "upper", alpha = alpha)$int$Zhang)
-track <- track + 1
-}
-int[,1] <- -100
-plot(x = int[,2], y = 1-alpha_vec, pch = 20,
-ylab = expression((1-alpha)), xlab = "upper one-sided interval", ylim = c(0,1), xlim = c(-3,3))
-segments(x0 = int[,1], x1 = int[,2],
-y1 = 1-alpha_vec, y0 = 1-alpha_vec,
-lwd = .5)
-abline(v = -.5, lty = 2, col = "red")
-#checking calibration of probabilities...
-#insert code here...
-#one sided for freesolv data
-alpha_vec <- seq(.01,.48,length.out = 33)
-k <- 1:100
-method_vec <- c("Zhang", "quantile", "Romano")
-int <- vector(mode = "list", length = length(method_vec))
-for(m in 1:length(int)){
-int[[m]] <- matrix(0, nrow = length(k), ncol = length(alpha_vec))
 }
 track <- 1
 for(alpha in alpha_vec){
@@ -510,3 +421,92 @@ abline(a = 0, b = 1)
 segments(x0 = res$int[[i]][,1], x1 = res$int[[i]][,2],
 y1 = test$pressure, y0 = test$pressure, lwd = 1, col = col)
 }
+par.get()
+o <- par()
+par(o)
+opar <- par()
+opar
+par(opar)
+opar <- par()$mfrow
+par(mfrow = opar)
+opar <- par(mfrow = c(1,1))
+opar
+par(opar)
+opar <- par(mfrow = c(12,2))
+opar
+par(opar)
+par()
+par()$mfrow
+opar <- par(mfrow = c(12,2))
+par()$mfrow
+library(devtools)
+check()
+opar <- par(mfrow = c(2,17))
+par
+par()$mfrow
+par(opar)
+par()$mfrow
+par(mfrow = c(1,1))
+opar <- par(mfrow = c(2,17))
+par()$mfrow
+par(opar)
+par()$mfrow
+release()
+shiny::runApp('~/Trivia')
+runApp('~/Trivia')
+runApp('~/Trivia')
+runApp('~/Trivia')
+runApp('~/Trivia')
+library(rsconnect)
+deployApp()
+getwd()
+setwd("~/")
+getwd()
+setwd("~/Trivia")
+getwd()
+ls
+ls(0)
+getwd()
+deployApp()
+runApp()
+deployApp()
+key <- "1oDgk4uSTEJgNQu9HCkrmIltZOk_Bi1eQFdceH2qmU8Y"
+#trivia_sheet <- read_sheet(key)
+fieldNames <- c("team_name",
+"round",
+"q1",
+"q2",
+"q3",
+"q4",
+"q5",
+"q6",
+"q7",
+"q8",
+"q9",
+"q10")
+runApp()
+logical(1)
+logical(0)
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+runApp()
+deployApp()
+library(devtools)
+document()
+document()
+setwd("..")
+install("piRF")
+setwd("~/piRF")
+check()
+document()
+?rfint
+document()
+setwd("..")
+install("piRF")
diff --git a/CRAN-RELEASE b/CRAN-RELEASE
@@ -1,2 +1,2 @@
-This package was submitted to CRAN on 2020-04-28.
-Once it is accepted, delete this file and tag the release (commit a409810de7).
+This package was submitted to CRAN on 2020-05-02.
+Once it is accepted, delete this file and tag the release (commit 213beee10d).
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -26,15 +26,13 @@ Description: Implements multiple state-of-the-art prediction interval methodolog
 	original random forest methodology and novel prediction interval methodologies. All of these 
 	methodologies can be utilized using solely this package, rather than a collection of separate 
 	packages. Currently, only regression trees are supported. Also capable of handling high dimensional data. 
-	Breiman, Leo (2001) <doi:10.1023/A:1010933404324>.
 	Roy, Marie-Helene and Larocque, Denis (2019) <doi:10.1177/0962280219829885>.
 	Ghosal, Indrayudh and Hooker, Giles (2018) <arXiv:1803.08000>.
 	Zhu, Lin and Lu, Jiaxin and Chen, Yihong (2019) <arXiv:1905.10101>.
 	Zhang, Haozhe and Zimmerman, Joshua and Nettleton, Dan and Nordman, Daniel J. (2019) <doi:10.1080/00031305.2019.1585288>.
 	Meinshausen, Nicolai (2006) <http://www.jmlr.org/papers/volume7/meinshausen06a/meinshausen06a.pdf>.
 	Romano, Yaniv and Patterson, Evan and Candes, Emmanuel (2019) <arXiv:1905.03222>.
 	Tung, Nguyen Thanh and Huang, Joshua Zhexue and Nguyen, Thuy Thi and Khan, Imran (2014) <doi:10.13140/2.1.2500.8002>.
-	Lopez, Roberto and Balsa-Canto, E. and Onate, E. (2008) <doi:10.1002/nme.2304>.
 License: GPL-3
 Encoding: UTF-8
 Depends:

diff --git a/R/Ghosal_Hooker_2018.R b/R/Ghosal_Hooker_2018.R
@@ -12,15 +12,10 @@
 ## Email: cjohnsto@iastate.edu
 ##
 ## ---------------------------
-##
-## Notes:
-## multiple boosts functional; testing needs to occur; seems to be an issue with variance estimates
-## variant 2 implemented
-## --------------------------
 
-#' implements RF prediction interval method in Ghosal, Hooker 2018
+#' Implements RF prediction interval method in Ghosal, Hooker 2018. Helper function.
 #'
-#' This function implements variant one of the prediction interval methods in Ghosal, Hooker 2018.
+#' This function implements variant one and two of the prediction interval methods in Ghosal, Hooker 2018. Used inside rfint().
 #' @param formula Object of class formula or character describing the model to fit. Interaction terms supported only for numerical variables.
 #' @param train_data Training data of class data.frame, matrix, dgCMatrix (Matrix) or gwaa.data (GenABEL). Matches ranger() requirements.
 #' @param pred_data Test data of class data.frame, matrix, dgCMatrix (Matrix) or gwaa.data (GenABEL). Utilizes ranger::predict() to get prediction intervals for test data.
@@ -34,14 +29,9 @@
 #' @param variant Choose which variant to use. Currently variant 2 not implemented.
 #' @param num_stages Number of boosting stages. Functional for >= 2; variance estimates need adjustment for variant 2.
 #' @param num_threads The number of threads to use in parallel. Default is the current number of cores.
+#' @param interval_type Type of prediction interval to generate.
+#' Options are \code{method = c("two-sided", "lower", "upper")}. Default is  \code{method = "two-sided"}.
 #' @keywords internal
-#' @examples
-#' GhosalBoostRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = 500,
-#' min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
-#' intervals = FALSE, alpha = NULL, forest_type = "RF",
-#' replace = TRUE, prop = 1, variant = 1,
-#' num_threads = num_threads)
-#' @noRd
 GhosalBoostRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = NULL,
                           min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
                           intervals = FALSE, alpha = NULL, prop = NULL, variant = 1,
@@ -76,17 +66,10 @@ GhosalBoostRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, n
                      interval_type = interval_type)
 }
 
-#' generate stage 1 RF for Ghosal, Hooker RF implementation
+#' Generates stage 1 RF for Ghosal, Hooker RF implementation. Helper function.
 #'
-#' This function is primarily meant to be used within the GhosalBoostRF function. All parameters are same as in GhosalBoostRF().
+#' This function is primarily meant to be used within GhosalBoostRF().
 #' @keywords internal
-#' @examples
-#' genCombRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = num_trees,
-#' min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
-#' intervals = TRUE,
-#' alpha = alpha, forest_type = "RF", importance = "none" , weights = NULL,
-#' replace = replace, prop = prop, inbag = NULL, num_threads = num_threads)
-#' @noRd
 genCombRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = num_trees,
                   min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
                   intervals = TRUE,
@@ -103,12 +86,10 @@ genCombRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_t
 
 }
 
-#' generate stage 2 (and more) RF for Ghosal, Hooker RF implementation
+#' Generates stage 2 (and more) RF for Ghosal, Hooker RF implementation. Helper function.
 #'
-#' This function is primarily meant to be used within the GhosalBoostRF() function. All parameters are same as in GhosalBoostRF().
+#' Used within GhosalBoostRF().
 #' @keywords internal
-#' @noRd
-#boosting function
 boostStage <- function(rf, formula = NULL, train_data = NULL, pred_data = NULL, num_trees = num_trees,
                        min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
                        intervals = TRUE, alpha = alpha, weights = NULL, num_stages = 2,
@@ -187,24 +168,11 @@ boostStage <- function(rf, formula = NULL, train_data = NULL, pred_data = NULL,
               inbag = rf$inbag.counts))
 }
 
-#' generate prediction intervals for Ghosal, Hooker 2018 implementation.
+#' Generate prediction intervals for Ghosal, Hooker 2018 implementation. Helper function.
 #'
-#' This function is primarily meant to be used within the GhosalBoostRF() function. All parameters are same as in GhosalBoostRF().
-#' @param love Do you love cats? Defaults to TRUE.
+#' This function is primarily meant to be used within GhosalBoostRF().
 #' @keywords internal
-#' @examples
-#' GHVar <- function(boostRF, train_data, pred_data, variant, dep, alpha, num_threads = num_threads)
-#' @noRd
-#get variance estimate
 GHVar <- function(boostRF, train_data, pred_data, variant, dep, alpha, num_threads = num_threads, interval_type = interval_type){
-  #add variance estimate procedure for variant 2; requires estimates, and inbag for each stage...
-
-  #one sided intervals
-  #if(interval_type == "two-sided"){
-  #  alpha <- alpha
-  #} else {
-  #  alpha <- alpha*2
-  #}
 
   #one sided intervals
   if(interval_type == "two-sided"){
@@ -218,8 +186,6 @@ GHVar <- function(boostRF, train_data, pred_data, variant, dep, alpha, num_threa
     alpha2 <- 1
   }
 
-
-
   #includes original rf
   num_stages <- length(boostRF$boostrf)
 
@@ -232,11 +198,10 @@ GHVar <- function(boostRF, train_data, pred_data, variant, dep, alpha, num_threa
   cov_est <- rep(0, times = pred_n)
 
   #needs to get predictions from boostRF
-  #maybe call this something different
   tree_preds <- boostRF$tree_preds
   num_trees <- ncol(tree_preds)
 
-  #test this; dont know which one is correct...
+  #further testing needed
   in_bag <- unlist(boostRF$inbag)
   dim(in_bag) <- c(dim(train_data)[1], num_trees)
   in_bag <- in_bag >= 1

diff --git a/R/HDI_quantregforest.R b/R/HDI_quantregforest.R
@@ -5,7 +5,7 @@
 # Date First Created: 2019-09-13
 # Reference:  Zhu, Lin, Jiaxin Lu, and Yihong Chen. "HDI-Forest: Highest Density Interval Regression Forest." arXiv preprint arXiv:1905.10101 (2019).
 
-#' implements HDI RF prediction interval method in ...
+#' Implements HDI RF prediction interval method in Zhu 2019. Helper function.
 #'
 #' This function implements an HDI RF prediction interval method.
 #' @param formula Object of class formula or character describing the model to fit. Interaction terms supported only for numerical variables.
@@ -19,11 +19,6 @@
 #' @param replace Sample with replacement, or not. Utilized for the two different variants outlined in Ghosal, Hooker 2018. Currently variant 2 not implemented.
 #' @param num_threads The number of threads to use in parallel. Default is the current number of cores.
 #' @keywords internal
-#' @examples
-#' HDI_quantregforest <- function(formula = NULL, train_data = NULL, test_data = NULL, alpha = NULL,
-#' num_tree = NULL, mtry = NULL, min_node_size = NULL, max_depth = NULL, replace = TRUE, verbose = FALSE,
-#' num_threads = NULL)
-#' @noRd
 HDI_quantregforest <- function(formula = NULL,
                                train_data = NULL,
                                test_data = NULL,

diff --git a/R/Romano_Patterson_Candes_2018.R b/R/Romano_Patterson_Candes_2018.R
@@ -18,10 +18,9 @@
 ##
 ## --------------------------
 
-#' implements RF prediction interval using split conformal prediction as outlined in Romano, Patterson, Candes 2018.
+#' implements RF prediction interval using split conformal prediction as outlined in Romano, Patterson, Candes 2018. Helper function.
 #'
-#' This function implements split conformal prediction intervals for RFs.
-#' @param love Do you love cats? Defaults to TRUE.
+#' This function implements split conformal prediction intervals for RFs. Currently used in rfint().
 #' @param formula Object of class formula or character describing the model to fit. Interaction terms supported only for numerical variables.
 #' @param train_data Training data of class data.frame, matrix, dgCMatrix (Matrix) or gwaa.data (GenABEL). Matches ranger() requirements.
 #' @param pred_data Test data of class data.frame, matrix, dgCMatrix (Matrix) or gwaa.data (GenABEL). Utilizes ranger::predict() to get prediction intervals for test data.
@@ -32,17 +31,11 @@
 #' @param intervals Generate prediction intervals or not.
 #' @param alpha Significance level for prediction intervals.
 #' @param forest_type Determines what type of forest: regression forest vs. quantile regression forest. *Should not be an option...
-#' @param replace Sample with replacement, or not. Utilized for the two different variants outlined in Ghosal, Hooker 2018. Currently variant 2 not implemented.
-#' @param prop Proportion of training data to sample for each tree. Currently variant 2 not implemented.
-#' @param variant Choose which variant to use. Currently variant 2 not implemented.
 #' @param num_threads The number of threads to use in parallel. Default is the current number of cores.
+#' @param interval_type Type of prediction interval to generate.
+#' Options are \code{method = c("two-sided", "lower", "upper")}. Default is  \code{method = "two-sided"}.
 #' @keywords internal
 #' @import stats
-#' @examples
-#' CQRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = NULL,
-#' min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
-#' intervals = TRUE, alpha = NULL, forest_type = "RF", num_threads = NULL)
-#' @noRd
 CQRF <- function(formula = NULL, train_data = NULL, pred_data = NULL, num_trees = NULL,
                  min_node_size = NULL, m_try = NULL, keep_inbag = TRUE,
                  intervals = TRUE, alpha = NULL, forest_type = "RF", num_threads = NULL,