Merge pull request #34 from jmwerner/add_vignettes
Add updated vignettes and docs
jmwerner authored Apr 8, 2019
2 parents 02597f2 + c406e32 commit 887e952
Showing 17 changed files with 320 additions and 135 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -10,3 +10,4 @@ export(aml_k_means)
export(aml_neural_network)
export(aml_random_forest)
export(create_tree)
export(predict_all)
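
For context, `predict_all()` is the scoring entry point the updated trees vignette calls. A condensed sketch of the call pattern — not an excerpt from the package docs; the arguments mirror the vignette further down, with `b` shrunk so the sketch runs quickly:

```r
library(ArtisanalMachineLearning)

# Load the abalone data shipped with the package (same path the vignette uses).
abalone_data = readRDS(system.file("external_data", "abalone_data.RDS",
                                   package="ArtisanalMachineLearning", mustWork=TRUE))

# Fit a small forest, then score it with the newly exported predict_all().
forest = aml_random_forest(data=abalone_data$data,
                           response=abalone_data$response,
                           b=10, m=6,
                           evaluation_criterion=sum_of_squares,
                           min_obs=5, max_depth=16, verbose=FALSE)
predictions = predict_all(forest, abalone_data$data, n_trees=10)
```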
11 changes: 7 additions & 4 deletions R/trees.R
@@ -25,9 +25,12 @@ aml_random_forest <- function(data, response, b, m = NULL, evaluation_criterion
if(verbose & (i %% 10) == 1){
print(paste("Iteration", i, "of", b))
}
bootstrapped_data = .create_single_bootstrapped_data_frame(data)
bootstrapped_data = .create_bootstrapped_data(data, response)
sampled_columns = sample(names(data), m)
create_tree(bootstrapped_data[,sampled_columns], response, evaluation_criterion, min_obs, max_depth)
create_tree(bootstrapped_data$bootstrapped_data[,sampled_columns],
bootstrapped_data$bootstrapped_response, evaluation_criterion,
min_obs,
max_depth)
})
bootstrapped_trees[["n_trees"]] = b
forest = .prepend_class(bootstrapped_trees, "aml_random_forest")
@@ -224,9 +227,9 @@ sum_of_squares <- function(response_vector, prediction){
sum((response_vector - prediction)^2)
}

.create_single_bootstrapped_data_frame <- function(data){
.create_bootstrapped_data <- function(data, response){
row_indicators = sample(1:nrow(data), nrow(data), replace = TRUE)
data[row_indicators,]
list(bootstrapped_data=data[row_indicators,], bootstrapped_response=response[row_indicators])
}

.find_one_column_split <- function(data, split_column_name, response, evaluation_criterion){
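
The substance of this change: the old `.create_single_bootstrapped_data_frame()` resampled only the predictor rows while the response was passed along in its original order, so each tree could be fit against misaligned labels. The replacement draws one index vector and applies it to both. A standalone illustration of why that keeps rows and labels paired (`toy_data` and `toy_response` are made-up names; the function body mirrors the new `.create_bootstrapped_data()` above):

```r
create_bootstrapped_data <- function(data, response) {
    # One shared index vector drives both subsets.
    row_indicators = sample(1:nrow(data), nrow(data), replace = TRUE)
    list(bootstrapped_data = data[row_indicators, ],
         bootstrapped_response = response[row_indicators])
}

toy_data = data.frame(x = 1:5, z = letters[1:5])
toy_response = toy_data$x * 10
boot = create_bootstrapped_data(toy_data, toy_response)

# Predictor rows and labels stay paired after resampling:
all(boot$bootstrapped_data$x * 10 == boot$bootstrapped_response)  # TRUE
```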
Binary file added data/gbm.RDS
Binary file not shown.
Binary file modified data/random_forest.RDS
Binary file not shown.
11 changes: 7 additions & 4 deletions inst/doc/k-means.html


7 changes: 2 additions & 5 deletions inst/doc/neural-network.html
@@ -12,7 +12,7 @@

<meta name="author" content="Jeremy Werner" />

<meta name="date" content="2019-03-24" />
<meta name="date" content="2019-04-07" />

<title>Neural Network Vignette</title>

@@ -70,7 +70,7 @@

<h1 class="title toc-ignore">Neural Network Vignette</h1>
<h4 class="author"><em>Jeremy Werner</em></h4>
<h4 class="date"><em>2019-03-24</em></h4>
<h4 class="date"><em>2019-04-07</em></h4>



@@ -119,9 +119,6 @@ <h1>Example 2: MNIST</h1>
<span class="kw">library</span>(dplyr)</code></pre></div>
<pre><code>##
## Attaching package: 'dplyr'</code></pre>
<pre><code>## The following object is masked from 'package:testthat':
##
## matches</code></pre>
<pre><code>## The following objects are masked from 'package:stats':
##
## filter, lag</code></pre>
73 changes: 72 additions & 1 deletion inst/doc/trees.R
@@ -1,7 +1,9 @@
## ----echo=FALSE----------------------------------------------------------
library(ggplot2)
library(rprojroot)
library(viridis)  # assumed missing import: provides scale_color_viridis() used below
abalone_data_file = system.file("external_data", "abalone_data.RDS", package="ArtisanalMachineLearning", mustWork=TRUE)
abalone_data = readRDS(abalone_data_file)
set.seed(1337)

## ------------------------------------------------------------------------
library(ArtisanalMachineLearning)
@@ -10,11 +12,80 @@ library(ArtisanalMachineLearning)
dim(abalone_data$data)
summary(abalone_data$data)

## ---- echo=FALSE---------------------------------------------------------
## ---- echo=FALSE, fig.width = 7, fig.height = 5--------------------------
ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
geom_histogram(breaks=seq(0, 30, by = 1),
col="grey",
fill="blue") +
labs(x="", y="Count", title="Histogram of Response") +
theme_bw()

## ---- eval=FALSE---------------------------------------------------------
# random_forest = aml_random_forest(data=abalone_data$data,
# response=abalone_data$response,
# b=200,
# m=6,
# evaluation_criterion=sum_of_squares,
# min_obs=5,
# max_depth=16,
# verbose=FALSE)

## ----echo = FALSE--------------------------------------------------------
# Do a cooking show trick and bring out an already baked rf
random_forest = readRDS(file.path(find_root('DESCRIPTION'), 'data/random_forest.RDS'))

## ------------------------------------------------------------------------
random_forest_predictions = predict_all(random_forest, abalone_data$data, n_trees=200)

## ------------------------------------------------------------------------
sum((abalone_data$response - random_forest_predictions)^2) / length(abalone_data$response)

## ---- echo=FALSE, fig.width=7, fig.height=5------------------------------
plotting_data_rf = data.frame(predicted=random_forest_predictions,
actual=abalone_data$response,
Difference=abs(random_forest_predictions - abalone_data$response))

ggplot(plotting_data_rf, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()

## ---- eval=FALSE---------------------------------------------------------
# gbm = aml_gbm(abalone_data$data,
# abalone_data$response,
# learning_rate=.1,
# n_trees=50,
# evaluation_criterion=sum_of_squares,
# min_obs=10,
# max_depth=4,
# verbose=FALSE)

## ----echo = FALSE--------------------------------------------------------
# Do a cooking show trick and bring out an already baked gbm
gbm = readRDS(file.path(find_root('DESCRIPTION'), 'data/gbm.RDS'))

## ------------------------------------------------------------------------
gbm_predictions = predict_all(gbm, abalone_data$data, n_trees=50)

## ------------------------------------------------------------------------
sum((abalone_data$response - gbm_predictions)^2) / length(abalone_data$response)

## ---- echo=FALSE, fig.width=7, fig.height=5------------------------------

plotting_data_gbm = data.frame(predicted=gbm_predictions,
actual=abalone_data$response,
Difference=abs(gbm_predictions - abalone_data$response))

ggplot(plotting_data_gbm, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
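
An editorial note on the error metric computed twice above: `sum((response - predictions)^2) / length(response)` is the sum of squared errors divided by n, i.e. the mean squared error. Factored out under an illustrative (not package) name:

```r
# Mean squared error: SSE / n. Equivalent to mean((response - predictions)^2).
mean_squared_error <- function(response, predictions) {
    sum((response - predictions)^2) / length(response)
}
```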

114 changes: 90 additions & 24 deletions inst/doc/trees.Rmd
@@ -11,28 +11,30 @@ vignette: >

Tree-based models are amazing. Here's a very simple vignette demonstrating how to use Artisanal Machine Learning's Random Forest and GBM models.

# Example: Abalone Data
# Read Abalone Data

```{r echo=FALSE}
library(ggplot2)
library(rprojroot)
library(viridis)  # assumed missing import: provides scale_color_viridis() used below
abalone_data_file = system.file("external_data", "abalone_data.RDS", package="ArtisanalMachineLearning", mustWork=TRUE)
abalone_data = readRDS(abalone_data_file)
set.seed(1337)
```

```{r}
library(ArtisanalMachineLearning)
```

This example will be using the 'Abalone' dataset that I robbed from the internet.
This example uses the 'Abalone' dataset that I robbed from the internet (https://archive.ics.uci.edu/ml/datasets/abalone), in which we try to predict the age of abalones from measured physical characteristics.

```{r}
dim(abalone_data$data)
summary(abalone_data$data)
```

The data set has many numeric columns and a numeric response that takes integer values from 1-29
The semi-large data set has many numeric columns and a numeric response that takes integer values from 1-29.

```{r, echo=FALSE}
```{r, echo=FALSE, fig.width = 7, fig.height = 5}
ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
geom_histogram(breaks=seq(0, 30, by = 1),
col="grey",
@@ -41,43 +43,107 @@ ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
theme_bw()
```

# Random Forest Model

```{r, eval=FALSE}
random_forest = aml_random_forest(data=abalone_data$data,
response=abalone_data$response,
b=200,
m=6,
evaluation_criterion=sum_of_squares,
min_obs=5,
max_depth=16,
verbose=FALSE)
```

#root_directory = system('git rev-parse --show-toplevel', intern=TRUE)
#abalone_data = readRDS(file.path(root_directory, "data", "abalone_data.RDS"))
```{r echo = FALSE}
# Do a cooking show trick and bring out an already baked rf
random_forest = readRDS(file.path(find_root('DESCRIPTION'), 'data/random_forest.RDS'))
```

#gbm = aml_gbm(abalone_data$data, abalone_data$response, learning_rate = .25, n_trees = 3, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = FALSE)
Now that we have a random forest model, let's simply verify that it produces a better-than-garbage fit on the training data.

#gbm_predictions = predict_all(gbm, abalone_data$data, n_trees = 3)
```{r}
random_forest_predictions = predict_all(random_forest, abalone_data$data, n_trees=200)
```

## Mean squared error on training data

# Vignette structure
# load data, print data summary
```{r}
sum((abalone_data$response - random_forest_predictions)^2) / length(abalone_data$response)
```

# Fit basic rf
## Comparison of predicted and actual for Random Forest

# Fit basic gbm
```{r, echo=FALSE, fig.width=7, fig.height=5}
plotting_data_rf = data.frame(predicted=random_forest_predictions,
actual=abalone_data$response,
Difference=abs(random_forest_predictions - abalone_data$response))
ggplot(plotting_data_rf, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
```

Not bad! This model is clearly picking up some signal. Let's try out a small GBM now just for kicks.


# GBM Model

#gbm = aml_gbm(data, response, learning_rate = .25, n_trees = 50, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = TRUE)
```{r, eval=FALSE}
gbm = aml_gbm(abalone_data$data,
abalone_data$response,
learning_rate=.1,
n_trees=50,
evaluation_criterion=sum_of_squares,
min_obs=10,
max_depth=4,
verbose=FALSE)
```

```{r echo = FALSE}
# Do a cooking show trick and bring out an already baked gbm
gbm = readRDS(file.path(find_root('DESCRIPTION'), 'data/gbm.RDS'))
```

As with the RF model, let's see if this picks up any signal at all on the training data.

#all_errors_gbm = sapply(c(10, 25, 40, 50), function(x){
# print(paste("Tree number", x))
# predictions = predict_all(gbm, data, n_trees = x)
# sum((response - predictions)^2) / length(predictions)
#})
```{r}
gbm_predictions = predict_all(gbm, abalone_data$data, n_trees=50)
```

## Mean squared error on training data

```{r}
sum((abalone_data$response - gbm_predictions)^2) / length(abalone_data$response)
```

## Comparison of predicted and actual for GBM

```{r, echo=FALSE, fig.width=7, fig.height=5}
plotting_data_gbm = data.frame(predicted=gbm_predictions,
actual=abalone_data$response,
Difference=abs(gbm_predictions - abalone_data$response))
ggplot(plotting_data_gbm, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
```

#random_forest = aml_random_forest(data, response, b = 50, m = 6, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = TRUE)

#all_errors = sapply(5:15, function(x){
# print(paste("Tree number", x))
# predictions = predict_all(random_forest, data, n_trees = x)
# sum((response - predictions)^2) / length(predictions)
#})
# Conclusion

The RF model outperforms the GBM, but the GBM is significantly smaller and the author didn't spend much time tuning the hyperparameters ¯\_(ツ)_/¯.

Also, this illustration only examines statistics on the training data set, so we definitely can't draw strong conclusions. The author simply wanted to demonstrate that these hand-crafted models produce better-than-trash results, and that has been achieved.
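
A natural next step the conclusion gestures at is scoring on held-out rows rather than the training set. A minimal sketch using the vignette's own objects (`test_rows` and the `train_*`/`test_*` names are made up; the commented refit reuses the vignette's arguments):

```r
set.seed(42)
n = nrow(abalone_data$data)
test_rows = sample(1:n, round(0.2 * n))

train_data = abalone_data$data[-test_rows, ]
train_response = abalone_data$response[-test_rows]
test_data = abalone_data$data[test_rows, ]
test_response = abalone_data$response[test_rows]

# Refit on the training split only, then score the held-out rows:
# holdout_forest = aml_random_forest(data=train_data, response=train_response,
#                                    b=200, m=6,
#                                    evaluation_criterion=sum_of_squares,
#                                    min_obs=5, max_depth=16, verbose=FALSE)
# holdout_predictions = predict_all(holdout_forest, test_data, n_trees=200)
# sum((test_response - holdout_predictions)^2) / length(test_response)
```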
124 changes: 57 additions & 67 deletions inst/doc/trees.html


9 changes: 6 additions & 3 deletions man/aml_gbm.Rd


12 changes: 8 additions & 4 deletions man/aml_random_forest.Rd


2 changes: 1 addition & 1 deletion man/create_tree.Rd


