Merge pull request #34 from jmwerner/add_vignettes
Add updated vignettes and docs
jmwerner authored Apr 8, 2019
2 parents 02597f2 + c406e32 commit 887e952
Showing 17 changed files with 320 additions and 135 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -10,3 +10,4 @@ export(aml_k_means)
export(aml_neural_network)
export(aml_random_forest)
export(create_tree)
export(predict_all)
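
For context, `predict_all()` is the scoring entry point the updated trees vignette calls. A condensed sketch of the call pattern — not an excerpt from the package docs; the arguments mirror the vignette further down, with `b` shrunk so the sketch runs quickly:

```r
library(ArtisanalMachineLearning)

# Load the abalone data shipped with the package (same path the vignette uses).
abalone_data = readRDS(system.file("external_data", "abalone_data.RDS",
                                   package="ArtisanalMachineLearning", mustWork=TRUE))

# Fit a small forest, then score it with the newly exported predict_all().
forest = aml_random_forest(data=abalone_data$data,
                           response=abalone_data$response,
                           b=10, m=6,
                           evaluation_criterion=sum_of_squares,
                           min_obs=5, max_depth=16, verbose=FALSE)
predictions = predict_all(forest, abalone_data$data, n_trees=10)
```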
11 changes: 7 additions & 4 deletions R/trees.R
@@ -25,9 +25,12 @@ aml_random_forest <- function(data, response, b, m = NULL, evaluation_criterion
if(verbose & (i %% 10) == 1){
print(paste("Iteration", i, "of", b))
}
bootstrapped_data = .create_single_bootstrapped_data_frame(data)
bootstrapped_data = .create_bootstrapped_data(data, response)
sampled_columns = sample(names(data), m)
create_tree(bootstrapped_data[,sampled_columns], response, evaluation_criterion, min_obs, max_depth)
create_tree(bootstrapped_data$bootstrapped_data[,sampled_columns],
bootstrapped_data$bootstrapped_response, evaluation_criterion,
min_obs,
max_depth)
})
bootstrapped_trees[["n_trees"]] = b
forest = .prepend_class(bootstrapped_trees, "aml_random_forest")
@@ -224,9 +227,9 @@ sum_of_squares <- function(response_vector, prediction){
sum((response_vector - prediction)^2)
}

.create_single_bootstrapped_data_frame <- function(data){
.create_bootstrapped_data <- function(data, response){
row_indicators = sample(1:nrow(data), nrow(data), replace = TRUE)
data[row_indicators,]
list(bootstrapped_data=data[row_indicators,], bootstrapped_response=response[row_indicators])
}

.find_one_column_split <- function(data, split_column_name, response, evaluation_criterion){
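
The substance of this change: the old `.create_single_bootstrapped_data_frame()` resampled only the predictor rows while the response was passed along in its original order, so each tree could be fit against misaligned labels. The replacement draws one index vector and applies it to both. A standalone illustration of why that keeps rows and labels paired (`toy_data` and `toy_response` are made-up names; the function body mirrors the new `.create_bootstrapped_data()` above):

```r
create_bootstrapped_data <- function(data, response) {
    # One shared index vector drives both subsets.
    row_indicators = sample(1:nrow(data), nrow(data), replace = TRUE)
    list(bootstrapped_data = data[row_indicators, ],
         bootstrapped_response = response[row_indicators])
}

toy_data = data.frame(x = 1:5, z = letters[1:5])
toy_response = toy_data$x * 10
boot = create_bootstrapped_data(toy_data, toy_response)

# Predictor rows and labels stay paired after resampling:
all(boot$bootstrapped_data$x * 10 == boot$bootstrapped_response)  # TRUE
```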
Binary file added data/gbm.RDS
Binary file not shown.
Binary file modified data/random_forest.RDS
Binary file not shown.
11 changes: 7 additions & 4 deletions inst/doc/k-means.html


7 changes: 2 additions & 5 deletions inst/doc/neural-network.html
@@ -12,7 +12,7 @@

<meta name="author" content="Jeremy Werner" />

<meta name="date" content="2019-03-24" />
<meta name="date" content="2019-04-07" />

<title>Neural Network Vignette</title>

@@ -70,7 +70,7 @@

<h1 class="title toc-ignore">Neural Network Vignette</h1>
<h4 class="author"><em>Jeremy Werner</em></h4>
<h4 class="date"><em>2019-03-24</em></h4>
<h4 class="date"><em>2019-04-07</em></h4>



@@ -119,9 +119,6 @@ <h1>Example 2: MNIST</h1>
<span class="kw">library</span>(dplyr)</code></pre></div>
<pre><code>##
## Attaching package: 'dplyr'</code></pre>
<pre><code>## The following object is masked from 'package:testthat':
##
## matches</code></pre>
<pre><code>## The following objects are masked from 'package:stats':
##
## filter, lag</code></pre>
73 changes: 72 additions & 1 deletion inst/doc/trees.R
@@ -1,7 +1,9 @@
## ----echo=FALSE----------------------------------------------------------
library(ggplot2)
library(rprojroot)
library(viridis)  # assumed missing import: provides scale_color_viridis() used below
abalone_data_file = system.file("external_data", "abalone_data.RDS", package="ArtisanalMachineLearning", mustWork=TRUE)
abalone_data = readRDS(abalone_data_file)
set.seed(1337)

## ------------------------------------------------------------------------
library(ArtisanalMachineLearning)
@@ -10,11 +12,80 @@ library(ArtisanalMachineLearning)
dim(abalone_data$data)
summary(abalone_data$data)

## ---- echo=FALSE---------------------------------------------------------
## ---- echo=FALSE, fig.width = 7, fig.height = 5--------------------------
ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
geom_histogram(breaks=seq(0, 30, by = 1),
col="grey",
fill="blue") +
labs(x="", y="Count", title="Histogram of Response") +
theme_bw()

## ---- eval=FALSE---------------------------------------------------------
# random_forest = aml_random_forest(data=abalone_data$data,
# response=abalone_data$response,
# b=200,
# m=6,
# evaluation_criterion=sum_of_squares,
# min_obs=5,
# max_depth=16,
# verbose=FALSE)

## ----echo = FALSE--------------------------------------------------------
# Do a cooking show trick and bring out an already baked rf
random_forest = readRDS(file.path(find_root('DESCRIPTION'), 'data/random_forest.RDS'))

## ------------------------------------------------------------------------
random_forest_predictions = predict_all(random_forest, abalone_data$data, n_trees=200)

## ------------------------------------------------------------------------
sum((abalone_data$response - random_forest_predictions)^2) / length(abalone_data$response)

## ---- echo=FALSE, fig.width=7, fig.height=5------------------------------
plotting_data_rf = data.frame(predicted=random_forest_predictions,
actual=abalone_data$response,
Difference=abs(random_forest_predictions - abalone_data$response))

ggplot(plotting_data_rf, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()

## ---- eval=FALSE---------------------------------------------------------
# gbm = aml_gbm(abalone_data$data,
# abalone_data$response,
# learning_rate=.1,
# n_trees=50,
# evaluation_criterion=sum_of_squares,
# min_obs=10,
# max_depth=4,
# verbose=FALSE)

## ----echo = FALSE--------------------------------------------------------
# Do a cooking show trick and bring out an already baked gbm
gbm = readRDS(file.path(find_root('DESCRIPTION'), 'data/gbm.RDS'))

## ------------------------------------------------------------------------
gbm_predictions = predict_all(gbm, abalone_data$data, n_trees=50)

## ------------------------------------------------------------------------
sum((abalone_data$response - gbm_predictions)^2) / length(abalone_data$response)

## ---- echo=FALSE, fig.width=7, fig.height=5------------------------------

plotting_data_gbm = data.frame(predicted=gbm_predictions,
actual=abalone_data$response,
Difference=abs(gbm_predictions - abalone_data$response))

ggplot(plotting_data_gbm, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
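
An editorial note on the error metric computed twice above: `sum((response - predictions)^2) / length(response)` is the sum of squared errors divided by n, i.e. the mean squared error. Factored out under an illustrative (not package) name:

```r
# Mean squared error: SSE / n. Equivalent to mean((response - predictions)^2).
mean_squared_error <- function(response, predictions) {
    sum((response - predictions)^2) / length(response)
}
```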

114 changes: 90 additions & 24 deletions inst/doc/trees.Rmd
@@ -11,28 +11,30 @@ vignette: >

Tree-based models are amazing. Here's a very simple vignette demonstrating how to use Artisanal Machine Learning's Random Forest and GBM models.

# Example: Abalone Data
# Read Abalone Data

```{r echo=FALSE}
library(ggplot2)
library(rprojroot)
library(viridis)  # assumed missing import: provides scale_color_viridis() used below
abalone_data_file = system.file("external_data", "abalone_data.RDS", package="ArtisanalMachineLearning", mustWork=TRUE)
abalone_data = readRDS(abalone_data_file)
set.seed(1337)
```

```{r}
library(ArtisanalMachineLearning)
```

This example will be using the 'Abalone' dataset that I robbed from the internet.
This example uses the 'Abalone' dataset that I robbed from the internet (https://archive.ics.uci.edu/ml/datasets/abalone), in which we try to predict the age of abalones from measured physical characteristics.

```{r}
dim(abalone_data$data)
summary(abalone_data$data)
```

The data set has many numeric columns and a numeric response that takes integer values from 1-29
The semi-large data set has many numeric columns and a numeric response that takes integer values from 1-29.

```{r, echo=FALSE}
```{r, echo=FALSE, fig.width = 7, fig.height = 5}
ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
geom_histogram(breaks=seq(0, 30, by = 1),
col="grey",
@@ -41,43 +43,107 @@ ggplot(data=data.frame(response=abalone_data$response), aes(response)) +
theme_bw()
```

# Random Forest Model

```{r, eval=FALSE}
random_forest = aml_random_forest(data=abalone_data$data,
response=abalone_data$response,
b=200,
m=6,
evaluation_criterion=sum_of_squares,
min_obs=5,
max_depth=16,
verbose=FALSE)
```

#root_directory = system('git rev-parse --show-toplevel', intern=TRUE)
#abalone_data = readRDS(file.path(root_directory, "data", "abalone_data.RDS"))
```{r echo = FALSE}
# Do a cooking show trick and bring out an already baked rf
random_forest = readRDS(file.path(find_root('DESCRIPTION'), 'data/random_forest.RDS'))
```

#gbm = aml_gbm(abalone_data$data, abalone_data$response, learning_rate = .25, n_trees = 3, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = FALSE)
Now that we have a random forest model, let's simply verify that it produces a better-than-garbage fit on the training data.

#gbm_predictions = predict_all(gbm, abalone_data$data, n_trees = 3)
```{r}
random_forest_predictions = predict_all(random_forest, abalone_data$data, n_trees=200)
```

## Mean squared error on training data

# Vignette structure
# load data, print data summary
```{r}
sum((abalone_data$response - random_forest_predictions)^2) / length(abalone_data$response)
```

# Fit basic rf
## Comparison of predicted and actual for Random Forest

# Fit basic gbm
```{r, echo=FALSE, fig.width=7, fig.height=5}
plotting_data_rf = data.frame(predicted=random_forest_predictions,
actual=abalone_data$response,
Difference=abs(random_forest_predictions - abalone_data$response))
ggplot(plotting_data_rf, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
```

Not bad! This model is clearly picking up some signal. Let's try out a small GBM now just for kicks.


# GBM Model

#gbm = aml_gbm(data, response, learning_rate = .25, n_trees = 50, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = TRUE)
```{r, eval=FALSE}
gbm = aml_gbm(abalone_data$data,
abalone_data$response,
learning_rate=.1,
n_trees=50,
evaluation_criterion=sum_of_squares,
min_obs=10,
max_depth=4,
verbose=FALSE)
```

```{r echo = FALSE}
# Do a cooking show trick and bring out an already baked gbm
gbm = readRDS(file.path(find_root('DESCRIPTION'), 'data/gbm.RDS'))
```

As with the RF model, let's see if this picks up any signal at all on the training data.

#all_errors_gbm = sapply(c(10, 25, 40, 50), function(x){
# print(paste("Tree number", x))
# predictions = predict_all(gbm, data, n_trees = x)
# sum((response - predictions)^2) / length(predictions)
#})
```{r}
gbm_predictions = predict_all(gbm, abalone_data$data, n_trees=50)
```

## Mean squared error on training data

```{r}
sum((abalone_data$response - gbm_predictions)^2) / length(abalone_data$response)
```

## Comparison of predicted and actual for GBM

```{r, echo=FALSE, fig.width=7, fig.height=5}
plotting_data_gbm = data.frame(predicted=gbm_predictions,
actual=abalone_data$response,
Difference=abs(gbm_predictions - abalone_data$response))
ggplot(plotting_data_gbm, aes(x=actual, y=predicted, color=Difference)) +
geom_point() +
scale_y_continuous(limits = c(-1, 29)) +
geom_jitter() +
scale_color_viridis() +
geom_abline(intercept = 0, slope = 1, color="black", size=1.5) +
labs(x="Actual", y="Predicted", title="Actual vs Predicted") +
theme_bw()
```

#random_forest = aml_random_forest(data, response, b = 50, m = 6, evaluation_criterion = sum_of_squares, min_obs = 10, max_depth = 4, verbose = TRUE)

#all_errors = sapply(5:15, function(x){
# print(paste("Tree number", x))
# predictions = predict_all(random_forest, data, n_trees = x)
# sum((response - predictions)^2) / length(predictions)
#})
# Conclusion

The RF model outperforms the GBM, but the GBM is significantly smaller and the author didn't spend much time tuning the hyperparameters ¯\_(ツ)_/¯.

Also, this illustration only examines statistics on the training data set, so we definitely can't draw strong conclusions. The author simply wanted to demonstrate that these hand-crafted models produce better-than-trash results, and that has been achieved.
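
A natural next step the conclusion gestures at is scoring on held-out rows rather than the training set. A minimal sketch using the vignette's own objects (`test_rows` and the `train_*`/`test_*` names are made up; the commented refit reuses the vignette's arguments):

```r
set.seed(42)
n = nrow(abalone_data$data)
test_rows = sample(1:n, round(0.2 * n))

train_data = abalone_data$data[-test_rows, ]
train_response = abalone_data$response[-test_rows]
test_data = abalone_data$data[test_rows, ]
test_response = abalone_data$response[test_rows]

# Refit on the training split only, then score the held-out rows:
# holdout_forest = aml_random_forest(data=train_data, response=train_response,
#                                    b=200, m=6,
#                                    evaluation_criterion=sum_of_squares,
#                                    min_obs=5, max_depth=16, verbose=FALSE)
# holdout_predictions = predict_all(holdout_forest, test_data, n_trees=200)
# sum((test_response - holdout_predictions)^2) / length(test_response)
```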
124 changes: 57 additions & 67 deletions inst/doc/trees.html


9 changes: 6 additions & 3 deletions man/aml_gbm.Rd


12 changes: 8 additions & 4 deletions man/aml_random_forest.Rd


2 changes: 1 addition & 1 deletion man/create_tree.Rd


