-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBoosting for Classification.R
52 lines (43 loc) · 1.91 KB
/
Boosting for Classification.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Boosting - classification (gbm with a Bernoulli loss)
library(gbm)
library(ISLR)
library(tidyverse)
# Load the data set, split it into predictors (`df`) and target (`last`),
# and initialise the accumulators used by the model search below.
df <- read.csv(
  "https://web.stanford.edu/~hastie/ElemStatLearn/datasets/SAheart.data",
  header = TRUE,
  stringsAsFactors = TRUE
)

# The final column is treated as the target; give it a fixed name.
names(df)[ncol(df)] <- "last"
# NOTE(review): the factor conversion below was disabled in the original;
# presumably because the target is later fed to gbm as a numeric 0/1
# response - confirm before re-enabling.
#df$last <- as.factor(df$last)
last <- df[, ncol(df)] # This is our target, we will use this to measure the success of our analysis

# Remove the target column (we just saved it). The original index expression
# `1:(ncol(df))-1` parsed as `(1:n) - 1`, i.e. 0:(n-1), and only worked
# because R ignores a zero index. `drop = FALSE` keeps `df` a data frame even
# when a single predictor column remains.
df <- df[, seq_len(ncol(df) - 1), drop = FALSE]

# Sentinel starting values: 1000 is just "larger than any error we expect",
# so the first fitted model always becomes the current best. Each sentinel is
# also the first row of its history data frame.
mean.squared.error <- 1000
mean.squared.error.df <- data.frame(mean.squared.error)
min.MSE <- 1000
minMSE.df <- data.frame(min.MSE)
# Exhaustive feature-subset search for a boosted (Bernoulli) classifier.
#
# For each of `n.repeats` passes, every non-empty subset of the predictor
# columns in `df` is tried: the data are split at a random train/test ratio,
# a gbm model is fitted on the training rows of that subset, and the mean
# squared error between the predicted probabilities and the 0/1 target on the
# held-out rows is recorded. The best model seen so far is written to disk.
#
# Reads:   df (predictors), last (target), min.MSE,
#          mean.squared.error.df, minMSE.df
# Updates: mean.squared.error, mean.squared.error.df, min.MSE, minMSE.df
# Writes:  /tmp/best.boost.rda (the best fitted gbm object, via saveRDS)
#
# NOTE(review): every column of `df` is a candidate predictor; if the file
# carries a row-identifier column it will be used as a feature - confirm.
n.repeats <- 10
n.predictors <- ncol(df)
n.trees <- 5000
best.model.path <- "/tmp/best.boost.rda"

# One fit per non-empty predictor subset per pass. Results are stored in
# preallocated vectors and merged into the history data frames once at the
# end, instead of calling rbind() on every iteration.
n.fits <- n.repeats * (2^n.predictors - 1)
all.mse <- numeric(n.fits)
improved.mse <- numeric(n.fits)
fit.count <- 0L
improved.count <- 0L

for (m in seq_len(n.repeats)) {
  for (i in seq_len(n.predictors)) { # all column subsets of size i
    print(c(m, paste("of", n.repeats)))
    print(c(i, "of", n.predictors))
    # Base R replacement for gtools::combinations(), which was never loaded.
    # simplify = FALSE yields a list of index vectors, one per subset.
    subsets <- combn(n.predictors, i, simplify = FALSE)
    for (colvals in subsets) {
      # drop = FALSE keeps single-column subsets as data frames with their
      # column name intact.
      newdf <- df[, colvals, drop = FALSE]
      newdf$last <- last

      # Random train fraction between 25% and 75%, redrawn for every fit.
      n.obs <- nrow(newdf)
      ratio <- round(runif(1, 0.25, 0.75), 2)
      train <- sort(sample(n.obs, floor(n.obs * ratio)))

      # Train/test on the *subset* (the original fitted on the full `df`,
      # which made the subset search a no-op).
      df.train <- newdf[train, , drop = FALSE]
      df.test <- newdf[-train, , drop = FALSE]

      gbm.fit <- gbm(
        last ~ .,
        data = df.train,
        distribution = "bernoulli",
        n.trees = n.trees,
        shrinkage = 0.2
      )
      # type = "response" returns probabilities; the original used the link
      # (log-odds) scale, which is not comparable to 0/1 labels.
      boost.predict <- predict(
        gbm.fit,
        newdata = df.test,
        n.trees = n.trees,
        type = "response"
      )
      mean.squared.error <- mean((boost.predict - df.test$last)^2)

      fit.count <- fit.count + 1L
      all.mse[fit.count] <- mean.squared.error

      if (mean.squared.error <= min.MSE) {
        min.MSE <- mean.squared.error
        improved.count <- improved.count + 1L
        improved.mse[improved.count] <- min.MSE
        # Persist the fitted model itself (the original referenced an
        # undefined object `boost.df`).
        saveRDS(object = gbm.fit, file = best.model.path)
      }
    }
  }
}

# Append the collected errors to the history data frames, reusing whatever
# column name each data frame already has.
mean.squared.error.df <- rbind(
  mean.squared.error.df,
  setNames(
    data.frame(all.mse[seq_len(fit.count)]),
    names(mean.squared.error.df)
  )
)
minMSE.df <- rbind(
  minMSE.df,
  setNames(
    data.frame(improved.mse[seq_len(improved.count)]),
    names(minMSE.df)
  )
)
# Report the outcome of the search.
# `minMSE.df` holds every running-minimum mean squared error recorded on
# held-out data, so the square root of its minimum is the best root mean
# squared error. The original message spoke of "miles per gallon", which does
# not apply to this classifier. Explicit print() calls keep the output
# visible when the script is run through source().
print(sprintf(
  "The best model has a root mean squared error of %f on the held-out data",
  sqrt(min(minMSE.df))
))

# Reload the best model that the search saved to disk and show it, followed
# by the history of running-minimum errors.
best.boost <- readRDS(file = "/tmp/best.boost.rda")
print(best.boost)
print(minMSE.df)