# week15_assi7_sol1.R

# # -*- coding: utf-8 -*-
# """Week15 Assi7 Sol1.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1_Et_v9mmzBrmbE8jR1XTGHR_m1d5DieY
# """

###########################################################################
## Week-15, Homework-7, Sol-1 
## Sreya Dhar 
## Created: Dec 09, 2020
## Edited: Dec 16, 2020
###########################################################################

## Start from a clean slate: remove every object in the current environment.
rm(list = ls())
## installing all the libraries in R kernel


# install.packages("corrplot")
# install.packages("forecast")
# install.packages("zoo")
# install.packages("rsample")
# install.packages("leaps")
# install.packages("car")
# install.packages("caret")
# install.packages("ROCR")
# install.packages("PerformanceAnalytics")
# install.packages("funModeling")
# install.packages("hrbrthemes")
# install.packages("ggthemes")
# install.packages("GGally")
# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("kableExtra")
# install.packages("broom")
# install.packages("knitr")
# install.packages("psych")
# install.packages("aod")
# install.packages("epiDisplay")
# install.packages("e1071")
# install.packages("class")
# install.packages("rpart.plot")
# install.packages("party")
# install.packages("partykit")
# install.packages("rattle")

## importing the libraries in R kernel

library(ggplot2)
library(dplyr)
library(tidyverse)
library(tidyr)
library(corrplot)
library(repr)
library(reshape2)
library(forecast)
library(zoo)
library(rsample)
library(gplots)
library(ROCR)
library(class)
library(readr)
library(leaps)
library(car)
library(PerformanceAnalytics)
library(funModeling)
library(gridExtra)
library(caret)
library(MASS)
library(Hmisc)
library(hrbrthemes)
library(GGally)
library(glmnet)
library(pROC)
library(psych)
library(aod)
library(epiDisplay)
library(e1071)
library(ggthemes)
library(kableExtra)
library(broom)
library(knitr)
library(devtools)
library(rpart)  #for trees
library(rattle)    # Fancy tree plot This is a difficult library to install (https://gist.github.com/zhiyzuo/a489ffdcc5da87f28f8589a55aa206dd) 
library(rpart.plot)             # Enhanced tree plots
library(RColorBrewer)       # Color selection for fancy tree plot
library(party)                  # Alternative decision tree algorithm
library(partykit)               # Convert rpart object to BinaryTree
library(randomForest)
library(viridis)
library(tree)
library(factoextra)
library(neuralnet)

## ---- Working directory ----
## NOTE(review): absolute, machine-specific path; the script only runs where
## this folder exists.
setwd("C:/File G/EAS 506 Statistical Mining I/Week 15/Assignment-7")

## ---- Import: RData -> CSV -> data frame ----
## The RData object is written out as CSV and read straight back in.
load("C:/File G/EAS 506 Statistical Mining I/Week 15/Assignment-7/cleveland.RData")
write.csv(cleveland, "cleveland.csv")
cleveland <- read.csv("cleveland.csv", header = TRUE)
## Column 1 of the re-read file holds the row names that write.csv() emits by
## default; drop it first, then drop column 15 of what remains.
cleveland <- cleveland[, -1]
cleveland <- cleveland[, -15]

## ---- First look at the data ----
glimpse(cleveland)
names(cleveland)
status(cleveland)
profiling_num(cleveland)
describe(cleveland)
summary(cleveland)

## ---- Correlation structure ----
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
## Character columns become factors, factors become their integer codes, so
## that every column is numeric for cor().
cleveland_n <- mutate_if(cleveland, is.character, as.factor)
cleveland_n1 <- mutate_if(cleveland_n, is.factor, as.numeric)
L <- cor(cleveland_n1)
corrplot(L, method = "circle", type = "lower")

## Scatter-plot matrix of the first 13 columns; point fill is picked by diag1.
pairs.panels(
  cleveland_n1[, 1:13],
  main = "Pairs plot on Cleveland dataset, unclassed on Class",
  pch = 21,
  bg = c("blue", "green")[unclass(cleveland_n1$diag1)],
  hist.col = "red"
)

## Histograms of every numeric column.
plot_num(cleveland_n1)

################## Preparing the data for Analysis ########################
# Min-max scaling of the Cleveland predictors prior to classification.
#
# normalized(y): rescale the non-missing entries of a numeric vector to [0, 1].
#
#   y        numeric vector; may contain NA.
#   returns  a vector of the same length as y. NA positions are left
#            untouched. If all non-missing values are equal (zero range) they
#            are mapped to 0 rather than NaN (0 / 0). A vector with no
#            non-missing values is returned unchanged, without the min()/max()
#            warnings that an empty input would raise.
normalized <- function(y) {
  observed <- !is.na(y)

  # Nothing to scale: empty input or all NA.
  if (!any(observed)) {
    return(y)
  }

  x <- y[observed]
  lo <- min(x)
  span <- max(x) - lo

  # isTRUE() keeps a NaN span (e.g. from infinite values) out of the `if`
  # condition; such input falls through to the ordinary formula.
  if (isTRUE(span == 0)) {
    y[observed] <- 0
  } else {
    y[observed] <- (x - lo) / span
  }

  y
}

## Overwrite the 13 predictor columns with the min-max scaled values computed
## from the all-numeric copy of the data.
cleveland[, 1:13] <- apply(X = cleveland_n1[, 1:13], MARGIN = 2, FUN = normalized)
head(cleveland)



#####################################################
####################### ANN #########################
#####################################################

## neuralnet needs a numeric response: character -> factor -> integer codes.
data <- mutate_if(cleveland, is.character, as.factor)
data_1 <- mutate_if(data, is.factor, as.numeric)
## Factor codes start at 1; subtracting 1 makes them start at 0
## (0/1 for a two-level diag1).
data_1$diag1 <- data_1$diag1 - 1

## 75 / 25 train-test split (rsample), seeded for reproducibility.
set.seed(4444)
data_split <- initial_split(data_1, prop = 0.75)
data_train <- training(data_split)
data_test <- testing(data_split)


## NN1: two hidden layers (7 and 3 neurons), logistic activation on the output
## as well (linear.output = FALSE), cross-entropy loss. likelihood = TRUE asks
## neuralnet to report AIC and BIC alongside the error.
nn1 <- neuralnet(
  formula = diag1 ~ .,
  data = data_train,
  hidden = c(7, 3),
  act.fct = "logistic",
  err.fct = "ce",
  linear.output = FALSE,
  likelihood = TRUE
)

## Draw the fitted network (best repetition).
plot(nn1, rep = "best")

## First column of result.matrix: row 1 is read as the training error,
## rows 4 and 5 as AIC and BIC.
train_err <- nn1$result.matrix[1, 1]
paste("CE Error: ", round(train_err, 3))

nn1_AIC <- nn1$result.matrix[4, 1]
paste("AIC: ", round(nn1_AIC, 3))

nn1_BIC <- nn1$result.matrix[5, 1]
paste("BIC: ", round(nn1_BIC, 3))

################# Classification Hyperparameters #######################
########################################################################

### NN2: two hidden layers, 8 neurons then 4 neurons
nn2 <- neuralnet(
  formula = diag1 ~ .,
  data = data_train,
  hidden = c(8, 4),
  err.fct = "ce",
  linear.output = FALSE,
  likelihood = TRUE
)

### NN3: a single hidden layer with 7 neurons
nn3 <- neuralnet(
  formula = diag1 ~ .,
  data = data_train,
  hidden = c(7),
  err.fct = "ce",
  linear.output = FALSE,
  likelihood = TRUE
)

### NN4: two hidden layers, 9 neurons then 5 neurons
nn4 <- neuralnet(
  formula = diag1 ~ .,
  data = data_train,
  hidden = c(9, 5),
  err.fct = "ce",
  linear.output = FALSE,
  likelihood = TRUE
)

# Long-format table of AIC, BIC and 100 x cross-entropy error for NN1..NN4.
# For each network the three values are taken from rows 4, 5 and 1 of the
# first column of result.matrix, in that order, to line up with 'Metric'.
nn_ics <- tibble(
  "Network" = rep(c("NN1", "NN2", "NN3", "NN4"), each = 3),
  "Metric" = rep(c("AIC", "BIC", "ce Error * 100"), times = 4),
  "Value" = unlist(lapply(
    list(nn1, nn2, nn3, nn4),
    function(net) {
      fit <- net$result.matrix
      c(fit[4, 1], fit[5, 1], 100 * fit[1, 1])
    }
  ))
)

# Grouped bar chart: one cluster of bars per network, one bar per metric.
ggplot(nn_ics, aes(x = Network, y = Value, fill = Metric)) +
  geom_col(position = "dodge") +
  ggtitle(
    label = "AIC, BIC, and Cross-Entropy Error of the Classification ANNs",
    subtitle = "Note: ce Error displayed is 100 times its true value"
  )

## Predicted outputs of NN3 on the training and test sets.
nn_3_pred <- predict(nn3, data_train)
nn_3_pred_te <- predict(nn3, data_test)

## Confusion matrices at a 0.5 cut-off (round()).
## caret::confusionMatrix() expects predictions in the ROWS and the true
## classes in the COLUMNS of a table; with the axes swapped, sensitivity /
## specificity and PPV / NPV are computed against the wrong margin.
## Fixing the factor levels to 0/1 keeps the table square even if the network
## predicts only one class.
confusionMatrix(table(
  prediction = factor(round(nn_3_pred), levels = 0:1),
  actual = factor(data_train$diag1, levels = 0:1)
))
confusionMatrix(table(
  prediction = factor(round(nn_3_pred_te), levels = 0:1),
  actual = factor(data_test$diag1, levels = 0:1)
))

## Test-set truth next to the thresholded prediction.
results <- data.frame(actual = data_test$diag1, prediction = round(nn_3_pred_te))


###########################################
## Tuning loop: one hidden layer, 1 to 10 neurons.
## For each width the train and test misclassification rates are recorded.
###########################################
hidden_sizes <- 1:10

## Preallocate instead of growing the vectors inside the loop.
train_err_store <- numeric(length(hidden_sizes))
test_err_store <- numeric(length(hidden_sizes))

for (i in hidden_sizes) {

  # fit a neural network with a single hidden layer of "i" neurons
  nn0 <- neuralnet(diag1 ~ ., data = data_train,
                   hidden = i, stepmax = 10^9, err.fct = "ce", linear.output = FALSE)

  # training misclassification rate at a 0.5 cut-off
  pred <- predict(nn0, newdata = data_train)
  y_hat_train <- round(pred)
  train_err <- mean(data_train$diag1 != y_hat_train)
  train_err_store[i] <- train_err

  # test misclassification rate at a 0.5 cut-off
  pred <- predict(nn0, newdata = data_test)
  y_hat_test <- round(pred)
  test_err <- mean(data_test$diag1 != y_hat_test)
  test_err_store[i] <- test_err
}

train_err_store
test_err_store

## Long format (one row per width x series) for plotting.
err_score <- data.frame(index = hidden_sizes, train_err_store, test_err_store)
err_score <- melt(err_score, id.vars = "index", variable.name = "series")

## One line per error series. The x axis is the number of neurons in the
## single hidden layer (hidden = i), not the number of layers.
ggplot(err_score, aes(index, value)) +
  geom_line(aes(colour = series)) +
  labs(
    x = "No of Hidden Neurons",
    y = "Misclassification error rate",
    title = "Tuning hidden Layers of ANN"
  ) +
  theme_bw()

###########################################################################
# Refit the selected ANN (one hidden layer with 2 neurons) and evaluate it.
# NOTE(review): an earlier version of this header referred to 6 hidden units,
# while the code fits hidden = c(2) -- confirm which width the tuning run
# actually supports.
###########################################################################
nn_best <- neuralnet(
  formula = diag1 ~ .,
  data = data_train,
  hidden = c(2),
  err.fct = "ce",
  linear.output = FALSE,
  likelihood = TRUE
)
plot(nn_best, rep = "best")
nn_best_pred <- predict(nn_best, data_train)
nn_best_pred_te <- predict(nn_best, data_test)

## Confusion matrices at a 0.5 cut-off. caret expects predictions in the rows
## and true classes in the columns; fixed 0/1 levels keep the table square even
## when only one class is predicted.
confusionMatrix(table(
  prediction = factor(round(nn_best_pred), levels = 0:1),
  actual = factor(data_train$diag1, levels = 0:1)
))
confusionMatrix(table(
  prediction = factor(round(nn_best_pred_te), levels = 0:1),
  actual = factor(data_test$diag1, levels = 0:1)
))
results <- data.frame(actual = data_test$diag1, prediction = round(nn_best_pred_te))

################################################
## Test-set ROC curve. The predictor is the network's continuous output, not
## the rounded 0/1 label: a ROC built from hard labels has a single operating
## point, so its AUC does not describe the ranking quality of the model.
## NOTE(review): `smoothed`, `ci.alpha`, `stratified` and `show.thres` do not
## appear to be documented roc()/plot.roc() arguments (cf. `smooth`,
## `conf.level`, `boot.stratified`, `print.thres`); they are passed through
## `...` unchanged here -- confirm whether they have any effect.
pROC_obj <- roc(response = data_test$diag1,
                predictor = as.numeric(nn_best_pred_te),
                smoothed = TRUE,
                # arguments for ci
                ci = TRUE, ci.alpha = 0.95, stratified = FALSE,
                # arguments for plot
                plot = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE, grid = TRUE,
                print.auc = TRUE, show.thres = TRUE)


## Confidence band of the sensitivity, drawn over the ROC plot.
sens.ci <- ci.se(pROC_obj)
plot(sens.ci, type = "shape", col = "lightblue")

## Fresh 75 / 25 train-test split for the tree-based models, this time taken
## from `cleveland` itself rather than from the numerically recoded copy used
## by the ANN section. Same seed as before.
set.seed(4444)
data_split <- rsample::initial_split(cleveland, prop = 0.75)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

#########################################################
########################### CART ########################
#########################################################
## Classification tree on all predictors, deviance splitting criterion.
tree_mod <- tree(factor(diag1) ~ ., data = data_train, split = "deviance")
plot(tree_mod)
text(tree_mod, cex = 1.0)

## Train / test confusion matrices for the unpruned tree (column 14, diag1, is
## dropped from the predictors). caret expects predictions in the rows and the
## reference classes in the columns -- the same orientation used for the pruned
## tree below. The reference is given the prediction's levels so the table is
## always square.
tree_pred_tr <- predict(tree_mod, data_train[, -14], type = "class")
tree_tab_tr <- table(
  Prediction = tree_pred_tr,
  Reference = factor(data_train$diag1, levels = levels(tree_pred_tr))
)
confusionMatrix(tree_tab_tr)

tree_pred_te <- predict(tree_mod, data_test[, -14], type = "class")
tree_tab_te <- table(
  Prediction = tree_pred_te,
  Reference = factor(data_test$diag1, levels = levels(tree_pred_te))
)
confusionMatrix(tree_tab_te)

## Cross-validated cost-complexity pruning guided by misclassification.
cv_model <- cv.tree(tree_mod, FUN = prune.misclass)
plot(cv_model)

cv_model$dev  # CV score for each candidate size (smaller is better)

## `best` in prune.misclass() is documented as a single integer. Several sizes
## can tie for the minimum CV score, so keep the smallest (most parsimonious)
## of the tied trees instead of passing a vector.
best_size <- min(cv_model$size[cv_model$dev == min(cv_model$dev)])
best_size

# refit: the pruned tree has no more than best_size terminal nodes
cv_model_pruned <- prune.misclass(tree_mod, best = best_size)
summary(cv_model_pruned)

options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
plot(cv_model_pruned)
text(cv_model_pruned)

pruned_tr_cv <- predict(cv_model_pruned, data_train, type = "class") # predicted class
confusionMatrix(table(Prediction = pruned_tr_cv, Reference = data_train$diag1))

pruned_pred_cv <- predict(cv_model_pruned, data_test, type = "class") # predicted class
confusionMatrix(table(Prediction = pruned_pred_cv, Reference = data_test$diag1))


################################################
## Test-set ROC for the unpruned tree. The predictor is the predicted
## probability of the second class level (type = "vector" returns the class
## probability matrix for a classification tree), not the hard class label,
## so the curve has more than one operating point.
## NOTE(review): `smoothed`, `ci.alpha`, `stratified` and `show.thres` do not
## appear to be documented roc()/plot.roc() arguments; they are passed through
## `...` unchanged -- confirm whether they have any effect.
tree_prob_te <- predict(tree_mod, data_test[, -14], type = "vector")[, 2]
pROC_obj <- roc(response = as.numeric(as.factor(data_test$diag1)),
                predictor = tree_prob_te,
                smoothed = TRUE,
                # arguments for ci
                ci = TRUE, ci.alpha = 0.95, stratified = FALSE,
                # arguments for plot
                plot = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE, grid = TRUE,
                print.auc = TRUE, show.thres = TRUE)


## Confidence band of the sensitivity, drawn over the ROC plot.
sens.ci <- ci.se(pROC_obj)
plot(sens.ci, type = "shape", col = "lightblue")

###################################################################
################## Random Forest for Classification ###############
###################################################################
## Forest of 500 trees, 3 candidate variables per split (about sqrt(p) for the
## 13 predictors). The argument is `ntree`; the misspelling `ntrees` is
## swallowed by `...` and has no effect. `data` is passed by name.
rf <- randomForest(factor(diag1) ~ ., data = data_train,
                   mtry = 3, ntree = 500, importance = TRUE)
plot(rf)
summary(rf)
print(rf)
options(repr.plot.width = 10, repr.plot.height = 5, repr.plot.res = 200)
varImpPlot(rf, main = "Variable Importance Accuracy and Gini coeff. from Random Forest")

importance(rf)

## Class predictions on train and test (column 14, diag1, dropped).
## Predictions on the training rows are optimistic for a forest; the
## out-of-bag estimate is the one shown by print(rf).
rf_pred_tr <- predict(rf, data_train[, -14], type = "response")
rf_pred_te <- predict(rf, data_test[, -14], type = "response")

## caret expects predictions in the rows and reference classes in the columns.
## The reference is given the prediction's levels so the table is square.
rf_tab_tr <- table(
  Prediction = rf_pred_tr,
  Reference = factor(data_train$diag1, levels = levels(rf_pred_tr))
)
rf_tab_te <- table(
  Prediction = rf_pred_te,
  Reference = factor(data_test$diag1, levels = levels(rf_pred_te))
)
confusionMatrix(rf_tab_tr)
confusionMatrix(rf_tab_te)

# importance(rf, type = 1)
# varImpPlot(rf, type = 1)

################################################
## Test-set ROC. The predictor is the forest's vote fraction for the second
## class level (type = "prob"), not the hard class label, so the curve has more
## than one operating point.
## NOTE(review): `smoothed`, `ci.alpha`, `stratified` and `show.thres` do not
## appear to be documented roc()/plot.roc() arguments; they are passed through
## `...` unchanged -- confirm whether they have any effect.
rf_prob_te <- predict(rf, data_test[, -14], type = "prob")[, 2]
pROC_obj <- roc(response = as.numeric(as.factor(data_test$diag1)),
                predictor = rf_prob_te,
                smoothed = TRUE,
                # arguments for ci
                ci = TRUE, ci.alpha = 0.95, stratified = FALSE,
                # arguments for plot
                plot = TRUE, auc.polygon = TRUE, max.auc.polygon = TRUE, grid = TRUE,
                print.auc = TRUE, show.thres = TRUE)


## Confidence band of the sensitivity, drawn over the ROC plot.
sens.ci <- ci.se(pROC_obj)
plot(sens.ci, type = "shape", col = "lightblue")

################## Iteration on mtry for RF #############
#########################################################
## Fit one forest per mtry value (1..13 predictors tried at each split) and
## record its TEST misclassification rate.
mtry_grid <- 1:13

rf.c <- vector("list", length(mtry_grid))     # fitted forests
yhat.rf <- vector("list", length(mtry_grid))  # test-set class predictions
misclass_rf <- numeric(length(mtry_grid))     # test misclassification rates

for (i in mtry_grid) {
  # same seed for every fit, so the forests differ only through mtry
  set.seed(4444)
  rf.c[[i]] <- randomForest(factor(diag1) ~ ., data = data_train, mtry = i, importance = TRUE)

  # Predict on the TEST rows: the predictions are compared with
  # data_test$diag1, so they must come from the same rows. Predicting on the
  # training rows yields a vector of a different length that is silently
  # recycled in the comparison.
  yhat.rf[[i]] <- predict(rf.c[[i]], newdata = data_test[, -14])

  # Compare as character so the result does not depend on diag1 being stored
  # as a factor or as text.
  misclass_rf[i] <- mean(as.character(yhat.rf[[i]]) != as.character(data_test$diag1))
}

par(bg = "white")
matplot(mtry_grid, misclass_rf, xlab = "No. of Variables (mtry)", ylab = "Misclassification error", main = "Subset of predictors for Misclassification error")
lines(mtry_grid, misclass_rf, type = "o")

## tree size estimation ##
rf_tree <- getTree(rf, k = 2) # structure of the second tree of the forest
# print(rf_tree)

treesize(rf) # size of every tree in the ensemble

mean(treesize(rf))

options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
hist(treesize(rf), col = "red")

## plotres() and plotmo() belong to the 'plotmo' package, which this script
## never attaches with library(); the calls are namespace-qualified so they
## resolve as long as the package is installed.
plotmo::plotres(rf)
plotmo::plotmo(rf, pmethod = "partdep", all1 = TRUE, all2 = TRUE)
## end ##