# test of using all columns December 26 2020.R

# ---- Session setup ----
# Load the packages used below and create the (initially empty) summary frame.
# The global environment is deliberately NOT cleared with rm(list = ls()):
# wiping the caller's workspace is a destructive side effect, and every object
# this script reads is (re)initialised explicitly before it is used.
library(tidyverse)
library(ISLR)
library(gtools)
summarydf <- data.frame()
#df <- read.csv('https://web.stanford.edu/~hastie/ElemStatLearn/datasets/SAheart.data', header = TRUE, stringsAsFactors = TRUE)
#
#df <- read.csv('https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv', header = TRUE)
#df <- read.csv('https://raw.githubusercontent.com/mlittmancs/great_courses_ml/master/data/ship.csv', header = TRUE, stringsAsFactors = TRUE)
# ---- Build the modelling data set ----
# Start from mtcars and derive a binary response: 1 when mpg > 25, else 0.
df <- mtcars
df$value <- ifelse(df$mpg > 25, 1, 0)
df
# Keep the ten predictors plus the response. mpg itself is left out because
# the response is derived directly from it.
df <- df[, c("cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear",
             "carb", "value")]
# The response is the final column; give it the fixed name "last" so the
# model formula (last ~ .) does not depend on the data set in use.
names(df)[ncol(df)] <- "last"
# Shuffle the rows.
df <- df[sample(nrow(df)), ]
# Split into the response vector and the predictor-only data frame.
last <- df[[ncol(df)]]
# seq_len(ncol(df) - 1) selects columns 1..(n-1) explicitly. The previous
# 1:ncol(df)-1 parsed as (1:ncol(df)) - 1, i.e. 0:(n-1), and only worked
# because a zero index is silently ignored. drop = FALSE keeps a data frame
# even if a single predictor remains.
df <- df[, seq_len(ncol(df) - 1), drop = FALSE]

# ---- Trackers for the model search ----
# For each metric the script keeps: the latest value, the best value seen so
# far (*tmp), a log of confusion tables and a log of the scores.

# Accuracy. Its table log is seeded with an all-zero 2 x 2 frame
# (rows "No"/"Yes", columns X0/X1).
accuracy <- 0
accuracytmp <- 0
accuracy.table <- data.frame(
  X0 = c(0, 0),
  X1 = c(0, 0),
  row.names = c("No", "Yes")
)
accuracy.df <- data.frame(accuracy)

# Scratch values: a second copy of the all-zero frame, a running sum and the
# counter j (also reused as a loop index later in the script).
dummy1.df <- accuracy.table
sumtable1 <- 0
j <- 0

# Sensitivity.
sensitivity <- 0
sensitivitytmp <- 0
sensitivity.table <- data.frame()
sensitivity.df <- data.frame(sensitivity)
pos.percentage.df <- data.frame(j)
maxyes <- 0

# Specificity.
specificity <- 0
specificitytmp <- 0
specificity.table <- data.frame()
specificity.df <- data.frame(specificity)
neg.percentage.df <- data.frame(j)
maxno <- 0

# Misclassification error.
misclassification.error <- 0
misclassificationtmp <- 0
misclass <- data.frame(misclassification.error)

# One-row overview of the four metrics.
total <- data.frame(accuracy, sensitivity, specificity, misclass)

# Current confusion table (as a data frame) and the log of all of them; both
# start out empty.
table1.df <- data.frame()
tables.df <- data.frame()

# All-zero 2 x 2 tables with row names "No"/"Yes", used as the initial
# "best so far" confusion tables.
temptable <- as.table(
  matrix(0, nrow = 2, ncol = 2, dimnames = list(c("No", "Yes"), NULL))
)
temptable1 <- temptable

# ---- Exhaustive logistic-regression search over predictor subsets ----
# For each of 10 repetitions, every non-empty combination of the predictor
# columns in df is fitted on a random training split and scored on the
# held-out rows. Whenever a model sets a new best for accuracy, sensitivity
# or specificity, its confusion table and score are appended to the matching
# log and the fitted model is saved under /tmp.

# Set to TRUE to print every train/test split (very noisy: two data frames
# per fitted model).
verbose <- FALSE

for (i in 1:10) {
  for (j in seq_len(ncol(df))) { # j = number of predictors in the subset
    # All size-j combinations (order does not matter) of the column indices.
    combin <- combinations(n = ncol(df), r = j, repeats.allowed = FALSE)
    for (k in seq_len(nrow(combin))) {
      colvals <- combin[k, ]
      # drop = FALSE keeps a single selected column as a data frame, so the
      # predictor retains its real name in the fitted model.
      newdf <- df[, colvals, drop = FALSE]
      newdf$last <- last

      # Random training fraction between 25% and 75% of the rows.
      ratio <- round(runif(1, 0.25, 0.75), 2)
      dfsize <- as.integer(nrow(newdf) * ratio)
      train <- sample(nrow(newdf), dfsize, replace = FALSE)
      train.df <- newdf[train, ]
      test.df <- newdf[-train, ]
      if (verbose) {
        print(train.df)
        print(test.df)
      }

      # Fit on the training rows and classify the test rows at a 0.5 cutoff.
      glm.fits <- glm(last ~ ., data = train.df, family = binomial)
      glm.probs <- predict(glm.fits, test.df, type = "response")
      glm.pred <- rep("No", nrow(test.df))
      glm.pred[glm.probs > .5] <- "Yes"

      # Confusion table: rows are predictions ("No", "Yes"), columns are the
      # actual values of last. Read column-major, a full 2 x 2 table is
      #   [1] No/0 (TN)  [2] Yes/0 (FP)  [3] No/1 (FN)  [4] Yes/1 (TP)
      table1 <- table(glm.pred, test.df$last)
      table1.df <- data.frame(unclass(table1))

      # Only a full 2 x 2 table can be scored. A 1 x 1 table is square too,
      # but its diagonal is its only cell (accuracy would always be 1) and
      # cell 4 does not exist (NA), so anything but 2 x 2 is skipped.
      if (!all(dim(table1) == 2L)) {
        next
      }

      tables.df <- rbind(tables.df, table1.df)

      #### - Measure overall accuracy of the GLM ####
      accuracy <- sum(diag(table1)) / sum(table1)
      if (accuracy > accuracytmp) {
        accuracytmp <- accuracy
        accuracy.table <- rbind(accuracy.table, table1.df)
        accuracy.df <- rbind(accuracy.df, accuracy)
        saveRDS(glm.fits, file = "/tmp/glm.max.accuracy.rda")
      }

      ### Measure the sensitivity (positive accuracy) of the GLM: TP / (FN + TP)
      sensitivity <- table1[4] / sum(table1[3:4])
      # A new best needs a higher (or perfect) rate AND more true positives
      # than the best table so far.
      if ((sensitivity > sensitivitytmp || sensitivity == 1) &&
            table1[4] > temptable[4]) {
        sensitivity.table.df <- as.data.frame.matrix(table1)
        sensitivity.table <- rbind(sensitivity.table, sensitivity.table.df)
        sensitivity.df <- rbind(sensitivity.df, sensitivity)
        sensitivitytmp <- sensitivity
        temptable <- table1
        pos.percentage.df <- rbind(pos.percentage.df, j)
        saveRDS(glm.fits, file = "/tmp/glm.max.sensitivity.rda")
      }

      ### Measure the specificity (negative accuracy) of the GLM: TN / (TN + FP)
      specificity <- table1[1] / sum(table1[1:2])
      # Same rule as above, using true negatives.
      if ((specificity > specificitytmp || specificity == 1) &&
            table1[1] > temptable1[1]) {
        specificity.table.df <- as.data.frame.matrix(table1)
        specificity.table <- rbind(specificity.table, specificity.table.df)
        specificity.df <- rbind(specificity.df, specificity)
        specificitytmp <- specificity
        temptable1 <- table1
        # Record the subset size in the negative-class log (this used to be
        # appended to pos.percentage.df by mistake).
        neg.percentage.df <- rbind(neg.percentage.df, j)
        saveRDS(glm.fits, file = "/tmp/glm.max.specificity.rda")
      }
    }
  }
}


############ ----------- display results to the user ---------------- ####################


# For each metric: print the best score, the summary of the model saved for
# it, and the last two rows of its table log (the most recently logged 2 x 2
# confusion table). Everything is printed explicitly so the results also
# appear when the script is run through source(). A metric for which no model
# file exists is reported with a message instead of failing in readRDS().

print("The highest overall accuracy:")
print(max(accuracy.df))
if (file.exists("/tmp/glm.max.accuracy.rda")) {
  max.accuracy <- readRDS(file = "/tmp/glm.max.accuracy.rda")
  print(summary(max.accuracy))
} else {
  message("No model was saved for the highest accuracy.")
}
# tail() is also safe when a log is empty.
print(tail(accuracy.table, 2))


print("The highest sensitivity (positive accuracy)")
print(max(sensitivity.df))
if (file.exists("/tmp/glm.max.sensitivity.rda")) {
  max.sensitivity <- readRDS(file = "/tmp/glm.max.sensitivity.rda")
  print(summary(max.sensitivity))
} else {
  message("No model was saved for the highest sensitivity.")
}
print(tail(sensitivity.table, 2))


print("The highest specificity (negative accuracy)")
print(max(specificity.df))
if (file.exists("/tmp/glm.max.specificity.rda")) {
  max.specificity <- readRDS(file = "/tmp/glm.max.specificity.rda")
  print(summary(max.specificity))
} else {
  message("No model was saved for the highest specificity.")
}
print(tail(specificity.table, 2))