-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathFinal_MentalHealth_Code.R
189 lines (141 loc) · 7.2 KB
/
Final_MentalHealth_Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#-----
#Install Packages
#-----
library(ggplot2)
library(gridExtra)
library(Hmisc)
library(reshape2)
library(DMwR)
library(caret)
library(pROC)
library(RWeka)
library(corrplot)
library(mice)
library(VIM)
library(lattice)
library(plotROC)
library(pROC)
#-----
#Load dB
#This will automatically load your data based on the file path given
#----
readfile <- function()
{
n <- readline(prompt="Enter the file path for the BRFSS data: ")
return(n)
}
mydata=sasxport.get((readfile()))
#----
#Data Scrubbing and Preprocessing
#---
#rename dataframe to prevent overwriting the original file, and identify the column names
mydataclean=mydata
names(mydataclean)=(contents(mydata))$contents$Labels
#remove the 'Record Identification' records from the dB (unnecessary for downstream analysis)
mydatacleanremove=mydataclean[,-c(1:26)]
#count NAs in Data, replace 88, 7, 9 in the data with NAs (see CodeBook Link in final report)
na_count <-sapply(mydatacleanremove, function(y) sum(length(which(is.na(y)))))
na_count <- data.frame(na_count)
na_count
mydatacleanremovenum=as.data.frame(mydatacleanremove)
mydatacleanremovenum[] <- lapply(mydatacleanremove[], as.numeric)
mydatacleanremovenum[mydatacleanremovenum == 88] <- NA
mydatacleanremovenum[mydatacleanremovenum == 7] <- NA
mydatacleanremovenum[mydatacleanremovenum == 9] <- NA
sum(is.na(mydatacleanremovenum))
#replace all NA values with means
for(i in 1:ncol(mydatacleanremovenum)){
mydatacleanremovenum[is.na(mydatacleanremovenum[,i]), i]=mean(mydatacleanremovenum[,i], na.rm=TRUE)
}
sum(is.na(mydatacleanremovenum))
for(i in 1:ncol(mydatacleanremovenum)){
if(sum(is.na(mydatacleanremovenum[,i]))){print(sum(is.na(mydatacleanremovenum[,i])))
print(names(mydatacleanremovenum[i]))}
}
#remove attributes that have no clinical relavance, such as calculated values, and ensure there are no NAs in the data
columnsnotneeded=c(2,3,15,23,30,31,32,50,52,55:57,59,64,67,69,71,77,78,82,83,85,92:95,97,103,107,135:139,155,157,159,160161,164,165,167:170,177:249)
mydatacleanremovenum=mydatacleanremovenum[,-columnsnotneeded]
sum(is.na(mydatacleanremovenum))
#Attribute exploration
#How many Nos arein the mental crises attribute? Yeses?
nos=(mydatacleanremovenum[17]==2)
sum(nos)
yes=(mydatacleanremovenum[17]==1)
sum(yes)
#Keep everything that isn't NA
mentalcrisesrough=mydatacleanremovenum
mentalcrisesdf=mentalcrisesrough[!is.na(mentalcrisesrough[17]),]
#explore the mental health dataset, and move the class attribute to the last column (necessary for downstream analyses)
dim(mentalcrisesdf)
sum(is.na(mentalcrisesdf))
depressive=mentalcrisesdf[17]
colnames(depressive)="depressive"
mentalcrisesdf=data.frame(mentalcrisesdf, depressive)
mentalcrisesdf=mentalcrisesdf[,-c(17)]
#Explore how many Nos and Yeses and NAs are in the dataset (remember - NAs were replaced by the mean)
print(prop.table(table(mentalcrisesdf$depressive)))
#Remove all NAs
mentalcrisesdf=mentalcrisesdf[!(mentalcrisesdf$depressive=="1.82412113226952"),]
print(prop.table(table(mentalcrisesdf$depressive)))
#Subsample the dataframe (this part can be skipped, but was done due to computation limitations)
set.seed(123)
mentalcrisesdfsmall = sample(1:nrow(mentalcrisesdf), size=0.1*nrow(mentalcrisesdf))
mentalsmall = mentalcrisesdf[mentalcrisesdfsmall,]
dim(mentalsmall)
prop.table(table(mentalsmall$depressive))
#Note the Yes/No ratio is approximately the same in the full vs small dataset, suggesting the smaller dataset
#is representative of the whole.
#SMOTE
#Synthetic Minority Over-sampling Technique allows us to balance the class design, eliminating any bias that might hinder
#our downstream analyses.
mentalsmall$depressive = as.factor(mentalsmall$depressive)
mentalsmall_smote=SMOTE(depressive~., data=mentalsmall, k=5, perc.over=110)
table(mentalsmall_smote$depressive)
levels(mentalsmall_smote$depressive)=c("Yes", "No")
sum(mentalsmall_smote$depressive=="Yes")
sum(mentalsmall_smote$depressive=="No")
View(mentalsmall_smote)
dim(mentalsmall_smote)
#FIGURE
#compare the number of classes before and after SMOTE
levels(mentalsmall_smote$depressive)=c("Yes", "No")
smote_graph=ggplot(mentalsmall_smote, aes(depressive, fill=depressive))+geom_bar()+theme_classic(base_size = 22)+scale_fill_manual(values=c("midnightblue","firebrick4"))+guides(fill=F)+
ylab("Number of samples")+xlab("")+ggtitle("B")
levels(mentalsmall$depressive)=c("Yes", "No")
graph=ggplot(mentalsmall, aes(depressive,fill=depressive))+geom_bar()+theme_classic(base_size=22)+
scale_fill_manual(values=c("midnightblue", "firebrick4"))+guides(fill=F)+
ylab("Number of samples")+xlab("")+ggtitle("A")
final=grid.arrange(graph, smote_graph, nrow=1)
final
#Correlation
#To further clean the dataset, only keep attributes with 10%+ correlation with the class attribute (calculated through
#Pearson's correlation)
#pearson's correlation to determine correlation and save the values as a CSV
mentalsmall_smote$depressive=as.numeric(mentalsmall_smote$depressive)
write.csv(cor(mentalsmall_smote, method="pearson"), "correlationoutput.csv")
#include only values with correlation =>0.1
mentalsmall_smote_cor=mentalsmall_smote[, c(1,4,5,7,12,15,16,19,20,21,22,23,24,26,34,35,36,37,38,39,40,75,77,129,130,131, 133)]
#---
#Classification Algorithm
#---
#10 fold cross validation, C4.5 (or J48) tree
tc=trainControl("cv", 10, classProbs = TRUE, savePredictions = T)
mentalsmall_smote_cor$depressive=as.factor(mentalsmall_smote_cor$depressive)
levels(mentalsmall_smote_cor$depressive)=c("Yes", "No")
Clean_c45_mental_tightcor=train(depressive~., data=mentalsmall_smote_cor, method="J48", trControl=tc)
Clean_c45_mental_tightcor$results
confusionMatrix(Clean_c45_mental_tightcor)
Clean_c45_mental_tightcor$finalModel
#---
#Assessing Algorithm
#---
#Claculating the ROC area
pred=predict(Clean_c45_mental_tightcor, newdata=mentalsmall_smote_cor, type="prob")
rocgraphno=roc(predictor=pred$No, response=mentalsmall_smote_cor$depressive)
rocgraphno
rocgraphyes=roc(predictor=pred$Yes, response=mentalsmall_smote_cor$depressive)
rocgraphyes$specificities=1-rocgraphyes$specificities
rocgraphno$specificities=1-rocgraphno$specificities
g=ggroc(list(yes=rocgraphyes, no=rocgraphno))+theme_classic(base_size = 22)+scale_colour_manual(values=c("black", "black"))+guides(colour=FALSE)+
xlab("Specificity")+ylab("True Positive Rate (Sensitivity)")
g