-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRF.m
181 lines (130 loc) · 6.4 KB
/
RF.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
% -------------------------------------------------------------------------
% Load the breast-cancer table and create a stratified hold-out split.
% Produces: X, Y (full data), X_Train / y_Train (70 %), X_Test / y_Test (30 %).
% -------------------------------------------------------------------------
tic
clear;
clc;
% Import data
brc = readtable('Breast_cancer2.csv');
% The summary of brc reveals three columns with significant outliers
% (area_mean, area_se, area_worst).
summary(brc);
% Table size (569 rows, 32 columns)
[r, c] = size(brc);
% Separate the predictors (columns 3..32) from the response (column 2)
X = table2array(brc(:,3:32));
Y = table2array(brc(:,2));
% Keep all variable names and the predictor sub-table for later inspection
dep_variables = brc.Properties.VariableNames;
X_variables = brc(:,3:end);
% Class frequencies (displayed). Original note: 357 malignant and 212 benign
% tumours - NOTE(review): confirm which label value maps to which diagnosis.
count_y0_y1 = tabulate(Y)
% Seed the generator BEFORE drawing the hold-out partition; otherwise the
% train/test split (and every metric reported below) changes on each run.
rng(1);
% Passing Y makes cvpartition stratify the split, which compensates for the
% slight class imbalance.
cv = cvpartition(Y,'holdout',0.3);
X_Train = X(training(cv,1),:);
y_Train = Y(training(cv,1));
X_Test = X(test(cv,1),:);
y_Test = Y(test(cv,1),:);
% Data normalization (z-score)
% The predictors are standardised to zero mean and unit standard deviation
% in response to the three columns with significant outliers seen in the
% summary. normalize() returns the per-column centre (mu) and scale (stddev)
% it used on the training set.
[X_Train, mu, stddev] = normalize(X_Train);
% Apply the training-set centre and scale to the test set so that no
% statistics are estimated from the test data itself.
X_Test = (X_Test - mu) ./ stddev;
% Hyperparameter tuning, stage 1: Bayesian optimisation.
% Bayesian optimisation was chosen for the first tuning pass because the
% data set is highly correlated.
% Because three variables are optimised, bayesopt creates a single figure:
% the estimated and observed minimum of the objective function.
%
% Three optimizable variables are declared with a name, a range and a type:
%   feat_2  - number of predictors sampled at each split. The upper bound is
%             the number of predictors; larger values cannot sample more
%             features and are at best redundant.
%   split_2 - split criterion of the base trees.
%   num_lc2 - number of learning cycles (trees) in the bagged ensemble.
feat_2 = optimizableVariable('feat_2',[1, size(X_Train, 2)],'Type','integer');
split_2 = optimizableVariable('split_2',{'gdi', 'deviance'}, 'Type','categorical');
num_lc2 = optimizableVariable('num_lc2',[10, 400], 'Type','integer');
n2 = size(X_Train, 1);
rng(1);
% The same 10-fold partition is reused for every candidate so that the
% losses of different hyperparameter sets are comparable.
cv2 = cvpartition(n2,'Kfold',10);
% Objective function: 10-fold cross-validated classification loss of a
% bagged ensemble built with the candidate hyperparameters in x.
% x is the one-row table supplied by bayesopt; its fields MUST be forwarded
% to the model, otherwise every evaluation fits the same default ensemble
% and the optimiser has nothing to optimise.
fun = @(x) kfoldLoss(fitcensemble(X_Train, y_Train, ...
    'CVPartition', cv2, ...
    'Method', 'Bag', ...
    'NumLearningCycles', x.num_lc2, ...
    'Learners', templateTree( ...
        'NumVariablesToSample', x.feat_2, ...
        'SplitCriterion', char(x.split_2))));
% Limit the search to 150 objective evaluations; the acquisition function is
% left at the bayesopt default.
bo_res2 = bayesopt(fun,[feat_2, split_2, num_lc2],'Verbose',1, 'MaxObjectiveEvaluations',150)
% Best observed combination (the one that gave the minimum error)
feat2_bo = bo_res2.XAtMinObjective.feat_2;
split2_bo = bo_res2.XAtMinObjective.split_2;
numlc2_bo = bo_res2.XAtMinObjective.num_lc2;
% Minimum cross-validated error over the evaluated hyperparameter sets
min_error = bo_res2.MinObjective;
% Build the base-learner template from the hyperparameters selected by
% the Bayesian optimisation.
t2 = templateTree( ...
    'NumVariablesToSample', feat2_bo, ...
    'SplitCriterion', char(split2_bo));
% Refit the random forest on the full training set with that template.
rng(1)
bomdl2 = fitcensemble(X_Train, y_Train, ...
    'Method', 'Bag', ...
    'NumLearningCycles', numlc2_bo, ...
    'learners', t2);
% Resubstitution loss: misclassification rate on the training data itself.
bomdl2_rloss = resubLoss(bomdl2);
% Classification loss of the refitted model on the held-out test set.
bomdl2_loss = loss(bomdl2, X_Test, y_Test);
% Evaluate the Bayesian-optimised model on the test set.
figure()
bo_Predict2 = predict(bomdl2, X_Test);
% confusionmat and confusionchart both take (trueLabels, predictedLabels):
% rows are the true classes, columns the predicted classes. Passing them in
% this order keeps the chart's "True Class" / "Predicted Class" axes correct.
cm_bo_rf = confusionmat(y_Test, bo_Predict2)
matrx = confusionchart(y_Test, bo_Predict2)
% Accuracy (%): correctly classified samples over all test samples
Accuracy_bo = 100*trace(cm_bo_rf)./sum(cm_bo_rf(:))
% The metrics below treat the first class in the matrix as "positive".
% NOTE(review): confirm that the first class is the intended tumour class.
% Precision: correct positives over everything predicted positive (column 1)
rf_bo_precision = cm_bo_rf(1,1)./(cm_bo_rf(1,1)+cm_bo_rf(2,1));
% Recall: correct positives over everything truly positive (row 1)
rf_bo_recall = cm_bo_rf(1,1)./(cm_bo_rf(1,1)+cm_bo_rf(1,2));
% F1 score: harmonic mean of precision and recall
f1_Scores_bo = 2*(rf_bo_precision.*rf_bo_recall)./(rf_bo_precision+rf_bo_recall)
% Identify misclassified tumours.
% The set sizes must come from the split, not from the full data set,
% otherwise the error percentage is divided by the wrong denominator.
testheight = size(X_Test,1)
trainheight = size(X_Train,1)
misClass_rf_bayesopt = sum(cm_bo_rf(:)) - trace(cm_bo_rf);
errTumour_bo = 100*misClass_rf_bayesopt/testheight;
% Hyperparameter tuning, stage 2: grid search scored by out-of-bag error.
% This method is computationally expensive, so the number of trees is
% reduced: the biggest run was done at 2000 trees, but 100 is used here for
% computational efficiency.
numTrees = 100
% Best hyperparameters found so far and their out-of-bag error.
% The baseline starts at Inf so the first grid point is always accepted;
% NaN placeholders make it obvious if no grid point was ever selected.
opt_LS_grdSrch = NaN;
opt_PTS_grdSrch = NaN;
base_misclass = Inf;
% Seed once so the search is reproducible from run to run.
rng(1);
% NumPredictorsToSample cannot usefully exceed the number of predictors.
numPredictors = size(X_Train, 2);
for min_LS = 1:20
    for num_PTS = 1:numPredictors
        rf_mdl_grdSrch = TreeBagger(numTrees, X_Train, y_Train, ...
            'Method', 'classification', ...
            'OOBPrediction', 'on', ...
            'MinLeafSize', min_LS, ...
            'NumPredictorsToSample', num_PTS);
        % oobError returns the cumulative out-of-bag error for 1..numTrees
        % trees; the last entry is the error of the complete ensemble,
        % which is the model that would actually be used.
        misclass_grdSrch = oobError(rf_mdl_grdSrch);
        final_oob_err = misclass_grdSrch(end);
        if final_oob_err < base_misclass
            opt_LS_grdSrch = min_LS;
            opt_PTS_grdSrch = num_PTS;
            base_misclass = final_oob_err;
        end
    end
end
% Refit the forest with the hyperparameters selected by the grid search
% (the fitted model is displayed).
bg_Mdl_grdSrch = TreeBagger(numTrees, X_Train, y_Train, ...
    'Method', 'Classification', ...
    'OOBPrediction', 'on', ...
    'MinLeafSize', opt_LS_grdSrch, ...
    'NumPredictorsToSample', opt_PTS_grdSrch)
% Display the selected hyperparameters
opt_LS_grdSrch
opt_PTS_grdSrch
% Predict the test set. The predictions are converted with str2double so
% they can be compared with y_Test.
% NOTE(review): this assumes the class labels are numeric - confirm.
label1 = predict(bg_Mdl_grdSrch, X_Test)
label2 = str2double(label1)
% Side-by-side table of true and predicted labels
ta = table(y_Test, label1, ...
    'VariableNames', {'TrueLabel', 'PredictedLabel'});
% Evaluate the grid-search model on the test set.
figure()
% confusionmat and confusionchart both take (trueLabels, predictedLabels):
% rows are the true classes, columns the predicted classes. Passing them in
% this order keeps the chart's "True Class" / "Predicted Class" axes correct.
rf_cnf_grdSrch = confusionmat(y_Test, label2)
rf_con_chart = confusionchart(y_Test, label2)
% Accuracy (%) of the model tuned by grid search
Accuracy_rf_grdSrch = 100*trace(rf_cnf_grdSrch)./sum(rf_cnf_grdSrch(:))
% The metrics below treat the first class in the matrix as "positive".
% NOTE(review): confirm that the first class is the intended tumour class.
% Precision: correct positives over everything predicted positive (column 1)
rf_precision_grdSrch = rf_cnf_grdSrch(1,1)./(rf_cnf_grdSrch(1,1)+rf_cnf_grdSrch(2,1));
% Recall: correct positives over everything truly positive (row 1)
rf_recall_grdSrch = rf_cnf_grdSrch(1,1)./(rf_cnf_grdSrch(1,1)+rf_cnf_grdSrch(1,2));
% F1 score: harmonic mean of precision and recall
f1_Scores_grdSrch = 2*(rf_precision_grdSrch.*rf_recall_grdSrch)./(rf_precision_grdSrch+rf_recall_grdSrch)
% Identify misclassified tumours.
% The set sizes must come from the split, not from the full data set,
% otherwise the error percentage is divided by the wrong denominator.
testheight = size(X_Test,1)
trainheight = size(X_Train,1)
misClass_rf_grdSrch = sum(rf_cnf_grdSrch(:)) - trace(rf_cnf_grdSrch);
errTumour_grdSrch = 100*misClass_rf_grdSrch/testheight;
toc