#evaluate.py
#Copyright (c) 2020 Rachel Lea Ballantyne Draelos
#MIT License
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
#Imports
import os
import copy
import time
import bisect
import shutil
import operator
import itertools
import numpy as np
import pandas as pd
import sklearn.metrics
from itertools import cycle
import matplotlib
matplotlib.use('agg') #so that it does not attempt to display via SSH
import seaborn
import matplotlib.pyplot as plt
plt.ioff() #turn interactive plotting off
#suppress numpy warnings
import warnings
warnings.filterwarnings('ignore')
#######################
# Reporting Functions #---------------------------------------------------------
#######################
def initialize_evaluation_dfs(all_labels, num_epochs):
"""Create empty "eval_dfs_dict"
Variables
<all_labels>: a list of strings describing the labels in order
<num_epochs>: int for total number of epochs"""
if len(all_labels)==2:
index = [all_labels[1]]
numrows = 1
else:
index = all_labels
numrows = len(all_labels)
#Initialize empty pandas dataframe to store evaluation results across epochs
#for accuracy, AUROC, and AP
result_df = pd.DataFrame(data=np.zeros((numrows, num_epochs)),
index = index,
columns = ['epoch_'+str(n) for n in range(0,num_epochs)])
#Initialize empty pandas dataframe to store evaluation results for top k
top_k_result_df = pd.DataFrame(np.zeros((len(all_labels), num_epochs)),
index=[x for x in range(1,len(all_labels)+1)], #e.g. 1,...,64 for len(all_labels)=64
columns = ['epoch_'+str(n) for n in range(0,num_epochs)])
#Make eval results dictionaries
eval_results_valid = {'accuracy':copy.deepcopy(result_df),
'auroc':copy.deepcopy(result_df),
'avg_precision':copy.deepcopy(result_df),
'top_k':top_k_result_df}
eval_results_test = copy.deepcopy(eval_results_valid)
return eval_results_valid, eval_results_test
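#Illustrative sketch (not part of the original module): what the dataframes
#returned by initialize_evaluation_dfs() look like for a hypothetical 3-label,
#2-epoch run. The label names below are made up for illustration.
def _example_initialize_evaluation_dfs():
    valid_dfs, test_dfs = initialize_evaluation_dfs(['atelectasis','nodule','mass'], 2)
    #valid_dfs['accuracy'], valid_dfs['auroc'], and valid_dfs['avg_precision']
    #each have one row per label and one column per epoch, initialized to zero:
    #              epoch_0  epoch_1
    #atelectasis       0.0      0.0
    #nodule            0.0      0.0
    #mass              0.0      0.0
    #valid_dfs['top_k'] is instead indexed by k = 1, 2, 3.
    #test_dfs is an independent deep copy with the same structure.
    return valid_dfs, test_dfs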
def save(eval_dfs_dict, results_dir, descriptor):
"""Variables
<eval_dfs_dict> is a dict of pandas dataframes
<descriptor> is a string"""
for k in eval_dfs_dict.keys():
eval_dfs_dict[k].to_csv(os.path.join(results_dir, descriptor+'_'+k+'_Table.csv'))
def save_final_summary(eval_dfs_dict, best_valid_epoch, setname, results_dir):
"""Save to overall df and print summary of best epoch."""
#final_descriptor is e.g. '2019-11-15-awesome-model_epoch15'
final_descriptor = results_dir.replace('results/','')+'_epoch'+str(best_valid_epoch)
if setname=='valid': print('***Summary for',setname,results_dir,'***')
for metricname in list(eval_dfs_dict.keys()):
#metricnames are accuracy, auroc, avg_precision, and top_k.
#df holds a particular metric for the particular model we just ran.
#for accuracy, auroc, and avg_precision, df index is diseases, columns are epochs.
#for top_k, df index is the k value (an int) and columns are epochs.
df = eval_dfs_dict[metricname]
#all_df tracks results of all models in one giant table.
#all_df has index of diseases or k value, and columns which are particular models.
all_df_path = os.path.join('results',setname+'_'+metricname+'_all.csv') #e.g. valid_accuracy_all.csv
if os.path.isfile(all_df_path):
all_df = pd.read_csv(all_df_path,header=0,index_col=0)
all_df[final_descriptor] = np.nan
else: #all_df doesn't exist yet - create it.
all_df = pd.DataFrame(np.empty((df.shape[0],1)),
index = df.index.values.tolist(),
columns = [final_descriptor])
#Print off and save results for best_valid_epoch
if setname=='valid': print('\tEpoch',best_valid_epoch,metricname)
for label in df.index.values:
#print off to console
value = df.at[label,'epoch_'+str(best_valid_epoch)]
if setname=='valid': print('\t\t',label,':',str( round(value, 3) ))
#save in all_df
all_df.at[label,final_descriptor] = value
all_df.to_csv(all_df_path,header=True,index=True)
def clean_up_output_files(best_valid_epoch, results_dir):
"""Delete output files that aren't from the best epoch"""
#Delete all the backup parameters (they take a lot of space and you do not
#need to have them)
shutil.rmtree(os.path.join(results_dir,'backup'))
#Delete all the extra output files:
for subdir in ['heatmaps','curves','pred_probs']:
#Clean up saved ROC and PR curves
fullpath = os.path.join(results_dir,subdir)
if os.path.exists(fullpath): #e.g. there may not be a heatmaps dir for a non-bottleneck model
allfiles = os.listdir(fullpath)
for filename in allfiles:
if str(best_valid_epoch) not in filename:
os.remove(os.path.join(fullpath,filename))
print('Output files all clean')
#########################
# Calculation Functions #-------------------------------------------------------
#########################
def evaluate_all(eval_dfs_dict, epoch, label_meanings,
true_labels_array, pred_probs_array):
"""Fill out the pandas dataframes in the dictionary <eval_dfs_dict>
which is created in cnn.py. <epoch> and <which_label> are used to index into
the dataframe for the metric. Metrics calculated for the provided vectors
are: accuracy, AUC, partial AUC (threshold 0.2), and average precision.
If <subjective> is set to True, additional metrics will be calculated
(confusion matrix, sensitivity, specificity, PPV, NPV.)
Variables:
<all_eval_results> is a dictionary of pandas dataframes created in cnn.py
<epoch> is an integer indicating which epoch it is, starting from epoch 1
<true_labels_array>: array of true labels. examples x labels
<pred_probs_array>: array of predicted probabilities. examples x labels"""
#Accuracy, AUROC, and AP (iter over labels)
for label_number in range(len(label_meanings)):
which_label = label_meanings[label_number] #descriptive string for the label
true_labels = true_labels_array[:,label_number]
pred_probs = pred_probs_array[:,label_number]
pred_labels = (pred_probs>=0.5).astype(dtype='int') #decision threshold of 0.5
#Accuracy and confusion matrix (dependent on decision threshold)
(eval_dfs_dict['accuracy']).at[which_label, 'epoch_'+str(epoch)] = compute_accuracy(true_labels, pred_labels)
#confusion_matrix, sensitivity, specificity, ppv, npv = compute_confusion_matrix(true_labels, pred_labels)
#AUROC and AP (sliding across multiple decision thresholds)
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = true_labels,
y_score = pred_probs,
pos_label = 1)
(eval_dfs_dict['auroc']).at[which_label, 'epoch_'+str(epoch)] = sklearn.metrics.auc(fpr, tpr)
(eval_dfs_dict['avg_precision']).at[which_label, 'epoch_'+str(epoch)] = sklearn.metrics.average_precision_score(true_labels, pred_probs)
#Top k eval metrics (iter over examples)
eval_dfs_dict['top_k'] = evaluate_top_k(eval_dfs_dict['top_k'],
epoch, true_labels_array, pred_probs_array)
return eval_dfs_dict
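#Illustrative sketch (not part of the original module): filling in the
#evaluation dataframes for a single epoch. The label names and the small
#hand-made arrays below are made up for illustration; in the real pipeline
#they come from the model (see cnn.py, which creates eval_dfs_dict).
def _example_evaluate_all():
    label_meanings = ['atelectasis','nodule','mass']
    valid_dfs, _ = initialize_evaluation_dfs(label_meanings, num_epochs=1)
    true_labels_array = np.array([[1,0,1],[0,1,0],[1,1,0],
                                  [0,0,1],[1,0,0],[0,1,1]]) #examples x labels
    pred_probs_array = np.array([[0.9,0.2,0.7],[0.1,0.8,0.3],[0.8,0.6,0.2],
                                 [0.3,0.1,0.9],[0.7,0.4,0.4],[0.2,0.9,0.6]])
    valid_dfs = evaluate_all(valid_dfs, 0, label_meanings,
                             true_labels_array, pred_probs_array)
    #valid_dfs['auroc'].at['atelectasis','epoch_0'] now holds that label's AUROC
    return valid_dfs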
#################
# Top K Metrics #---------------------------------------------------------------
#################
def evaluate_top_k(eval_top_k_df, epoch, true_labels_array,
pred_probs_array):
"""<eval_top_k_df> is a pandas dataframe with epoch number as columns and
k values as rows, where k is an integer"""
num_labels = true_labels_array.shape[1] #e.g. 64
total_examples = true_labels_array.shape[0]
vals = [0 for x in range(1,num_labels+2)] #e.g. length 65 list but the index of the last element is 64 for num_labels=64
for example_number in range(total_examples):
#iterate through individual examples (predictions for an individual CT)
#rather than iterating through predicted labels
true_labels = true_labels_array[example_number,:]
pred_probs = pred_probs_array[example_number,:]
for k in range(1,num_labels+1): #e.g. 1,...,64
previous_value = vals[k]
incremental_update = calculate_top_k_accuracy(true_labels, pred_probs, k)
new_value = previous_value + incremental_update
vals[k] = new_value
#Now update the dataframe. Should reach 100% performance by the end.
for k in range(1,num_labels+1):
eval_top_k_df.at[k,'epoch_'+str(epoch)] = vals[k]/total_examples
##Now average over all the examples
#eval_top_k_df.loc[:,'epoch_'+str(epoch)] = eval_top_k_df.loc[:,'epoch_'+str(epoch)] / total_examples
return eval_top_k_df
def calculate_top_k_accuracy(true_labels, pred_probs, k):
k = min(k, len(true_labels)) #avoid accessing array elements that don't exist
#argpartition described here: https://stackoverflow.com/questions/6910641/how-do-i-get-indices-of-n-maximum-values-in-a-numpy-array
#get the indices of the largest k probabilities
ind = np.argpartition(pred_probs, -1*k)[-1*k:]
#now figure out what percent of these top predictions were equal to 1 in the
#true_labels.
#Note that the denominator should not exceed the number of true labels, to
#avoid penalizing the model inappropriately:
denom = min(k, np.sum(true_labels))
if denom == 0: #because np.sum(true_labels) is 0
#super important! must return 1 to avoid dividing by 0 and producing nan
#we don't return 0 because then the model can never get perfect performance
#even at k=num_labels because it'll get 0 for anything that has no labels
return 1
else:
return float(np.sum(true_labels[ind]))/denom
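#Illustrative sketch (not part of the original module): a worked example of the
#top-k logic above. With two positives and k=2, the top-2 predicted
#probabilities cover one of the two positives, so top-2 accuracy is 1/2.
def _example_calculate_top_k_accuracy():
    true_labels = np.array([1, 0, 1, 0])
    pred_probs = np.array([0.9, 0.8, 0.1, 0.05])
    assert calculate_top_k_accuracy(true_labels, pred_probs, k=2) == 0.5
    #With k=4 the denominator is capped at the number of positives (2), and
    #both positives fall within the top 4, so the score is 1.0.
    assert calculate_top_k_accuracy(true_labels, pred_probs, k=4) == 1.0
    #If an example has no positive labels the function returns 1 by design.
    assert calculate_top_k_accuracy(np.array([0, 0]), np.array([0.3, 0.7]), k=1) == 1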
######################
# Accuracy and AUROC #----------------------------------------------------------
######################
def compute_accuracy(true_labels, labels_pred):
"""Print and save the accuracy of the model on the dataset"""
correct = (true_labels == labels_pred)
correct_sum = correct.sum()
return (float(correct_sum)/len(true_labels))
def compute_confusion_matrix(true_labels, labels_pred):
"""Return the confusion matrix"""
cm = sklearn.metrics.confusion_matrix(y_true=true_labels,
y_pred=labels_pred)
if cm.size < 4: #cm is too small to calculate anything
return np.nan, np.nan, np.nan, np.nan, np.nan
true_neg, false_pos, false_neg, true_pos = cm.ravel()
sensitivity = float(true_pos)/(true_pos + false_neg)
specificity = float(true_neg)/(true_neg + false_pos)
ppv = float(true_pos)/(true_pos + false_pos)
npv = float(true_neg)/(true_neg + false_neg)
return((str(cm).replace("\n","_")), sensitivity, specificity, ppv, npv)
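#Illustrative sketch (not part of the original module): with the ground truth
#and predictions below the confusion matrix is [[2, 1], [1, 2]], giving
#sensitivity = specificity = PPV = NPV = 2/3.
def _example_compute_confusion_matrix():
    true_labels = np.array([0, 0, 0, 1, 1, 1])
    labels_pred = np.array([0, 0, 1, 0, 1, 1])
    cm_string, sensitivity, specificity, ppv, npv = compute_confusion_matrix(
        true_labels, labels_pred)
    assert abs(sensitivity - 2/3) < 1e-9 and abs(specificity - 2/3) < 1e-9
    assert abs(ppv - 2/3) < 1e-9 and abs(npv - 2/3) < 1e-9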
def compute_partial_auroc(fpr, tpr, thresh = 0.2, trapezoid = False, verbose=False):
fpr_thresh, tpr_thresh = get_fpr_tpr_for_thresh(fpr, tpr, thresh)
if len(fpr_thresh) < 2:#can't calculate an AUC with only 1 data point
return np.nan
if verbose:
print('fpr: '+str(fpr))
print('fpr_thresh: '+str(fpr_thresh))
print('tpr: '+str(tpr))
print('tpr_thresh: '+str(tpr_thresh))
return sklearn.metrics.auc(fpr_thresh, tpr_thresh)
def get_fpr_tpr_for_thresh(fpr, tpr, thresh):
"""The <fpr> and <tpr> are already sorted according to threshold (which is
sorted from highest to lowest, and is NOT the same as <thresh>; threshold
is the third output of sklearn.metrics.roc_curve and is a vector of the
thresholds used to calculate FPR and TPR). This function figures out where
to bisect the FPR so that the remaining elements are no greater than
<thresh>. It bisects the TPR in the same place."""
p = (bisect.bisect_left(fpr, thresh)-1) #subtract one so that the FPR
#of the remaining elements is NO GREATER THAN <thresh>
return fpr[: p + 1], tpr[: p + 1]
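#Illustrative sketch (not part of the original module): computing the partial
#AUROC restricted to FPR <= 0.2 with compute_partial_auroc(), using randomly
#generated labels and scores that are made up for illustration.
def _example_compute_partial_auroc():
    rng = np.random.RandomState(0)
    true_labels = rng.randint(0, 2, size=200)
    #scores that are noisy but correlated with the labels
    pred_probs = 0.3*true_labels + 0.7*rng.uniform(0, 1, size=200)
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(true_labels, pred_probs)
    return compute_partial_auroc(fpr, tpr, thresh=0.2)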
######################
# Plotting Functions #----------------------------------------------------------
######################
def plot_pr_and_roc_curves(results_dir, label_meanings, true_labels, pred_probs,
                           epoch):
    """Plot and save a single-class PR curve and ROC curve to <results_dir>."""
    #Plot Precision Recall Curve
    pr_outfilepath = os.path.join(results_dir, 'PR_ep'+str(epoch)+'.pdf')
    plot_pr_curve_single_class(true_labels, pred_probs, pr_outfilepath)
    #Plot ROC Curve
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = true_labels,
                                                     y_score = pred_probs,
                                                     pos_label = 1)
    roc_outfilepath = os.path.join(results_dir, 'ROC_ep'+str(epoch)+'.pdf')
    plot_roc_curve_single_class(fpr, tpr, epoch, roc_outfilepath)
def plot_roc_curve_multi_class(label_meanings, y_test, y_score,
outdir, setname, epoch):
"""<label_meanings>: list of strings, one for each label
<y_test>: matrix of ground truth
<y_score>: matrix of predicted probabilities
<outdir>: directory to save output file
<setname>: string e.g. 'train' 'valid' or 'test'
<epoch>: int for epoch"""
#Modified from https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
n_classes = len(label_meanings)
lw = 2
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
fpr[i], tpr[i], _ = sklearn.metrics.roc_curve(y_test[:, i], y_score[:, i])
roc_auc[i] = sklearn.metrics.auc(fpr[i], tpr[i])
#make order df. (note that roc_auc is a dictionary with ints as keys
#and AUCs as values.
order = pd.DataFrame(np.zeros((n_classes,1)), index = [x for x in range(n_classes)],
columns = ['roc_auc'])
for i in range(n_classes):
order.at[i,'roc_auc'] = roc_auc[i]
order = order.sort_values(by='roc_auc',ascending=False)
#Plot all ROC curves
#Plot in order of the rainbow colors, from highest AUC to lowest AUC
plt.figure()
colors_list = ['palevioletred','darkorange','yellowgreen','olive','deepskyblue','royalblue','navy']
curves_plotted = 0
for i in order.index.values.tolist()[0:10]: #only plot the top ten so the plot is readable
color_idx = curves_plotted%len(colors_list) #cycle through the colors list in order of colors
color = colors_list[color_idx]
plt.plot(fpr[i], tpr[i], color=color, lw=lw,
label='{:5s} (area {:0.2f})'.format(label_meanings[i], roc_auc[i]))
curves_plotted+=1
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(setname.lower().capitalize()+' ROC Epoch '+str(epoch))
plt.legend(loc="lower right",prop={'size':6})
outfilepath = os.path.join(outdir,setname+'_ROC_ep'+str(epoch)+'.pdf')
plt.savefig(outfilepath)
plt.close()
def plot_pr_curve_multi_class(label_meanings, y_test, y_score,
outdir, setname, epoch):
"""<label_meanings>: list of strings, one for each label
<y_test>: matrix of ground truth
<y_score>: matrix of predicted probabilities
<outdir>: directory to save output file
<setname>: string e.g. 'train' 'valid' or 'test'
<epoch>: int for epoch"""
#Modified from https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
#https://stackoverflow.com/questions/29656550/how-to-plot-pr-curve-over-10-folds-of-cross-validation-in-scikit-learn
n_classes = len(label_meanings)
lw = 2
#make order df.
order = pd.DataFrame(np.zeros((n_classes,1)), index = [x for x in range(n_classes)],
columns = ['prc'])
for i in range(n_classes):
order.at[i,'prc'] = sklearn.metrics.average_precision_score(y_test[:,i], y_score[:,i])
order = order.sort_values(by='prc',ascending=False)
#Plot
plt.figure()
colors_list = ['palevioletred','darkorange','yellowgreen','olive','deepskyblue','royalblue','navy']
curves_plotted = 0
for i in order.index.values.tolist()[0:10]: #only plot the top ten so the plot is readable
color_idx = curves_plotted%len(colors_list) #cycle through the colors list in order of colors
color = colors_list[color_idx]
average_precision = sklearn.metrics.average_precision_score(y_test[:,i], y_score[:,i])
precision, recall, _ = sklearn.metrics.precision_recall_curve(y_test[:,i], y_score[:,i])
plt.step(recall, precision, color=color, where='post',
label='{:5s} (area {:0.2f})'.format(label_meanings[i], average_precision))
curves_plotted+=1
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title(setname.lower().capitalize()+' PRC Epoch '+str(epoch))
plt.legend(loc="lower right",prop={'size':6})
outfilepath = os.path.join(outdir,setname+'_PR_ep'+str(epoch)+'.pdf')
plt.savefig(outfilepath)
plt.close()
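#Illustrative sketch (not part of the original module): saving multi-class ROC
#and PR curve plots for a random 3-label problem; the label names, output
#directory, set name, and epoch below are made up for illustration.
def _example_multi_class_curve_plots():
    rng = np.random.RandomState(0)
    label_meanings = ['atelectasis', 'nodule', 'mass']
    y_test = rng.randint(0, 2, size=(50, 3))
    y_score = rng.uniform(0, 1, size=(50, 3))
    #writes valid_ROC_ep0.pdf and valid_PR_ep0.pdf to the current directory
    plot_roc_curve_multi_class(label_meanings, y_test, y_score,
                               outdir='.', setname='valid', epoch=0)
    plot_pr_curve_multi_class(label_meanings, y_test, y_score,
                              outdir='.', setname='valid', epoch=0)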
def plot_pr_curve_single_class(true_labels, pred_probs, outfilepath):
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html
average_precision = sklearn.metrics.average_precision_score(true_labels, pred_probs)
precision, recall, _ = sklearn.metrics.precision_recall_curve(true_labels, pred_probs)
plt.step(recall, precision, color='b', alpha=0.2,
where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2,
color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
average_precision))
plt.savefig(outfilepath)
plt.close()
def plot_roc_curve_single_class(fpr, tpr, epoch, outfilepath):
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py
roc_auc = sklearn.metrics.auc(fpr, tpr)
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig(outfilepath)
plt.close()
def plot_learning_curves(train_loss, valid_loss, results_dir, descriptor):
"""Variables
<train_loss> and <valid_loss> are numpy arrays with one numerical entry
for each epoch quantifying the loss for that epoch."""
x = np.arange(0,len(train_loss))
plt.plot(x, train_loss, color='blue', lw=2, label='train')
plt.plot(x, valid_loss, color='green',lw = 2, label='valid')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend(loc='lower right')
plt.savefig(os.path.join(results_dir, descriptor+'_Learning_Curves.png'))
plt.close()
#save numpy arrays of the losses
np.save(os.path.join(results_dir,'train_loss.npy'),train_loss)
np.save(os.path.join(results_dir,'valid_loss.npy'),valid_loss)
def plot_heatmap(outprefix, numeric_array, center, xticklabels, yticklabels):
"""Save a heatmap based on numeric_array"""
seaborn.set(font_scale=0.6)
seaplt = (seaborn.heatmap(numeric_array,
center=center,
xticklabels=xticklabels,
yticklabels=yticklabels)).get_figure()
seaplt.savefig(outprefix+'.png')
seaplt.clf()
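#Illustrative sketch (not part of the original module): saving a heatmap of a
#small random matrix; the output prefix and tick labels are made up for
#illustration and produce example_heatmap.png in the current directory.
def _example_plot_heatmap():
    rng = np.random.RandomState(0)
    numeric_array = rng.uniform(-1, 1, size=(4, 3))
    plot_heatmap(outprefix='example_heatmap',
                 numeric_array=numeric_array,
                 center=0,
                 xticklabels=['a', 'b', 'c'],
                 yticklabels=['w', 'x', 'y', 'z'])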