""" Generic wrapper class meant to help search the space of parameters
for different types of models, save the results, and determine the
best parameter settings."""
import numpy as np
import pandas as pd
import os
import sys
import copy
import ast
from time import time
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import tensorflow as tf  # needed by convert_param_dict_for_use to restore optimizer classes
CODE_PATH = os.path.dirname(os.getcwd())
sys.path.append(CODE_PATH)
DEFAULT_MAIN_DIRECTORY = '/Your/path/here/'
DEFAULT_NUM_CROSS_FOLDS = 5
DEFAULT_CLEAN_FILE = 'all_modalities_present.csv'
DEFAULT_NOISY_FILE = 'extra_noisy_data.csv'
import data_funcs
import helper_funcs as helper
def reload_dependencies():
reload(data_funcs)
reload(helper)
class Wrapper:
"""This generic parent class defines functions common to any wrapper that must test
different hyperparameter settings for a model in order to find the best ones. It
can be inherited to build wrappers for many types of models.
Flexibly allows the child class to define the names and values of all of the
hyperparameters it needs to test in a dictionary.
"""
def __init__(self, filename, cont=False, classifier_name='MMAE',
num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, dropbox_path=DEFAULT_MAIN_DIRECTORY,
datasets_path='Data/', results_path=None, check_test=False,
normalize_and_fill=False, normalization='between_0_and_1',
optimize_for='val_score', min_or_max='max',
save_results_every_nth=1, cross_validation=True):
"""
Initializes the parent class.
Args:
filename: The name of a .csv file containing the data.
cont: A boolean. If true, will try to load a saved results .csv and continue
training on the next unfinished result.
classifier_name: String name of the classifier trained. Used to know where to save
results.
num_cross_folds: An integer number of folds to use in cross validation.
dropbox_path: The path to the main dropbox directory which contains the results and
data directories.
datasets_path: The path from the main dropbox to the datasets directory.
results_path: The path from the main dropbox to the directory where results should
be saved.
check_test: A boolean. If true, will evaluate final results on held-out test set
after running.
normalize_and_fill: If True, will ask DataLoader to normalize the data and fill
missing values.
normalization: How to normalize the input data. Can be 'z_score', 'between_0_and_1',
or None.
optimize_for: The name of the criteria the wrapper is trying to optimize.
min_or_max: A string that can be either 'min' if the wrapper is trying to minimize the
score on the validation data, or 'max' if it should be maximized.
save_results_every_nth: An integer representing the number of settings to test before
writing the results df to a csv file.
cross_validation: set to False to not use cross validation.
"""
# memorize arguments and construct paths
self.filename = filename
self.cont = cont
self.classifier_name = classifier_name
self.num_cross_folds = num_cross_folds
self.dropbox_path = dropbox_path
self.datasets_path = dropbox_path + datasets_path
if results_path is None:
self.results_path = dropbox_path + 'Results/' + self.classifier_name + '/'
else:
self.results_path = dropbox_path + results_path
self.check_test = check_test
self.save_results_every_nth = save_results_every_nth
self.optimize_for = optimize_for
self.normalize_and_fill = normalize_and_fill
self.normalization = normalization
self.min_or_max = min_or_max
self.cross_validation = cross_validation
self.save_prefix = self.get_save_prefix(filename, replace=cont)
self.params = {}
self.define_params()
self.load_data()
self.construct_list_of_params_to_test()
self.num_settings = len(self.list_of_param_settings)
        # Storing the results.
self.time_sum = 0
if cont:
self.val_results_df = pd.DataFrame.from_csv(self.results_path + self.save_prefix + '.csv')
print '\nPrevious validation results df loaded. It has', len(self.val_results_df), "rows"
self.started_from = len(self.val_results_df)
else:
self.val_results_df = pd.DataFrame()
self.started_from = 0
# These functions need to be overwritten by the child class
def define_params(self):
""" This function should set self.params to a dict where they keys represent names of parameters
to test (e.g. for SVM, 'C') as they should be saved to the val_results_df, and the values of
self.params should be a list of values for the parameter that need to be tested. An example
dict:
self.params['C'] = [1,10,100]
self.params['beta'] = [.001, .01, .1]
"""
print "Error! define_params should be overwritten in child class"
raise NotImplementedError
    def train_and_predict(self, param_dict):
        """Trains the model with the given hyperparameter settings and returns its
        predictions on the validation data. Must be overwritten in the child class."""
        print "Error! train_and_predict should be overwritten in child class"
        raise NotImplementedError
    def test_on_test(self, param_dict):
        """Trains the model with the given hyperparameter settings and evaluates it
        on the held-out test set. Must be overwritten in the child class."""
        print "Error! test_on_test should be overwritten in child class"
        raise NotImplementedError
# The following functions do not need to be overwritten in the child class.
def load_data(self):
"""Initialize's the classes data_loader object, which takes care of loading
data from a file."""
self.data_loader = data_funcs.DataLoader(self.datasets_path + self.filename,
normalize_and_fill=self.normalize_and_fill,
cross_validation=self.cross_validation,
normalization=self.normalization)
def construct_list_of_params_to_test(self):
"""Will make a class level variable that is a list of parameter dicts.
Each entry in the list is a dict of parameter settings,
eg. {'C'=1.0, 'beta'=.01, ...}. This list represents all of the combinations
of hyperparameter settings that need to be tested.
"""
self.list_of_param_settings = []
self.recurse_and_append_params(copy.deepcopy(self.params), {})
def recurse_and_append_params(self, param_settings_left, this_param_dict, debug=False):
"""Given a dictionary listing all the settings needed for each parameter (key) in the
dict, recursively performs a breadth-first-search over all of the possible combinations
of hyperparameter settings.
For each setting still left in the dict, creates a node of the breadth-first search tree
where that hyperparameter is set to that specific setting.
Saves all of the combinations into the class's list_of_param_settings field.
Args:
param_settings_left: A dictionary of lists. The keys are parameters
(like 'C'), the values are the list of settings for those parameters that
need to be tested (like [1.0, 10.0, 100.0]).
this_param_dict: A dictionary containing a single setting for each parameter. If
a parameter is not in this_param_dict's keys, a setting for it has not been chosen yet.
debug: A Boolean. If True, will print debugging statements.
"""
if debug: print "Working on a parameter dict containing", this_param_dict
for key in self.params.keys():
if key in this_param_dict:
continue
else:
                try:
                    this_setting = param_settings_left[key].pop()
                except (KeyError, IndexError):
                    print "ERROR! could not pop a setting for param", key, "from", param_settings_left.get(key)
                    raise
if debug: print "Popped", key, "=", this_setting, "off the params left"
if len(param_settings_left[key]) > 0:
if debug: print "Recursing on remaining parameters", param_settings_left
self.recurse_and_append_params(copy.deepcopy(param_settings_left),
copy.deepcopy(this_param_dict))
if debug: print "Placing the popped setting", key, "=", this_setting, "into the parameter dict"
this_param_dict[key] = this_setting
self.list_of_param_settings.append(this_param_dict)
if debug: print "Appending parameter dict to list:", this_param_dict, "\n"
def get_save_prefix(self, filename, replace=False):
"""Computes a prefix to use when saving results files based on the classifier
name and the data file name.
Args:
filename: String name of data file.
replace: A Boolean. If True, and the code detects an existing results file
with the same name, it will replace it.
Returns: The string save prefix.
"""
end_loc = filename.find('.')
prefix = self.classifier_name + '-' + filename[0:end_loc]
if not replace:
while os.path.exists(self.results_path + prefix + '.csv'):
prefix = prefix + '2'
return prefix
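    # Example: with classifier_name='SVM' and a hypothetical filename
    # 'mood_data.csv', the prefix is 'SVM-mood_data'; if 'SVM-mood_data.csv'
    # already exists in the results directory and replace is False, the prefix
    # becomes 'SVM-mood_data2'.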
def setting_already_done(self, param_dict):
"""Returns True if a particular setting of the hyperparameters has already been tested.
Args:
param_dict: A dictionary representing a setting for all of the hyperparameters.
Returns: Boolean.
"""
mini_df = self.val_results_df
for key in param_dict.keys():
setting = param_dict[key]
if type(setting) == list:
setting = str(setting)
mini_df = mini_df[mini_df[key] == setting]
if len(mini_df) == 0:
return False
print "Setting already tested"
return True
def convert_param_dict_for_use(self, setting_dict):
"""When loading rows from a saved results df in csv format, some
of the settings may end up being converted to a string representation
and need to be converted back to actual numbers and objects.
May need to be overwritten in child class."""
if 'architecture' in setting_dict.keys():
if type(setting_dict['architecture']) == str:
setting_dict['architecture'] = ast.literal_eval(setting_dict['architecture'])
if 'optimizer' in setting_dict.keys():
if 'GradientDescent' in setting_dict['optimizer']:
setting_dict['optimizer'] = tf.train.GradientDescentOptimizer
elif 'Adagrad' in setting_dict['optimizer']:
setting_dict['optimizer'] = tf.train.AdagradOptimizer
else:
setting_dict['optimizer'] = tf.train.AdamOptimizer
if 'batch_size' in setting_dict.keys():
setting_dict['batch_size'] = int(setting_dict['batch_size'])
print "batch size just got changed in convert_param_dict. It's an", type(setting_dict['batch_size'])
return setting_dict
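    # Example: a row reloaded from csv may hold architecture as the string
    # "[128, 64]"; convert_param_dict_for_use turns it back into the list
    # [128, 64], maps an optimizer string like 'Adagrad' back to
    # tf.train.AdagradOptimizer, and casts batch_size back to an int.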
def sweep_all_parameters(self):
"""Runs through all of the computed combinations of hyperparameter settings,
storing the results of testing with each."""
print "\nYou have chosen to test a total of", self.num_settings, "settings"
sys.stdout.flush()
        # Sweep all possible combinations of parameters.
for param_dict in self.list_of_param_settings:
self.test_one_setting(param_dict)
self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv')
print "\n--------------PARAMETER SWEEP IS COMPLETE--------------"
def test_one_setting(self, param_dict):
"""Tests a single setting of the hyperparameters.
Args:
param_dict: A dictionary with the hyperparameter names as keys and the values
they should be set to as values.
"""
if self.cont and self.setting_already_done(param_dict):
return
# Times the computation for each setting
t0 = time()
results_dict = self.get_cross_validation_results(param_dict)
        self.val_results_df = self.val_results_df.append(results_dict, ignore_index=True)
t1 = time()
this_time = t1 - t0
self.time_sum = self.time_sum + this_time
print "\n", self.val_results_df.tail(n=1)
print "It took", this_time, "seconds to obtain this result"
self.print_time_estimate()
sys.stdout.flush()
# Output the results file every few iterations for safekeeping
if len(self.val_results_df) % self.save_results_every_nth == 0:
self.val_results_df.to_csv(self.results_path + self.save_prefix + '.csv')
def get_cross_validation_results(self, param_dict):
"""Gets the score from testing on each cross validation fold and saves the average.
Args:
param_dict: A dictionary with the hyperparameter names as keys and the values
they should be set to as values.
"""
scores = []
for f in range(self.num_cross_folds):
self.data_loader.set_to_cross_validation_fold(f)
scores.append(self.train_and_predict(param_dict))
print "Scores for each fold:", scores
param_dict[self.optimize_for] = np.mean(scores)
return param_dict
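    # Example (values hypothetical): a param_dict of {'C': 1.0, 'beta': .01}
    # leaves this function as {'C': 1.0, 'beta': .01, 'val_score': 0.82} when
    # optimize_for is 'val_score', and is later appended as a row of val_results_df.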
def print_time_estimate(self):
"""Prints an estimate of the total time remaining to finish testing all of the
hyperparameter settings."""
num_done = len(self.val_results_df)-self.started_from
num_remaining = self.num_settings - num_done - self.started_from
avg_time = self.time_sum / num_done
total_secs_remaining = int(avg_time * num_remaining)
hours, mins, secs = helper.get_secs_mins_hours_from_secs(total_secs_remaining)
print "\n", num_done, "settings processed so far,", num_remaining, "left to go"
print "Estimated time remaining:", hours, "hours", mins, "mins", secs, "secs"
def find_best_setting(self, optimize_for=None, min_or_max=None):
"""After all testing is finished, locates the row in the results file that
contains the best possible hyperparameter settings.
Args:
optimize_for: String name of the result column that the wrapper should
optimize. Defaults to class value.
min_or_max: Whether the value of optimize_for should be minimized or
maximized. Defaults to class value.
Returns:
            A pandas Series (one row of val_results_df) containing the best setting.
"""
if optimize_for is None:
optimize_for = self.optimize_for
if min_or_max is None:
min_or_max = self.min_or_max
scores = self.val_results_df[optimize_for].tolist()
if min_or_max == 'min':
best_score = min(scores)
else:
best_score = max(scores)
best_idx = scores.index(best_score)
best_setting = self.val_results_df.iloc[best_idx]
print "\nThe best", optimize_for, "was", best_setting[optimize_for]
print "It was found with the following settings:"
print best_setting
print "\n"
return best_setting
def get_final_results(self):
"""Finds the best setting of the hyperparameters, then gets the final results on
the test set if the field check_test is set to True."""
best_setting = self.find_best_setting()
if not self.check_test:
print "check_test is set to false, Will not evaluate performance on held-out test set."
return
print "\nAbout to evaluate results on held-out test set!!"
print "Will use the settings that produced the best", optimize_for
best_setting = self.convert_param_dict_for_use(best_setting)
test_score = self.test_on_test(best_setting)
print "\nFINAL TEST RESULTS:", test_score
def run(self):
"""Tests all of the settings, then finds the best one and possibly tests on the
test set."""
self.sweep_all_parameters()
self.get_final_results()
class ClassificationWrapper(Wrapper):
"""A class that inherits from the generic wrapper, and provides another abstract
parent class that can be used to easily build wrappers that test classification
models.
"""
def __init__(self, filename, wanted_label=None, cont=False, classifier_name='SVM',
num_cross_folds=DEFAULT_NUM_CROSS_FOLDS, dropbox_path=DEFAULT_MAIN_DIRECTORY,
datasets_path='Data/', results_path=None, check_test=False,
normalize_and_fill=False, normalization='z_score', optimize_for='val_acc',
min_or_max='max', save_results_every_nth=1, check_noisy_data=False,
cross_validation=True):
"""Initializes both the parent ClassificationWrapper and its parent Wrapper.
Args: almost entirely the same as the parent class, except:
wanted_label: The name of the column containing the labels that we are trying
to classify.
check_noisy_data: If True, will tell the data loader to separate out the noisy
data in the data file, and will compute results on this data separately.
"""
self.wanted_label = wanted_label
self.check_noisy_data = check_noisy_data
Wrapper.__init__(self, filename=filename, cont=cont, classifier_name=classifier_name,
num_cross_folds=num_cross_folds, dropbox_path=dropbox_path,
datasets_path=datasets_path, results_path=results_path,
check_test=check_test, normalize_and_fill=normalize_and_fill,
normalization=normalization, optimize_for=optimize_for,
min_or_max=min_or_max, save_results_every_nth=save_results_every_nth,
cross_validation=cross_validation)
    def predict_on_data(self, X):
        """Uses the trained model to predict labels for the data matrix X. Must be
        overwritten in the child class."""
        print "Error! predict_on_data should be overwritten in child class"
        raise NotImplementedError
def load_data(self):
"""Initializes the data loader object of the class. Specific to classification
because the data loader must load supervised data, based on the wanted class label,
and possibly separate noisy data."""
self.data_loader = data_funcs.DataLoader(self.datasets_path + self.filename,
normalize_and_fill=self.normalize_and_fill,
cross_validation=self.cross_validation,
supervised=True,
wanted_label=self.wanted_label,
normalization=self.normalization,
separate_noisy_data=self.check_noisy_data)
def get_save_prefix(self, filename, replace=False):
"""Overloads the parent function for computing a save prefix by including the class label
in the name.
Args: same as parent class"""
end_loc = filename.find('.')
prefix = self.classifier_name + '-' + filename[0:end_loc]
if self.wanted_label is not None:
prefix += '-' + helper.get_friendly_label_name(self.wanted_label)
if not replace:
while os.path.exists(self.results_path + prefix + '.csv'):
prefix = prefix + '2'
return prefix
def get_cross_validation_results(self, param_dict):
"""Gets cross validation results specific to classification, by computing a number
of classification metrics including accuracy, AUC, F1, precision, and recall, and
computing scores on both noisy and clean data.
Args:
param_dict: A dictionary with the hyperparameter names as keys and the values
they should be set to as values.
Returns: The same param_dict now containing the numerical results for each metric.
"""
all_acc = []
all_auc = []
all_f1 = []
all_precision = []
all_recall = []
if self.check_noisy_data:
noisy_acc = []
noisy_auc = []
clean_acc = []
clean_auc = []
for f in range(self.num_cross_folds):
self.data_loader.set_to_cross_validation_fold(f)
preds = self.train_and_predict(param_dict)
true_y = self.data_loader.val_Y
if preds is None or true_y is None:
continue
acc, auc, f1, precision, recall = compute_all_classification_metrics(preds, true_y)
all_acc.append(acc)
all_auc.append(auc)
all_f1.append(f1)
all_precision.append(precision)
all_recall.append(recall)
if self.check_noisy_data:
noisy_preds = self.predict_on_data(self.data_loader.noisy_val_X)
acc, auc, f1, precision, recall = compute_all_classification_metrics(noisy_preds, self.data_loader.noisy_val_Y)
noisy_acc.append(acc)
noisy_auc.append(auc)
clean_preds = self.predict_on_data(self.data_loader.clean_val_X)
acc, auc, f1, precision, recall = compute_all_classification_metrics(clean_preds, self.data_loader.clean_val_Y)
clean_acc.append(acc)
clean_auc.append(auc)
print "Accuracy for each fold:", all_auc
param_dict['val_acc'] = np.nanmean(all_acc)
param_dict['val_auc'] = np.nanmean(all_auc)
param_dict['val_f1'] = np.nanmean(all_f1)
param_dict['val_precision'] = np.nanmean(all_precision)
param_dict['val_recall'] = np.nanmean(all_recall)
if self.check_noisy_data:
param_dict['noisy_val_acc'] = np.nanmean(noisy_acc)
param_dict['noisy_val_auc'] = np.nanmean(noisy_auc)
print "Perf on noisy data:", np.nanmean(noisy_acc), "acc", np.nanmean(noisy_auc), "auc"
param_dict['clean_val_acc'] = np.nanmean(clean_acc)
param_dict['clean_val_auc'] = np.nanmean(clean_auc)
print "Perf on clean data:", np.nanmean(clean_acc), "acc", np.nanmean(clean_auc), "auc"
return param_dict
def get_classification_predictions_from_df(self):
"""Will predict the class labels for the data contained in the wrapper's data loader
object.
Returns: A pandas dataframe containing the data loader's data with class label
predictions added as an extra column.
"""
df = copy.deepcopy(self.data_loader.df)
X = df[self.data_loader.wanted_feats].as_matrix()
preds = self.predict_on_data(X)
assert(len(X) == len(preds))
for i,label in enumerate(self.data_loader.wanted_labels):
df['predictions_'+label] = preds[:,i]
return df
def get_final_results(self):
"""Finds the best setting for a number of different metrics, may test on the held-out
test set if the check_test field is True."""
for metric in ['val_acc', 'noisy_val_acc', 'clean_val_acc']:
if metric in self.val_results_df.columns.values:
best_setting = self.find_best_setting(optimize_for=metric, min_or_max='max')
print "\nThe best", metric, "was", best_setting[metric]
print "It was found with the following settings:"
print best_setting
if not self.check_test:
print "check_test is set to false, Will not evaluate performance on held-out test set."
return
print "\nAbout to evaluate results on held-out test set!!"
print "Will use the settings that produced the best", self.optimize_for
best_setting = self.convert_param_dict_for_use(best_setting)
preds = self.test_on_test(best_setting)
        true_y = self.data_loader.test_Y
acc, auc, f1, precision, recall = compute_all_classification_metrics(preds, true_y)
print "\nFINAL TEST RESULTS ON ALL DATA:"
print 'Acc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
if self.check_noisy_data:
noisy_preds = self.predict_on_data(self.data_loader.noisy_test_X)
acc, auc, f1, precision, recall = compute_all_classification_metrics(noisy_preds, self.data_loader.noisy_test_Y)
print "\nFINAL TEST RESULTS ON NOISY DATA:"
print 'Acc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
clean_preds = self.predict_on_data(self.data_loader.clean_test_X)
acc, auc, f1, precision, recall = compute_all_classification_metrics(clean_preds, self.data_loader.clean_test_Y)
print "\nFINAL TEST RESULTS ON CLEAN DATA:"
print 'Acc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
def get_baseline(Y):
"""Gets the proportion of the class label that is most frequent in the data.
Args:
Y: an array of class labels
    Returns: The percentage of labels that belong to the most frequent class.
"""
Y = Y.tolist()
percent_true = float(Y.count(1.0)) / float(len(Y))
if percent_true < 0.5:
return 1.0 - percent_true
else:
return percent_true
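# Example: for Y = [1.0, 1.0, 1.0, 0.0], percent_true is 0.75, so get_baseline
# returns 0.75 -- the accuracy of always predicting the majority class.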
def compute_classification_metric(metric, true_y, preds):
"""Computes a classification metric such as F1 score given the true and predicted labels.
Args:
        metric: A function that computes a classification metric.
true_y: The ground truth labels.
preds: The model's predicted labels.
Returns: The metric score.
"""
try:
result = metric(true_y, preds)
except Exception, e:
print "Error in computing metric:", e
return np.nan
return result
def binary_accuracy(true_y, preds):
"""Computes the percentage of labels that were correctly predicted by the model.
Args:
true_y: The ground truth labels.
preds: The model's predicted labels.
Returns: Float accuracy.
"""
assert len(preds)==len(true_y)
correct_labels = [1 for i in range(len(preds)) if preds[i]==true_y[i]]
return len(correct_labels)/float(len(preds))
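# Example: binary_accuracy(true_y=[1, 0, 1], preds=[1, 1, 1]) returns 2/3, since
# two of the three predictions match the ground truth.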
def compute_all_classification_metrics(preds, true_y):
"""Computes the accuracy, AUC, F1, precision, and recall for the model's predictions.
Args:
true_y: The ground truth labels.
preds: The model's predicted labels.
Returns: float accuracy, AUC, F1, precision, and recall
"""
acc = compute_classification_metric(binary_accuracy, true_y, preds)
auc = compute_classification_metric(roc_auc_score, true_y, preds)
f1 = compute_classification_metric(f1_score, true_y, preds)
precision = compute_classification_metric(precision_score, true_y, preds)
recall = compute_classification_metric(recall_score, true_y, preds)
return acc, auc, f1, precision, recall
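

# --------------------------------------------------------------------------
# Minimal usage sketch (hypothetical; not part of the original module). It
# illustrates the methods a child class must override. It assumes the
# DataLoader exposes train_X/train_Y/val_X/test_X fields -- only val_Y and
# test_Y appear in this file, so adjust these names to the real
# data_funcs.DataLoader interface if they differ.
from sklearn.svm import SVC

class ExampleSVMWrapper(ClassificationWrapper):
    """A toy SVM wrapper showing the overrides required by the parent class."""
    def define_params(self):
        # The grid of hyperparameter settings to sweep.
        self.params['C'] = [1.0, 10.0, 100.0]
        self.params['kernel'] = ['linear', 'rbf']

    def train_and_predict(self, param_dict):
        # Fit on the training fold, return predictions on the validation fold.
        self.model = SVC(C=param_dict['C'], kernel=param_dict['kernel'])
        self.model.fit(self.data_loader.train_X, self.data_loader.train_Y)
        return self.model.predict(self.data_loader.val_X)

    def predict_on_data(self, X):
        return self.model.predict(X)

    def test_on_test(self, param_dict):
        # Retrain with the best settings, then predict on the held-out test set.
        self.train_and_predict(param_dict)
        return self.model.predict(self.data_loader.test_X)

# Typical usage (assumes a hypothetical 'my_data.csv' in the datasets directory):
#   wrapper = ExampleSVMWrapper('my_data.csv', wanted_label='label',
#                               classifier_name='ExampleSVM')
#   wrapper.run()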