-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathautoencoder_classification_wrapper.py
408 lines (348 loc) · 20.4 KB
/
autoencoder_classification_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
import numpy as np
import pandas as pd
import os
import sys
CODE_PATH = os.path.dirname(os.getcwd())
sys.path.append(CODE_PATH)
DEFAULT_MAIN_DIRECTORY = '/Your/path/here/'
DEFAULT_NUM_CROSS_FOLDS = 5
LABELS_TO_PREDICT = ['happiness', 'health', 'calmness']
import data_funcs
import helper_funcs as helper
from generic_wrapper import ClassificationWrapper
import generic_wrapper as gen_wrap
import multimodal_autoencoder as mmae
def reload_dependencies():
reload(data_funcs)
reload(helper)
reload(mmae)
class MMAEClassificationWrapper(ClassificationWrapper):
"""A class that inherits from the generic wrapper, enabling the testing and evaluation
of different hyperparameters settings for use with a Multimodal Autoencoder (MMAE) -
with additional classification layers. Performs a grid search over every combination of
settings.
Note: this class is different from the MMAEWrapper because it trains in two steps. First,
it trains the MMAE to reconstruct data with missing modalities. Then, in a second step,
it trains the MMAE to classify supervised labels using a different part of the network
connected to the encoding layers.
"""
def __init__(self, mmae_filename, classification_filename,
mmae_layer_sizes=[[1000,100],[200,100],[500,100]],
classification_layer_sizes=[[50,20],[25,10],[100,50],[100]],
tie_weights=[True,False], mmae_dropout_probs=[1.0,0.5], mmae_weight_penalties=[.01,.001],
weight_initializers=['normal'], mmae_activation_funcs=['relu'],
mmae_test_variational=[True,False],
weight_penalties=[0.0,.001], dropout_probs=[0.5,1.0], activation_funcs=['relu'],
classification_learning_rate=.0001, classification_batch_size=100, classification_num_steps=15000,
cont=False, classifier_name='MMAE_NN_classifier', num_cross_folds=DEFAULT_NUM_CROSS_FOLDS,
dropbox_path=DEFAULT_MAIN_DIRECTORY, datasets_path='Data/Cleaned/', results_path=None,
check_test=False, normalization='between_0_and_1', optimize_for='val_acc', min_or_max='max',
save_results_every_nth=1, check_noisy_data=True, wanted_label=None):
"""Initializes both the MMAEClassificationWrapper and its parent Wrapper with some
settings. Other class settings like the loss function used to train the models are built
into the class and have to be changed elsewhere.
Args: almost entirely the same as the parent class and the MMAEWrapper (where the
mmae prefix indicates its an MMAE hyperparameter), except:
classification_layer_sizes: A list of sizes of the classification layers
connected to the encoder to test.
weight_penalities: A list of L2 weight regularization penalties to test in the
classification layers.
dropout_probs: A list of dropout keep probabilities to test in the classification
layers.
activation_funcs: A list of strings describing different activation functions
to test within the classification layers. Can contain 'softsign', 'relu',
'tanh', 'softplus', or 'linear'.
classification_learning_rate: The initial learning rate to use in the second
step of training the classification portion of the network.
classification_batch_size: The batch size used in the second classification
training step.
classficication_num_steps: The number of training steps to use when training
the classification portion of the network.
wanted_label: The name of the column containing the labels that we are trying
to classify.
"""
# Memorize arguments
self.classification_filename = classification_filename
self.mmae_filename = mmae_filename
# Hyperparameters to test
self.mmae_layer_sizes = mmae_layer_sizes
self.classification_layer_sizes = classification_layer_sizes
self.tie_weights = tie_weights
self.mmae_dropout_probs = mmae_dropout_probs
self.mmae_weight_penalties = mmae_weight_penalties
self.weight_initializers = weight_initializers
self.mmae_activation_funcs = mmae_activation_funcs
self.mmae_test_variational = mmae_test_variational
self.weight_penalties = weight_penalties
self.dropout_probs = dropout_probs
self.activation_funcs = activation_funcs
# fixed hyperparameters
self.classification_learning_rate = classification_learning_rate
self.classification_num_steps = classification_num_steps
self.classification_batch_size = classification_batch_size
self.mmae_loss_func = 'sigmoid_cross_entropy'
self.mmae_learning_rate = .001
self.mmae_num_steps = 15000
self.mmae_batch_size = 20
ClassificationWrapper.__init__(self, filename=classification_filename, wanted_label=wanted_label,
cont=cont, classifier_name=classifier_name, num_cross_folds=num_cross_folds,
dropbox_path=dropbox_path, datasets_path=datasets_path, results_path=results_path,
check_test=check_test, normalization=normalization, optimize_for=optimize_for,
min_or_max=min_or_max, save_results_every_nth=save_results_every_nth,
check_noisy_data=check_noisy_data)
self.trim_extra_vae_params()
self.model = None
def load_data(self):
"""Use the DataLoader class to load unsupervised and supervised data from
files for the MMAE and classification portions of the network, respectively.
"""
self.data_loader = data_funcs.DataLoader(
self.datasets_path + self.mmae_filename,
normalize_and_fill=False,
supervised=False,
cross_validation=True,
separate_noisy_data=self.check_noisy_data)
self.classification_data_loader = data_funcs.DataLoader(
self.datasets_path + self.classification_filename,
normalize_and_fill=False,
cross_validation=True,
supervised=True,
separate_noisy_data=self.check_noisy_data,
wanted_label=self.wanted_label)
def define_params(self):
"""Defines the list of hyperparameters that will be tested."""
self.params = {}
self.params['mmae_architecture'] = self.mmae_layer_sizes
self.params['classification_layers'] = self.classification_layer_sizes
self.params['tie_weights'] = self.tie_weights
self.params['mmae_dropout_prob'] = self.mmae_dropout_probs
self.params['mmae_weight_penalty'] = self.mmae_weight_penalties
self.params['weight_initialization'] = self.weight_initializers
self.params['mmae_activation_function'] = self.mmae_activation_funcs
self.params['variational'] = self.mmae_test_variational
self.params['weight_penalty'] = self.weight_penalties
self.params['dropout_prob'] = self.dropout_probs
self.params['activation_func'] = self.activation_funcs
def initialize_model(self, param_dict):
"""Initializes an internal instance of an MMAE with the hyperparameter
settings in param_dict. Also sets the classification parameters as a
second step.
Args:
param_dict: A dictionary with keys representing parameter names and
values representing settings for those parameters.
"""
self.model = mmae.MultimodalAutoencoder(
# constant factors that don't change
batch_size=self.mmae_batch_size, learning_rate=self.mmae_learning_rate,
model_name=self.classifier_name, verbose=False, loss_func=self.mmae_loss_func,
checkpoint_dir=self.dropbox_path + 'temp_saved_models/',
# factors that change with param dict
layer_sizes=param_dict['mmae_architecture'],
classification_layer_sizes=param_dict['classification_layers'],
variational=param_dict['variational'], tie_weights=param_dict['tie_weights'],
dropout_prob=param_dict['mmae_dropout_prob'],
weight_penalty=param_dict['mmae_weight_penalty'],
activation_func=param_dict['mmae_activation_function'],
weight_initialization=param_dict['weight_initialization'],
# feed in the data
data_loader=self.data_loader, classification_data_loader=self.classification_data_loader)
if self.wanted_label is not None:
classification_loss = 'cross_entropy'
else:
classification_loss = 'sigmoid_cross_entropy'
self.model.set_classification_params(weight_penalty=param_dict['weight_penalty'],
learning_rate=self.classification_learning_rate,
dropout_prob=param_dict['dropout_prob'],
activation_func=param_dict['activation_func'],
batch_size=self.classification_batch_size,
loss_func=classification_loss,
suppress_warning=True)
def trim_extra_vae_params(self):
"""Using a variational loss is already such a strong regularizer, that its
inadvisable to use it while also tying the weights of the encoder and decoder.
Therefore, removes these settings from the settings the wrapper will test.
"""
i = 0
while i < len(self.list_of_param_settings):
setting = self.list_of_param_settings[i]
if setting['variational'] == True and setting['tie_weights'] == True:
self.list_of_param_settings.remove(setting)
continue
i += 1
self.num_settings = len(self.list_of_param_settings)
def train_and_predict(self, param_dict, predict_on='Val'):
"""Initializes an MMAE according to the desired parameter settings, trains
it using reconstruction loss as a first step, then trains it to perform
classification as a second step.
Args:
param_dict: A dictionary with keys representing parameter names and
values representing settings for those parameters.
predict_on: The dataset used for evaluating the model. Can set to
'Test' to get final results.
Returns: the reconstruction loss obtained, and the classification
predictions.
"""
if predict_on == 'Test':
unsup_predict_X = self.data_loader.test_X
sup_predict_X = self.classification_data_loader.test_X
else:
unsup_predict_X = self.data_loader.val_X
sup_predict_X = self.classification_data_loader.val_X
self.initialize_model(param_dict)
self.model.train(self.mmae_num_steps, record_every_nth=self.mmae_num_steps/10,
save_every_nth=self.mmae_num_steps*2)
loss = self.model.get_performance_on_data_with_noise(unsup_predict_X)
print "\tFinished training MMAE, loss was", loss
self.model.train_classification(num_steps=self.classification_num_steps,
record_every_nth=self.classification_num_steps/10,
save_every_nth=self.classification_num_steps*2)
preds = self.predict_on_data(sup_predict_X)
return loss, preds
def predict_on_data(self, X):
"""Gets the classification predictions on some data X.
Args:
X: a matrix of data
Returns: the predicted Y labels.
"""
return self.model.get_classification_predictions(X)
def get_cross_validation_results(self, param_dict):
"""Goes through every cross-validation fold in the class's DataLoader,
assesses all necessary metrics for each fold, and saves them into the
param_dict.
Args:
param_dict: A dictionary with keys representing parameter names and
values representing settings for those parameters.
Returns: The param_dict augmented with keys for the names of metrics and
values representing the score on those metrics.
"""
num_labels = len(self.classification_data_loader.wanted_labels)
all_acc = np.empty((self.num_cross_folds, num_labels))
all_auc = np.empty((self.num_cross_folds, num_labels))
all_f1 = np.empty((self.num_cross_folds, num_labels))
all_precision = np.empty((self.num_cross_folds, num_labels))
all_recall = np.empty((self.num_cross_folds, num_labels))
all_loss = [np.nan] * self.num_cross_folds
if self.check_noisy_data:
noisy_acc = np.empty((self.num_cross_folds, num_labels))
noisy_auc = np.empty((self.num_cross_folds, num_labels))
clean_acc = np.empty((self.num_cross_folds, num_labels))
clean_auc = np.empty((self.num_cross_folds, num_labels))
for f in range(self.num_cross_folds):
self.data_loader.set_to_cross_validation_fold(f)
self.classification_data_loader.set_to_cross_validation_fold(f)
all_loss[f], preds = self.train_and_predict(param_dict)
true_y = self.classification_data_loader.val_Y
if self.wanted_label is None:
for l in range(num_labels):
(all_acc[f,l], all_auc[f,l], all_f1[f,l],
all_precision[f,l], all_recall[f,l]) = gen_wrap.compute_all_classification_metrics(preds[:,l], true_y[:,l])
print "\tFinished training classifier, average acc was", np.mean(all_auc[f,:])
else:
(all_acc[f], all_auc[f], all_f1[f],
all_precision[f], all_recall[f]) = gen_wrap.compute_all_classification_metrics(preds, true_y)
print "\tFinished training classifier, acc was", all_acc[f]
if self.check_noisy_data:
noisy_preds = self.predict_on_data(self.classification_data_loader.noisy_val_X)
clean_preds = self.predict_on_data(self.classification_data_loader.clean_val_X)
if self.wanted_label is None:
for l in range(num_labels):
noisy_acc[f,l], noisy_auc[f,l], _, _, _ = gen_wrap.compute_all_classification_metrics(
noisy_preds[:,l], self.classification_data_loader.noisy_val_Y[:,l])
clean_acc[f,l], clean_auc[f,l], _, _, _ = gen_wrap.compute_all_classification_metrics(
clean_preds[:,l], self.classification_data_loader.clean_val_Y[:,l])
else:
noisy_acc[f], noisy_auc[f], _, _, _ = gen_wrap.compute_all_classification_metrics(
noisy_preds, self.classification_data_loader.noisy_val_Y)
clean_acc[f], clean_auc[f], _, _, _ = gen_wrap.compute_all_classification_metrics(
clean_preds, self.classification_data_loader.clean_val_Y)
param_dict['val_loss'] = np.nanmean(all_loss)
param_dict['val_acc'] = np.nanmean(all_acc)
param_dict['val_auc'] = np.nanmean(all_auc)
param_dict['val_f1'] = np.nanmean(all_f1)
param_dict['val_precision'] = np.nanmean(all_precision)
param_dict['val_recall'] = np.nanmean(all_recall)
print "Finished training all folds, average acc was", np.nanmean(all_acc)
if self.wanted_label is None:
for i,label in enumerate(LABELS_TO_PREDICT):
param_dict['val_acc_'+label] = np.nanmean(all_acc[:,i])
param_dict['val_auc_'+label] = np.nanmean(all_auc[:,i])
print "Average accuracy for label", label, "=", np.nanmean(all_acc[:,i])
if self.check_noisy_data:
param_dict['noisy_val_acc'] = np.nanmean(noisy_acc)
param_dict['noisy_val_auc'] = np.nanmean(noisy_auc)
print "Perf on noisy data:", np.nanmean(noisy_acc), "acc", np.nanmean(noisy_auc), "auc"
param_dict['clean_val_acc'] = np.nanmean(clean_acc)
param_dict['clean_val_auc'] = np.nanmean(clean_auc)
print "Perf on clean data:", np.nanmean(clean_acc), "acc", np.nanmean(clean_auc), "auc"
if self.wanted_label is None:
for i, label in enumerate(LABELS_TO_PREDICT):
param_dict['svm_noisy_val_acc_'+label] = np.nanmean(noisy_acc[:,i])
param_dict['svm_noisy_val_auc_'+label] = np.nanmean(noisy_auc[:,i])
param_dict['svm_clean_val_acc_'+label] = np.nanmean(clean_acc[:,i])
param_dict['svm_clean_val_auc_'+label] = np.nanmean(clean_auc[:,i])
return param_dict
def get_final_results(self):
"""Find the best setting and use it to test on the test data and
print the results."""
best_setting = self.find_best_setting()
print "\nThe best", self.optimize_for, "was", best_setting[self.optimize_for]
print "It was found with the following settings:"
print best_setting
if not self.check_test:
print "check_test is set to false, Will not evaluate performance on held-out test set."
return
print "\nAbout to evaluate results on held-out test set!!"
print "Will use the settings that produced the best", optimize_for
best_setting = self.convert_param_dict_for_use(best_setting)
print "\nFINAL TEST RESULTS:"
loss, preds = self.test_on_test(best_setting)
true_y = self.classification_data_loader.test_Y
accs = []
aucs = []
for i,label in enumerate(LABELS_TO_PREDICT):
acc, auc, f1, precision, recall = gen_wrap.compute_all_classification_metrics(preds[:,i], true_y[:,i])
print label, 'Acc:', acc, 'AUC:', auc, 'F1:', f1, 'Precision:', precision, 'Recall:', recall
accs.append(acc)
aucs.append(auc)
print "Overall:", 'Acc:', np.mean(accs), 'AUC:', np.mean(aucs)
def test_on_test(self, param_dict):
"""Trains the model and tests it on the tests data.
Args:
param_dict: A dictionary with keys representing parameter names and
values representing settings for those parameters.
Returns: the reconstruction loss obtained, and the classification
predictions.
"""
return train_and_predict(self, param_dict, predict_on='Test')
if __name__ == "__main__":
print "MMAE NN MODEL SELECTION"
print "\tThis code will sweep a set of parameters to find the ideal settings for an MMAE+NN on a single dataset"
datasets_path = 'Data/Cleaned/'
if len(sys.argv) < 3:
print "Error: usage is python svm.py <filename> <label> <continue>"
print "\t<MMAE filename>: e.g. all_modalities_present.csv - program will look in the following directory for this file", DEFAULT_MAIN_DIRECTORY + datasets_path
print "\t<classification filename>: e.g. modalities_missing.csv - program will look in the same directory for this file and use it for classification data"
print "\t<extra arguments>: optional. If 'True', the wrapper will continue from where it left off by loading a previous validation results file",
print "if the name of a label, the wrapper will only train to classify on that label. Can have multiple extra arguments, e.g. happiness True"
sys.exit()
filename = sys.argv[1] #get data file from command line argument
classification_filename = sys.argv[2]
print "\nLoading dataset", DEFAULT_MAIN_DIRECTORY + datasets_path + filename
print ""
if len(sys.argv) >=4:
extra_args = sys.argv[3]
label = None
if 'true' in extra_args.lower() or 'cont' in extra_args.lower():
cont = True
else:
label = extra_args
if len(sys.argv) >= 5 and sys.argv[4] == 'True':
cont = True
print "Okay, will continue from a previously saved validation results file for this problem"
else:
cont = False
print ""
wrapper = MMAEClassificationWrapper(filename, classification_filename, dropbox_path=PATH_TO_DROPBOX,
datasets_path=datasets_path, cont=cont, wanted_label=label)
print "\nThe validation results dataframe will be saved in:", wrapper.results_path + wrapper.save_prefix + '.csv'
wrapper.run()