def load_parameters():
'''
Loads the defined parameters
'''
# Input data params
DATASET_NAME = 'cnn_polarity'
SRC_LAN = 'de' # Input language
TRG_LAN = 'en' # Output language
MODE = 'semisupervised-selection' # 'training', 'sampling', 'semisupervised-selection'
BINARY_SELECTION = True # Binary classification problem (currently, 'semisupervised-selection' only supports BINARY_SELECTION)
BILINGUAL_SELECTION = True # Use source and target text for classification
# Path to data
ROOT_PATH = '/media/HDD_2TB/DATASETS/%s/' % DATASET_NAME # Root path to the data folder
DATA_ROOT_PATH = ROOT_PATH + 'DATA/Emea-Euro/De-En' # Path to the corpora folder
DEST_ROOT_PATH = ROOT_PATH + 'Selection-Keras/' + SRC_LAN + TRG_LAN # Path to store results
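# With the defaults above, these resolve to:
#   ROOT_PATH      -> '/media/HDD_2TB/DATASETS/cnn_polarity/'
#   DATA_ROOT_PATH -> '/media/HDD_2TB/DATASETS/cnn_polarity/DATA/Emea-Euro/De-En'
#   DEST_ROOT_PATH -> '/media/HDD_2TB/DATASETS/cnn_polarity/Selection-Keras/deen'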
DEBUG = False # If True, temporary files will be stored
INSTANCES_TO_ADD = 50000 # 'r' parameter. Number of sentences added at each iteration
if BINARY_SELECTION:
POSITIVE_FILENAME = 'EMEA.de-en.clean' # In-domain corpus (I)
NEGATIVE_FILENAME = 'dev' # Initial negative corpus (N_0)
if 'semisupervised' in MODE:
POOL_FILENAME = 'training' # Initial pool of out-of-domain sentences (G_0)
# Fill these dictionaries for a regular sentence classification task
TEXT_FILES = {} # e.g. {'train': 'training.' + SRC_LAN, 'val': 'val.' + SRC_LAN}
CLASS_FILES = {} # e.g. {'train': 'training.class', 'val': 'val.class'}
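# For orientation, a rough sketch of the selection loop these parameters drive
# (hypothetical helper names; the actual loop lives in the training code):
#
#     pool = load_sentences(POOL_FILENAME)          # G_0
#     for i in range(N_ITER):                       # N_ITER is set below
#         classifier = train(positive, negative)    # I vs. N_i
#         ranked = sorted(pool, key=classifier.score, reverse=True)
#         selected = ranked[:INSTANCES_TO_ADD]      # the top-r sentences
#         pool = ranked[INSTANCES_TO_ADD:]          # G_{i+1}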
# Dataset parameters
INPUTS_IDS_DATASET = ['source_text', 'target_text'] if BILINGUAL_SELECTION else ['source_text'] # Corresponding inputs of the dataset
OUTPUTS_IDS_DATASET = ['class'] # Corresponding outputs of the dataset
INPUTS_IDS_MODEL = ['source_text', 'target_text'] if BILINGUAL_SELECTION else ['source_text'] # Corresponding inputs of the built model
OUTPUTS_IDS_MODEL = ['class'] # Corresponding outputs of the built model
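# i.e. with BILINGUAL_SELECTION enabled, the model consumes a
# (source_text, target_text) sentence pair and predicts the single 'class' output.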
# Evaluation params
METRICS = ['multilabel_metrics'] # Metric used for evaluating the model after each epoch (leave empty if only prediction is required)
EVAL_ON_SETS = [] # Possible values: 'train', 'val' and 'test' (external evaluator)
EVAL_ON_SETS_KERAS = [] # Possible values: 'train', 'val' and 'test' (Keras' evaluator)
START_EVAL_ON_EPOCH = 1 # First epoch where the model will be evaluated
EVAL_EACH_EPOCHS = True # Select whether to evaluate every N epochs or every N updates
EVAL_EACH = 1 # Sets the evaluation frequency (epochs or updates)
# Early stop parameters
EARLY_STOP = True # Turns on/off the early stop protocol
PATIENCE = 15 # We'll stop if the val STOP_METRIC does not improve after this number of evaluations
STOP_METRIC = 'accuracy' # Metric for the stop
# Word representation params
TOKENIZATION_METHOD = 'tokenize_CNN_sentence' # Tokenization method to apply to the input text
# Input text parameters
VOCABULARY_SIZE = 0 # Size of the input vocabulary. Set to 0 to use all words; otherwise, the vocabulary is truncated to the most frequent words.
INPUT_VOCABULARY_SIZE = 0 # Size of the input vocabulary. Set to 0 to use all words; otherwise, the vocabulary is truncated to the most frequent words.
MIN_OCCURRENCES_VOCAB = 0 # Minimum number of occurrences required for a word to enter the vocabulary. Set to 0 to keep all words.
MAX_INPUT_TEXT_LEN = 40 # Maximum length of the input sequence
# Optimizer parameters (see model.compile() function)
LOSS = 'categorical_crossentropy'
CLASS_MODE = 'categorical'
OPTIMIZER = 'Adam' # Optimizer
LR = 0.0001 # Learning rate (recommended values: Adam 0.001, Adadelta 1.0)
WEIGHT_DECAY = 1e-4 # L2 regularization
CLIP_C = 9. # During training, clip gradients to this norm
SAMPLE_WEIGHTS = False # Select whether we use a weights matrix (mask) for the data outputs
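# For reference, a minimal sketch of how these values would typically reach
# model.compile() (assuming the Keras API; the actual call happens in the
# model-building code, and WEIGHT_DECAY is normally attached to layer weights
# as an L2 regularizer rather than passed to the optimizer):
#
#     from keras.optimizers import Adam
#     model.compile(loss=LOSS, optimizer=Adam(lr=LR, clipnorm=CLIP_C))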
# Training parameters
MAX_EPOCH = 10 # Stop after training for this number of epochs
BATCH_SIZE = 512 # Training batch size
N_ITER = 15 # Number of iterations of the semisupervised selection
HOMOGENEOUS_BATCHES = False # Use batches with homogeneous output lengths for every minibatch (Dangerous!)
PARALLEL_LOADERS = 8 # Parallel data batch loaders
EPOCHS_FOR_SAVE = 1 # Number of epochs between model saves
WRITE_VALID_SAMPLES = True # Write validation samples to a file
DATA_AUGMENTATION = False # Apply data augmentation on input data (still unimplemented for text inputs)
# Model parameters
MODEL_TYPE = 'BLSTM_Classifier' # Model type. See model_zoo.py
CLASSIFIER_ACTIVATION = 'softmax' # Last layer activation
PAD_ON_BATCH = 'CNN' not in MODEL_TYPE # Pad each batch to the length of its longest sample (CNN models use fixed-length padding instead)
FILL = 'end' if 'CNN' not in MODEL_TYPE else 'center' # Whether to fill the 'end', 'center' or 'start' of the sentence with 0s
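# Example: with MAX_INPUT_TEXT_LEN = 40 and FILL = 'end', a 3-token sentence is
# padded as [w1, w2, w3, 0, 0, ..., 0]; FILL = 'center' splits the zeros between
# both ends of the sentence, which is what CNN models use here.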
N_CLASSES = 2 # Number of classes
# Word embedding parameters
SRC_PRETRAINED_VECTORS = '/media/HDD_2TB/DATASETS/cnn_polarity/DATA/word2vec.%s.npy' % SRC_LAN # Path to pretrained vectors. Set to None if you don't want to use pretrained vectors.
SRC_PRETRAINED_VECTORS_TRAINABLE = True # Whether to fine-tune the word embedding vectors.
TRG_PRETRAINED_VECTORS = '/media/HDD_2TB/DATASETS/cnn_polarity/DATA/word2vec.%s.npy' % TRG_LAN # Path to pretrained vectors. Set to None if you don't want to use pretrained vectors.
TRG_PRETRAINED_VECTORS_TRAINABLE = True # Whether to fine-tune the word embedding vectors.
SRC_TEXT_EMBEDDING_HIDDEN_SIZE = 300 # When using pretrained word embeddings, this parameter must match the word embedding size
TRG_TEXT_EMBEDDING_HIDDEN_SIZE = 300 # When using pretrained word embeddings, this parameter must match the word embedding size
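# A minimal sanity check one could run (assuming, as in related Keras-based
# tooling, that each .npy file stores a {word: vector} dict):
#
#     import numpy as np
#     vecs = np.load(SRC_PRETRAINED_VECTORS, allow_pickle=True).item()
#     assert len(next(iter(vecs.values()))) == SRC_TEXT_EMBEDDING_HIDDEN_SIZE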
# LSTM layers dimensions (Only used if needed)
LSTM_ENCODER_HIDDEN_SIZE = 300 # For models with LSTM encoder
# FC layers for initializing the first LSTM state
# Here we should only specify the activation function of each layer (their sizes are fixed by the LSTM state size)
# (e.g INIT_LAYERS = ['tanh', 'relu'])
INIT_LAYERS = ['tanh']
# CNN layers parameters (Only used if needed)
NUM_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
POOL_LENGTH = 2
CNN_ACTIVATION = 'relu'
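# i.e. parallel convolutions with window sizes of 3, 4 and 5 words,
# NUM_FILTERS feature maps each, with pooling of length POOL_LENGTH.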
# General architectural parameters
# Fully-connected layers applied on top of the text embedding
# Here we should specify the activation function and the output dimension
# (e.g. ADDITIONAL_EMBEDDING_LAYERS = [('linear', 1024)])
ADDITIONAL_EMBEDDING_LAYERS = []
# Additional fully-connected layers applied before the softmax.
# Here we should specify the activation function and the output dimension
# (e.g. DEEP_OUTPUT_LAYERS = [('tanh', 600), ('relu', 400), ('relu', 200)])
DEEP_OUTPUT_LAYERS = [('relu', 200), ('linear', 100)]
# Regularizers
USE_DROPOUT = False # Use dropout
DROPOUT_P = 0.5 # Fraction of units to drop
USE_NOISE = True # Use Gaussian noise during training
NOISE_AMOUNT = 0.01 # Amount of noise
USE_BATCH_NORMALIZATION = True # If True, it is recommended to deactivate dropout
BATCH_NORMALIZATION_MODE = 1 # See documentation in Keras' BN
USE_PRELU = False # Use PReLU activations as a regularizer
USE_L2 = False # L2 normalization on the features
# Results plot and models storing parameters
EXTRA_NAME = '' # This will be appended to the end of the model name
MODEL_NAME = DATASET_NAME + '_' + MODEL_TYPE + '_txtemb_' + str(SRC_TEXT_EMBEDDING_HIDDEN_SIZE) + \
'_addemb_' + '_'.join([layer[0] for layer in ADDITIONAL_EMBEDDING_LAYERS]) + \
'_' + str(LSTM_ENCODER_HIDDEN_SIZE) + \
'_deepout_' + '_'.join([layer[0] for layer in DEEP_OUTPUT_LAYERS]) + \
'_' + OPTIMIZER
MODEL_NAME += EXTRA_NAME
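# With the defaults above, MODEL_NAME resolves to:
#   'cnn_polarity_BLSTM_Classifier_txtemb_300_addemb__300_deepout_relu_linear_Adam'
# (note the double underscore after 'addemb': ADDITIONAL_EMBEDDING_LAYERS is empty)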
STORE_PATH = 'trained_models/' + MODEL_NAME + '/' # Models and evaluation results will be stored here
DATASET_STORE_PATH = 'datasets/' # Dataset instance will be stored here
SAMPLING_SAVE_MODE = 'numpy' # Possible values: 'list', 'numpy', 'vqa'
VERBOSE = 1 # Verbosity level
RELOAD = 0 # If 0, start training from scratch; otherwise, load the model saved at epoch 'RELOAD'
REBUILD_DATASET = True # Build again or use stored instance
# Extra parameters for special trainings
TRAIN_ON_TRAINVAL = False # Train the model on both the training and validation sets combined
FORCE_RELOAD_VOCABULARY = False # Force building a new vocabulary from the training samples (applicable if RELOAD > 1)
# ============================================
parameters = locals().copy()
return parameters
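

if __name__ == '__main__':
    # Illustrative smoke test: load the parameters and report the derived model name.
    params = load_parameters()
    print('Loaded %d parameters' % len(params))
    print('MODEL_NAME: %s' % params['MODEL_NAME'])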