# asap_reader.py
import random
import codecs
import sys
import nltk
import logging
import re
import numpy as np
import pickle as pk
logger = logging.getLogger(__name__)
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')
ref_scores_dtype = 'int32'
asap_ranges = {
    0: (0, 60),
    1: (2, 12),
    2: (1, 6),
    3: (0, 3),
    4: (0, 3),
    5: (0, 4),
    6: (0, 4),
    7: (0, 30),
    8: (0, 60),
    9: (0, 25)
}

def get_ref_dtype():
    return ref_scores_dtype

def tokenize(string):
    tokens = nltk.word_tokenize(string)
    # NLTK's word tokenizer splits ASAP anonymization markers such as '@CAPS1' into
    # '@' and 'CAPS1'; merge them back into a single '@CAPS' token (trailing digits stripped).
    for index, token in enumerate(tokens):
        if token == '@' and (index + 1) < len(tokens):
            tokens[index + 1] = '@' + re.sub('[0-9]+.*', '', tokens[index + 1])
            tokens.pop(index)
    return tokens
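
# Illustrative example, assuming the tokenizer splits '@CAPS1' into '@' and 'CAPS1':
#   tokenize("Dear @CAPS1, thank you.")  ->  ['Dear', '@CAPS', ',', 'thank', 'you', '.']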

def get_score_range(prompt_id):
    return asap_ranges[prompt_id]

def get_model_friendly_scores(scores_array, prompt_id_array):
    # Scale raw ASAP scores into [0, 1] using the per-prompt score range.
    arg_type = type(prompt_id_array)
    assert arg_type in {int, np.ndarray}
    if arg_type is int:
        low, high = asap_ranges[prompt_id_array]
        scores_array = (scores_array - low) / (high - low)
    else:
        assert scores_array.shape[0] == prompt_id_array.shape[0]
        dim = scores_array.shape[0]
        low = np.zeros(dim)
        high = np.zeros(dim)
        for ii in range(dim):
            low[ii], high[ii] = asap_ranges[prompt_id_array[ii]]
        scores_array = (scores_array - low) / (high - low)
    # assert np.all(scores_array >= 0) and np.all(scores_array <= 1)
    return scores_array
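
# Worked example: prompt 1 scores lie in (2, 12), so a raw score of 8 is scaled to
# (8 - 2) / (12 - 2) = 0.6, e.g.
#   get_model_friendly_scores(np.array([8.0]), 1)  # -> array([0.6])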

def convert_to_dataset_friendly_scores(scores_array, prompt_id_array):
    # Map model outputs in [0, 1] back to the original score range of each prompt.
    arg_type = type(prompt_id_array)
    assert arg_type in {int, np.ndarray}
    if arg_type is int:
        low, high = asap_ranges[prompt_id_array]
        scores_array = scores_array * (high - low) + low
        assert np.all(scores_array >= low) and np.all(scores_array <= high)
    else:
        assert scores_array.shape[0] == prompt_id_array.shape[0]
        dim = scores_array.shape[0]
        low = np.zeros(dim)
        high = np.zeros(dim)
        for ii in range(dim):
            low[ii], high[ii] = asap_ranges[prompt_id_array[ii]]
        scores_array = scores_array * (high - low) + low
    return scores_array
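
# Worked example, inverting the scaling above for prompt 1:
#   convert_to_dataset_friendly_scores(np.array([0.6]), 1)  # -> array([8.0])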

def is_number(token):
    return bool(num_regex.match(token))

def load_vocab(vocab_path):
    logger.info('Loading vocabulary from: ' + vocab_path)
    with open(vocab_path, 'rb') as vocab_file:
        vocab = pk.load(vocab_file)
    return vocab

def create_vocab(file_path, prompt_id, maxlen, vocab_size, tokenize_text, to_lower):
    logger.info('Creating vocabulary from: ' + file_path)
    if maxlen > 0:
        logger.info(' Removing sequences with more than ' + str(maxlen) + ' words')
    total_words, unique_words = 0, 0
    word_freqs = {}
    with codecs.open(file_path, mode='r', encoding='UTF8') as input_file:
        next(input_file)  # skip the TSV header line
        for line in input_file:
            tokens = line.strip().split('\t')
            essay_id = int(tokens[0])
            essay_set = int(tokens[1])
            content = tokens[2].strip()
            if essay_set == prompt_id or prompt_id <= 0:
                if to_lower:
                    content = content.lower()
                if tokenize_text:
                    content = tokenize(content)
                    if to_lower:
                        content = [w.lower() for w in content]
                else:
                    content = content.split()
                if maxlen > 0 and len(content) > maxlen:
                    content = content[:maxlen]
                for word in content:
                    try:
                        word_freqs[word] += 1
                    except KeyError:
                        unique_words += 1
                        word_freqs[word] = 1
                    total_words += 1
    logger.info(' %i total words, %i unique words' % (total_words, unique_words))
    import operator
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
    if vocab_size <= 0:
        # Choose vocab size automatically by removing all singletons
        vocab_size = 0
        for word, freq in sorted_word_freqs:
            if freq > 1:
                vocab_size += 1
    vocab = {'<pad>': 0, '<unk>': 1, '<num>': 2}
    vcb_len = len(vocab)
    index = vcb_len
    for word, _ in sorted_word_freqs[:vocab_size - vcb_len]:
        vocab[word] = index
        index += 1
    return vocab
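
# Sketch of the returned mapping (entries beyond the three reserved tokens are hypothetical):
#   {'<pad>': 0, '<unk>': 1, '<num>': 2, 'the': 3, ',': 4, ...}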

def read_essays(file_path, prompt_id):
    logger.info('Reading tsv from: ' + file_path)
    essays_list = []
    essays_ids = []
    with codecs.open(file_path, mode='r', encoding='UTF8') as input_file:
        next(input_file)  # skip the TSV header line
        for line in input_file:
            tokens = line.strip().split('\t')
            if int(tokens[1]) == prompt_id or prompt_id <= 0:
                if tokens[3] != '##':  # rows with '##' in the score column are skipped
                    essays_list.append(tokens[2].strip())
                    essays_ids.append(int(tokens[0]))
    return essays_list, essays_ids

def read_dataset(file_path, prompt_id, maxlen, vocab, tokenize_text, to_lower, score_index=6, char_level=False):
    logger.info('Reading dataset from: ' + file_path)
    if maxlen > 0:
        logger.info(' Removing sequences with more than ' + str(maxlen) + ' words')
    data_x, data_y, prompt_ids = [], [], []
    num_hit, unk_hit, total = 0., 0., 0.
    maxlen_x = -1
    with codecs.open(file_path, mode='r', encoding='UTF8') as input_file:
        next(input_file)  # skip the TSV header line
        for line in input_file:
            tokens = line.strip().split('\t')
            if tokens[3] != '##':  # rows with '##' in the score column are skipped
                essay_id = int(tokens[0])
                essay_set = int(tokens[1])
                content = tokens[2].strip()
                if essay_set == 9:
                    # prompt 9 keeps its gold score in column 4 rather than score_index
                    score = float(tokens[3])
                else:
                    score = float(tokens[score_index])
                if essay_set == prompt_id or prompt_id <= 0:
                    if to_lower:
                        content = content.lower()
                    if char_level:
                        # content = list(content)
                        raise NotImplementedError
                    else:
                        if tokenize_text:
                            content = tokenize(content)
                        else:
                            content = content.split()
                    if maxlen > 0 and len(content) > maxlen:
                        content = content[:maxlen]
                    indices = []
                    if char_level:
                        raise NotImplementedError
                    else:
                        for word in content:
                            if is_number(word):
                                indices.append(vocab['<num>'])
                                num_hit += 1
                            elif word in vocab:
                                indices.append(vocab[word])
                            else:
                                indices.append(vocab['<unk>'])
                                unk_hit += 1
                            total += 1
                    data_x.append(indices)
                    data_y.append(score)
                    prompt_ids.append(essay_set)
                    if maxlen_x < len(indices):
                        maxlen_x = len(indices)
    logger.info(' <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100 * num_hit / total, 100 * unk_hit / total))
    return data_x, data_y, prompt_ids, maxlen_x
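
# Shape of the return value (values hypothetical): data_x is a list of index lists, data_y
# the raw scores, prompt_ids the essay_set per essay, and maxlen_x the longest sequence seen:
#   ([[3, 17, 2, ...], ...], [8.0, ...], [1, ...], 486)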

def get_data(paths, prompt_id, vocab_size, maxlen, tokenize_text=True, to_lower=True, sort_by_len=False,
             vocab_path=None, score_index=6):
    train_path, dev_path, test_path = paths[0], paths[1], paths[2]
    if not vocab_path:
        vocab = create_vocab(train_path, prompt_id, maxlen, vocab_size, tokenize_text, to_lower)
        if len(vocab) < vocab_size:
            logger.warning('The vocabulary includes only %i words (less than %i)' % (len(vocab), vocab_size))
        else:
            assert vocab_size == 0 or len(vocab) == vocab_size
    else:
        vocab = load_vocab(vocab_path)
        if len(vocab) != vocab_size:
            logger.warning(
                'The vocabulary includes %i words which is different from given: %i' % (len(vocab), vocab_size))
    logger.info(' Vocab size: %i' % (len(vocab)))
    train_x, train_y, train_prompts, train_maxlen = read_dataset(train_path, prompt_id, maxlen, vocab, tokenize_text,
                                                                 to_lower)
    dev_x, dev_y, dev_prompts, dev_maxlen = read_dataset(dev_path, prompt_id, maxlen, vocab, tokenize_text, to_lower)
    test_x, test_y, test_prompts, test_maxlen = read_dataset(test_path, prompt_id, maxlen, vocab, tokenize_text, to_lower)
    overal_maxlen = max(train_maxlen, dev_maxlen, test_maxlen)
    return (
        (train_x, train_y, train_prompts), (dev_x, dev_y, dev_prompts), (test_x, test_y, test_prompts), vocab, len(vocab),
        overal_maxlen, 1)
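
# Minimal usage sketch with hypothetical file paths; the TSVs are expected to follow the
# ASAP layout read above (header row, tab-separated, essay text in the third column).
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    fold_paths = ['data/fold_0/train.tsv', 'data/fold_0/dev.tsv', 'data/fold_0/test.tsv']  # hypothetical paths
    (train, dev, test, vocab, vocab_size, overal_maxlen, _) = get_data(
        fold_paths, prompt_id=1, vocab_size=4000, maxlen=0)
    train_x, train_y, train_prompts = train
    logger.info('Loaded %i training essays; longest sequence has %i tokens' % (len(train_x), overal_maxlen))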