#!/usr/bin/env python
"""
Implementation of Dell Zhang's solution to Wikipedia's Participation Challenge on Kaggle
Kaggle competition: http://www.kaggle.com/c/wikichallenge
Dell Zhang's solution: http://blog.kaggle.com/2011/10/26/long-live-wikipedia-dell-zhang
"""
from calendar import monthrange
from datetime import datetime
from itertools import izip, takewhile
import cPickle as pickle
import locale
import math
import sys
import numpy as np
from sklearn import linear_model
from sklearn import svm
from sklearn import neighbors
from sklearn import gaussian_process
import cv2
#
# Constants
#
locale.setlocale(locale.LC_ALL, 'en_US')
PERIODS = [1/16.0, 1/8.0, 1/4.0, 1/2.0, 1, 2, 4, 12, 36, 108]
TEST_TIMES = {'training': 116,
'moredata': 111,
'validation': 84,
}
# NB: NeighborsRegressor was deprecated in sklearn 0.9
# NB: mode "mean" for NeighborsRegressor does not exist
MODEL_TYPES = {
'ols': linear_model.LinearRegression(),
'ridge': linear_model.Ridge(),
'lasso': linear_model.Lasso(),
'elasticnet': linear_model.ElasticNet(),
'sgd': linear_model.SGDRegressor(),
'svr': svm.SVR(),
'svr_linear': svm.SVR(kernel='linear', C=1),
'svr_rbf': svm.SVR(kernel='rbf', C=1e3),
'5nn': neighbors.KNeighborsRegressor(),
'120nn': neighbors.KNeighborsRegressor(n_neighbors=120),
'gaussian_process': gaussian_process.GaussianProcess(),
'gbt': cv2.GBTrees()
}
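# NB: learn() and estimate() below are wired to the OpenCV 'gbt' model; the
# sklearn models would use .fit()/.predict() instead (see the commented-out
# alternatives in those functions).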
GBT_WEAK_COUNT = 1000
#
# Data processing
#
def parse_timestamp(dt_str):
"""Maps a timestamp into months since 1/1/2001
"""
dt = datetime.strptime(dt_str, '%Y-%m-%d %H:%M:%S')
# months
months = (dt.year - 2001) * 12 + (dt.month - 1)
# days
    # NB: monthrange()[1] is the number of days in the month
delta = dt - datetime(dt.year, dt.month, 1)
days = delta.days + (delta.seconds + delta.microseconds/1000000.0)/(3600.0*24.0)
days /= float(monthrange(dt.year, dt.month)[1])
return months + days
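# A worked example of the mapping above (2001 is not a leap year):
#   parse_timestamp('2001-02-15 12:00:00')
#   -> months = 1, days = 14.5/28, i.e. roughly 1.5179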
def process_edits(dataset, time_test):
"""
Read through the list of edits and:
1) Extract a list of all editors who have made an edit in the 1 year prior
to the start of the testing period, and
2) Extract user_id, article_id, and transformed timestamp for each edit
"""
active_editors_test = []
processed_edits = []
data_file = open('data/%s.tsv' % dataset)
data_file.readline() # header
for line_number, line in enumerate(data_file):
# Progress report
if line_number % 10000 == 0: print "Processing line %s" % locale.format('%d', line_number, grouping=True)
# Save user_id, article_id, and transformed timestamp
attr = line.strip().split('\t')
user = int(attr[0])
article = int(attr[1])
parsed_timestamp = parse_timestamp(attr[4])
processed_edits.append([user, article, parsed_timestamp])
# If the edit happened within 1 year of the test time, note the editor
if parsed_timestamp >= time_test - 12 and parsed_timestamp < time_test:
active_editors_test.append(user)
data_file.close()
# deduplicate active_editors
active_editors_test = list(set(active_editors_test))
return active_editors_test, processed_edits
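# NB: the code above assumes a one-line header and tab-separated columns, with
# user_id in the first column (attr[0]), article_id in the second (attr[1]),
# and the timestamp in the fifth (attr[4]).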
def filter_and_group_edits(active_editors_test, processed_edits):
"""
1) Filter out 1y inactive users by doing a sort-merge join
2) Group the edits for each user
3) Sort the edits for each user by timestamp
Returns grouped_edits[user_id] = [(timestamp1, article1), (timestamp2, article2), ...]
"""
grouped_edits = {}
edits_enumerator = enumerate(processed_edits)
edit_number, edit = edits_enumerator.next()
for editor in active_editors_test:
# Skip to edits by editor
done = False
while edit[0] != editor and not done:
try:
edit_number, edit = edits_enumerator.next()
# Progress report
if edit_number % 10000 == 0: print "Processing edit %s" % locale.format('%d', edit_number, grouping=True)
except StopIteration:
done = True
# Collect all edits by editor
grouped_edits[editor] = []
while edit[0] == editor and not done:
_, article_id, timestamp = edit
grouped_edits[editor].append((timestamp, article_id))
try:
edit_number, edit = edits_enumerator.next()
# Progress report
if edit_number % 10000 == 0: print "Processing edit %s" % locale.format('%d', edit_number, grouping=True)
except StopIteration:
done = True
        # Sort the editor's edits by timestamp in descending order
        grouped_edits[editor].sort(key=lambda x: x[0], reverse=True)
return grouped_edits
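# NB: the sort-merge join above requires both active_editors_test and
# processed_edits to be sorted by user id; __main__ sorts both before calling.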
#
# Featurize/targetize
#
def count_edits(edits, deadline_month, months_prior):
"""Calculate the number of edits between deadline_month - months_prior and deadline_month
"""
return len([edit for edit in edits if deadline_month - months_prior < edit[0] < deadline_month])
def count_articles(edits, deadline_month, months_prior):
"""Calculate the number of articles edited between deadline_month - months_prior and deadline_month
"""
return len(set([edit[1] for edit in edits if deadline_month - months_prior < edit[0] < deadline_month]))
def time_between_first_and_last_edit(edits, deadline_month):
"""Calculate the time between the first edit and the last edit before the deadline_month
"""
last_edit_index = len([edit for edit in takewhile(lambda x: x[0] > deadline_month, edits)])
try:
return edits[last_edit_index][0] - edits[-1][0]
except IndexError:
return 0.0 # No edits before deadline_month
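# NB: time_between_first_and_last_edit() assumes edits are sorted by timestamp
# in descending order, as produced by filter_and_group_edits().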
def featurize_single_editor(edits, deadline_month):
"""Generate the 21 features for a single editor
"""
vector = []
# number of edits
vector += [float(count_edits(edits, deadline_month, months_prior)) for months_prior in PERIODS]
# number of articles
vector += [float(count_articles(edits, deadline_month, months_prior)) for months_prior in PERIODS]
    # log of one plus the time between first and last edit
vector += [math.log1p(time_between_first_and_last_edit(edits, deadline_month))]
return vector
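# Layout of the 21-dimensional feature vector, for reference:
#   vector[0:10]  = edit counts over the 10 PERIODS
#   vector[10:20] = distinct-article counts over the 10 PERIODS
#   vector[20]    = log1p of the time between first and last edit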
def featurize_all_editors(editors, grouped_edits, deadline_month):
"""Generate the 21 features for all editors
"""
return [featurize_single_editor(grouped_edits[editor], deadline_month) for editor in editors]
def load_validation_targets_test(active_editors_test):
"""Load the known targets_test for the validation set
"""
validation_targets_test_file = open('data/validation_solutions.csv', 'r')
validation_targets_test_file.readline() # header
    # Use a set for O(1) membership tests in the loop below
    active_editors_set = set(active_editors_test)
    edits_count = {}
for line_number, line in enumerate(validation_targets_test_file):
# Progress report
if line_number % 10000 == 0: print "Processing line %s" % locale.format('%d', line_number, grouping=True)
(editor_str, target_str) = line.strip().split(',')
editor = int(editor_str)
target = int(target_str)
        if editor not in active_editors_set:
            continue
edits_count[editor] = target
validation_targets_test_file.close()
return edits_count
def targetize_all_editors(editors, grouped_edits, deadline_month):
"""
Generate the targets for all editors, where the target for a single editor is the
    natural log of one plus the number of edits in the five months before deadline_month
"""
return [math.log1p(count_edits(grouped_edits[editor], deadline_month, 5)) for editor in editors]
#
# Model fitting and predicting
#
# TODO: parameterize model fitting
def learn(featurized_data_train, targets_train, model_type='gbt'):
"""Fit the model
"""
model = MODEL_TYPES[model_type]
# model.fit(featurized_data_train, targets_train) for sklearn
    # Other GBT params that could be added: 'subsample_portion': 0.8, 'shrinkage': 0.01
model.train(featurized_data_train,
cv2.CV_ROW_SAMPLE,
targets_train,
params={'weak_count':GBT_WEAK_COUNT})
return model
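# A minimal sklearn alternative to the OpenCV path above (a sketch, usable
# with any of the sklearn keys in MODEL_TYPES):
#   model = MODEL_TYPES['ridge']
#   model.fit(featurized_data_train, targets_train)
#   forecasts = model.predict(featurized_data_test)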
def drift(active_editors_test, grouped_edits, time_train, time_test):
"""Calculate drift
"""
average_train = sum([math.log1p(count_edits(grouped_edits[editor], time_train, 5))
for editor in active_editors_test])/len(active_editors_test)
average_test = sum([math.log1p(count_edits(grouped_edits[editor], time_test, 5))
for editor in active_editors_test])/len(active_editors_test)
return average_test - average_train
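# NB: drift is the change in the mean log-target from the train deadline to
# the test deadline; estimate() adds it to each forecast to correct for
# global shifts in editing activity.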
# TODO: parameterize estimation
def estimate(model, featurized_data_test, drift):
    """Predict the test targets, apply the drift correction, and clamp at zero
    """
    # model.predict(featurized_data_test) for sklearn
forecasts = [model.predict(sample) for sample in featurized_data_test]
return [max(y + drift, 0) for y in forecasts]
def rmsle(targets_test_predicted, targets_test):
    """Root mean squared log error (the targets are already log1p-transformed)
    """
n = len(targets_test_predicted)
sle = sum([math.pow(targets_test_predicted[i] - targets_test[i], 2) for i in range(n)])
return math.sqrt(sle/n)
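# For example: rmsle([1.0, 2.0], [0.0, 2.0]) == sqrt(1/2.0) ~= 0.7071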
#
# Output formatting
#
def format_for_vw(featurized_data, targets):
"""Return a string in Vowpal Wabbit's format
"""
return '\n'.join(['%s |features %s' % (label, ' '.join(['%s:%s' % (i, v) for i, v in enumerate(features)]))
for label, features in izip(targets, featurized_data)])
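# Each output line looks like (a sketch with made-up values):
#   1.7918 |features 0:3.0 1:5.0 ... 20:2.3979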
#
# Main
#
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.exit("Usage: python driver.py {%s}" % '|'.join(TEST_TIMES))
    dataset = sys.argv[1]
try:
time_test = TEST_TIMES[dataset]
time_train = time_test - 5
except KeyError:
sys.exit("The dataset %s is unknown." % dataset)
#
# 1. Process data
#
print "Generating active editors and processed edits lists"
active_editors_test, processed_edits = process_edits(dataset, time_test)
print "Done generating active editors and processed edits lists"
print "Sorting and pickling the active editors"
active_editors_test.sort()
pkl_file = open('data/%s_active_editors_test.pkl' % dataset, 'wb')
pickle.dump(active_editors_test, pkl_file, -1)
pkl_file.close()
print "Done sorting and pickling the active editors"
print "Sorting and pickling the processed edits"
    processed_edits.sort(key=lambda x: x[0])
pkl_file = open('data/%s_processed_edits.pkl' % dataset, 'wb')
pickle.dump(processed_edits, pkl_file, -1)
pkl_file.close()
print "Done sorting and pickling the processed edits"
print "Filtering and grouping edits"
grouped_edits = filter_and_group_edits(active_editors_test, processed_edits)
print "Done filtering and grouping edits"
print "Pickling grouped edits"
pkl_file = open('data/%s_grouped_edits.pkl' % dataset, 'wb')
pickle.dump(grouped_edits, pkl_file, -1)
pkl_file.close()
print "Done pickling grouped edits"
#
# 2. Generate featurized data and targets for training
#
print "Calculating elgible editors for training"
active_editors_train = [editor for editor in active_editors_test
if count_edits(grouped_edits[editor], time_train, 12)]
pkl_file = open('data/%s_active_editors_train.pkl' % dataset, 'wb')
pickle.dump(active_editors_train, pkl_file, -1)
pkl_file.close()
print "Done calculating elgible editors for training"
print "Generating featurized data for training"
featurized_data_train = np.array(featurize_all_editors(active_editors_train, grouped_edits, time_train), dtype=np.float32)
pkl_file = open('data/%s_featurized_data_train.pkl' % dataset, 'wb')
pickle.dump(featurized_data_train, pkl_file, -1)
pkl_file.close()
print "Done generating featurized data for training"
print "Generating targets for training"
targets_train = np.array(targetize_all_editors(active_editors_train, grouped_edits, time_train + 5), dtype=np.float32)
pkl_file = open('data/%s_targets_train.pkl' % dataset, 'wb')
pickle.dump(targets_train, pkl_file, -1)
pkl_file.close()
print "Done generating targets for training"
#
# 3. Train model
#
print "Fitting model"
model = learn(featurized_data_train, targets_train)
print "Done fitting model"
#
# 4. Generate featurized data and targets for testing
#
print "Generating featurized data for testing"
featurized_data_test = np.array(featurize_all_editors(active_editors_test, grouped_edits, time_test), dtype=np.float32)
pkl_file = open('data/%s_featurized_data_test.pkl' % dataset, 'wb')
pickle.dump(featurized_data_test, pkl_file, -1)
pkl_file.close()
print "Done fenerating featurized data for testing"
print "Generating targets for testing"
if dataset == 'validation':
print "Loading known validation test targets"
edits_count = load_validation_targets_test(active_editors_test)
targets_test = np.array([math.log1p(edits_count[editor]) for editor in active_editors_test], dtype=np.float32)
print "Done loading known validation test targets"
else:
targets_test = np.array(targetize_all_editors(active_editors_test, grouped_edits, time_test + 5), dtype=np.float32)
pkl_file = open('data/%s_targets_test.pkl' % dataset, 'wb')
pickle.dump(targets_test, pkl_file, -1)
pkl_file.close()
print "Done generating targets for testing"
#
# 5. Predict the targets
#
print "Calculating drift"
# validation drift = -0.271349248816
# moredata drift = 0.0112541788177
    # Avoid shadowing the drift() function defined above
    drift_amount = drift(active_editors_test, grouped_edits, time_train, time_test)
    print "Done calculating drift: %s" % drift_amount
    print "Predicting the test targets"
    targets_test_predicted = estimate(model, featurized_data_test, drift_amount)
print "Done predicting the test targets"
#
# 6. Calculate loss
#
print "RMSLE = %.6f" % rmsle(targets_test_predicted, targets_test)