# file_splitter.py
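# Overview (summarising the pipeline implemented below): sample input/train.csv at a
# fixed rate, merge in input/destinations.csv, expand the search / check-in / check-out
# dates into weekday, month and month-day features, fit a RandomForestClassifier,
# report MAP@5 on a held-out split, and write the top-5 predicted hotel clusters per
# test row as a submission CSV.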
import csv
import pandas as pd
import numpy as np
import datetime
import scipy.stats as stats
from ml_metrics import mapk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


def parse_dates(df):
    """Expand date_time, srch_ci and srch_co into weekday/month/month-day columns."""
    search_date = list(df['date_time'])
    search_date = list(map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'), search_date))
    df['search_weekday'] = list(map(lambda x: int(x.strftime('%w')), search_date))
    df['search_month'] = list(map(lambda x: int(x.strftime('%m')), search_date))
    df['search_monthday'] = list(map(lambda x: int(x.strftime('%d')), search_date))
    del df['date_time']
    chkin_date = list(df['srch_ci'])
    chkin_date = list(map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'), chkin_date))
    df['ci_weekday'] = list(map(lambda x: int(x.strftime('%w')), chkin_date))
    df['ci_month'] = list(map(lambda x: int(x.strftime('%m')), chkin_date))
    df['ci_monthday'] = list(map(lambda x: int(x.strftime('%d')), chkin_date))
    del df['srch_ci']
    chkout_date = list(df['srch_co'])
    chkout_date = list(map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'), chkout_date))
    df['co_weekday'] = list(map(lambda x: int(x.strftime('%w')), chkout_date))
    df['co_month'] = list(map(lambda x: int(x.strftime('%m')), chkout_date))
    df['co_monthday'] = list(map(lambda x: int(x.strftime('%d')), chkout_date))
    del df['srch_co']
    return df
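
# Illustrative example (hypothetical value): the search timestamp '2014-08-11 07:46:59'
# becomes search_weekday=1 (Monday, since %w uses Sunday=0), search_month=8 and
# search_monthday=11; srch_ci and srch_co get the same ci_* / co_* decomposition.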

def percent2mapk(predict_percent, k):
    """Return, for each probability row, the column indices of the k largest entries, best first."""
    predict_map = []
    for i_row, pred_row in enumerate(predict_percent):
        predict_map.append([])
        # rankdata with method='ordinal' gives ranks 1..n, so the largest entry has rank n
        ranked_row = list(stats.rankdata(pred_row, method='ordinal'))
        for op_rank in range(k):
            predict_map[i_row].append(ranked_row.index(len(ranked_row) - op_rank))
    return predict_map

def list2str(predict_list, join_by):
    str_list = []
    for predict_result in predict_list:
        predict_result = list(map(lambda x: str(x), predict_result))
        str_list.append(join_by.join(predict_result))
    return str_list

def y2list(y_array):
    y_list = []
    for actual in y_array:
        y_list.append([actual])
    return y_list
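
# Illustrative example (hypothetical probabilities): if a row's five largest entries sit
# in columns 91, 48, 5, 37 and 2 (best first), percent2mapk returns [91, 48, 5, 37, 2],
# list2str(..., ' ') turns that into '91 48 5 37 2', and y2list wraps each true label as
# [label] so ml_metrics.mapk can compare actual and predicted lists.
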
"""
Variables
"""
# Number of hotel-cluster classes
n_classes = 100
# Sampling rate: keep one of every `samp` rows of train
samp = 100
# Maximum number of sampled rows to keep for training
n_rows = 1e8
# Whether to merge the destinations table into train/test
merge = True
# Filename for saving the sampled train set; None to skip saving
train_file = None
# Random forest classifier configuration
classifier = RandomForestClassifier(n_estimators=23, max_depth=30, random_state=42, max_features=0.25)
# Batch size used when calling predict_proba
test_batch = 5000
"""
Read data
"""
# Read destinations
destinations = pd.read_csv('input/destinations.csv', index_col=0)
# Read and sample train
print('Reading and sampling the train table')
train_rows = 0
with open('input/train.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    columns = next(spamreader)
    train_samp = []
    i = 0
    for row in spamreader:
        if i % samp == 0:
            train_samp.append(row)
            train_rows += 1
            if train_rows == n_rows:
                break
        if i % 1e6 == 0:
            print(i)
        i += 1
# Build a pandas DataFrame from the sampled rows
train_samp = pd.DataFrame(train_samp, columns=columns)
train_samp.index = train_samp.user_id
train_samp.hotel_cluster = train_samp.hotel_cluster.astype(int)
# Merge if required
if merge:
    print('Merging')
    train_samp = pd.merge(train_samp, destinations, left_on=train_samp.srch_destination_id.values.astype(int),
                          right_on=destinations.index.values, how='left')
# Export train file
if train_file:
    print('Saving sampled train to file')
    train_samp.to_csv(train_file)
# Separate X_train and y_train
target = train_samp.hotel_cluster
# Removing excess columns in train
del train_samp['hotel_cluster']
del train_samp['cnt']
del train_samp['is_booking']
# Read test
test = pd.read_csv('input/test.csv', index_col=0)
"""
Feature engineering
"""
print('Feature engineering')
del train_samp['user_id']
del test['user_id']
# Replace missing or malformed dates with '1970-01-01'
train_samp[['date_time', 'srch_ci', 'srch_co']] = train_samp[['date_time', 'srch_ci', 'srch_co']].astype(str)
test[['date_time', 'srch_ci', 'srch_co']] = test[['date_time', 'srch_ci', 'srch_co']].astype(str)
train_samp[['date_time', 'srch_ci', 'srch_co']] = train_samp[['date_time', 'srch_ci', 'srch_co']].replace(
    ['', 'nan', '2161-10-00'], ['1970-01-01', '1970-01-01', '1970-01-01'])
test[['date_time', 'srch_ci', 'srch_co']] = test[['date_time', 'srch_ci', 'srch_co']].replace(
    ['', 'nan', '2161-10-00'], ['1970-01-01', '1970-01-01', '1970-01-01'])
# Remove NaNs
train_samp = train_samp.replace('', '-10')
test = test.replace('', '-10')
train_samp = train_samp.fillna(-10)
test = test.fillna(-10)
# Parse date
train_samp = parse_dates(train_samp)
train_samp = train_samp.astype('float32')
test = parse_dates(test)
test = test.astype('float32')
# Debug printing
# print(train_samp.columns.values)
"""
MLing, CV
"""
print('CV')
X_train, X_test, y_train, y_test = train_test_split(train_samp.values, target.values, test_size=0.33, random_state=42)
classifier.fit(X_train, y_train)
# Note: assumes all n_classes clusters appear in the sampled training data, so
# predict_proba returns n_classes columns and column i corresponds to cluster i.
train_predict_prob = np.zeros((X_test.shape[0], n_classes))
for batch_i in np.arange(0, X_test.shape[0], test_batch):
    if (batch_i + test_batch) < X_test.shape[0]:
        train_predict_prob[batch_i: batch_i + test_batch, :] = \
            classifier.predict_proba(X_test[batch_i: batch_i + test_batch, :])
    else:
        train_predict_prob[batch_i:, :] = classifier.predict_proba(X_test[batch_i:, :])
train_predict_map = percent2mapk(train_predict_prob, 5)
y_test_list = y2list(y_test)
print('The mean average precision is %.4f' % mapk(y_test_list, train_predict_map, k=5))
train_predict_str = list2str(train_predict_map, ' ')  # string form of the hold-out predictions (not used further)
"""
MLing
"""
print('Batch predicting test')
classifier.fit(train_samp.values, target.values)
# Freeing memory
del train_samp, target, X_train, X_test, y_train, y_test, train_predict_prob, train_predict_map
if merge:
    test = pd.merge(test, destinations, left_on=test.srch_destination_id.values.astype(int),
                    right_on=destinations.index.values, how='left')
    test = test.fillna(-10)
test_predict_prob = np.zeros((test.shape[0], n_classes))
for batch_i in np.arange(0, test.shape[0], test_batch):
    if (batch_i + test_batch) < test.shape[0]:
        cur_batch = test.iloc[batch_i: batch_i + test_batch, :]
        test_predict_prob[batch_i: batch_i + test_batch, :] = classifier.predict_proba(cur_batch.values)
    else:
        cur_batch = test.iloc[batch_i:, :]
        test_predict_prob[batch_i:, :] = classifier.predict_proba(cur_batch.values)
test_predict_map = percent2mapk(test_predict_prob, 5)
test_predict_str = list2str(test_predict_map, ' ')
"""
Submitting
"""
submission = pd.read_csv('input/sample_submission.csv', index_col=0)
submission['hotel_cluster'] = test_predict_str
submission.to_csv('rf_sub_withdates_merged.csv')
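# The resulting rf_sub_withdates_merged.csv keeps sample_submission's index and stores,
# per test row, the five predicted hotel clusters as a space-separated string in the
# hotel_cluster column.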