-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_id_pair.py
40 lines (32 loc) · 1.72 KB
/
gen_id_pair.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# Load only the five columns this script uses, selected by position.
# NOTE(review): positions 0, 1, 2, 4, 5 are presumably user_id, session_id,
# timestamp, action_type and reference (the names used below) -- confirm
# against the header of data/train.csv.
train_seesion = pd.read_csv("data/train.csv", usecols=[0, 1, 2, 4, 5])
# test_seesion = pd.read_csv("data/test.csv")
output_path = './data/baseline model/'

# Keep only click-out events.
train_seesion = train_seesion[train_seesion['action_type'] == 'clickout item']

# Keep only rows whose reference is present and purely numeric
# (missing values stringify to 'nan' and are rejected by isdigit()).
has_numeric_reference = train_seesion['reference'].apply(lambda x: str(x).isdigit())
# .copy() detaches the filtered rows from the frame they were sliced from, so
# the column assignment below does not raise SettingWithCopyWarning.
train_seesion = train_seesion[has_numeric_reference].copy()
train_seesion['reference'] = train_seesion['reference'].astype(int)

# Drop items that were clicked fewer than 5 times.
item_supports = train_seesion.groupby('reference').size()
popular_items = item_supports[item_supports >= 5].index
# Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
train_seesion = train_seesion[train_seesion['reference'].isin(popular_items)]

# Keep only sessions that still have more than 2 rows. A vectorised size
# lookup selects the same rows as groupby(...).filter(lambda x: len(x) > 2)
# without calling a Python function once per session.
session_sizes = train_seesion['session_id'].map(train_seesion['session_id'].value_counts())
# Final .copy() so later in-place column edits operate on an independent frame.
train_seesion = train_seesion[session_sizes > 2].copy()
# Map ids to integers. Encoding of user_id is currently disabled:
# le = LabelEncoder()
# train_seesion['user_id'] = le.fit_transform(train_seesion['user_id'])
# Only session_id is replaced by the integer codes LabelEncoder produces.
le = LabelEncoder()
session_codes = le.fit_transform(train_seesion['session_id'])
# Overwrite session_id with its codes, then switch to the SessionId / Time /
# ItemId column names used from here on.
train_seesion = train_seesion.assign(session_id=session_codes).rename(
    columns={'session_id': 'SessionId', 'timestamp': 'Time', 'reference': 'ItemId'}
)
# Hold out the last 50,000 sessions as the validation set.
# "Last" means the order in which session ids first appear in the frame
# (Series.unique() preserves order of appearance), not timestamp order.
# NOTE(review): this is only a chronological split if data/train.csv is
# itself time-ordered -- confirm. If there are fewer than 50,000 sessions the
# slice returns all of them, leaving the training set empty.
last_50k_sess = train_seesion.SessionId.unique()[-50000:]

# Compute the membership mask once and reuse it for both halves of the split.
in_validation = train_seesion.SessionId.isin(last_50k_sess)

# drop(columns=...) returns new frames; the previous inplace drop mutated
# frames sliced from train_seesion, which raises SettingWithCopyWarning.
val = train_seesion[in_validation].drop(columns=['user_id'])
train = train_seesion[~in_validation].drop(columns=['user_id'])

# Write the session/item interaction rows (all remaining columns) as
# tab-separated text files with a header row and no index column.
train.to_csv(output_path + 'trivago_train_full.txt', sep='\t', index=False)
val.to_csv(output_path + 'trivago_val_full.txt', sep='\t', index=False)