This repository has been archived by the owner on May 17, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
model_training.py
152 lines (120 loc) · 5.8 KB
/
model_training.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import datetime
import functools
import hashlib
import pickle
from collections import Counter
from collections import defaultdict

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# Load the raw job-postings dataset (path is relative to the working directory).
df = pd.read_csv('job_salary_data/job_salary_data.csv')
# NOTE(review): the expression statements below are notebook-style exploration;
# their results are discarded when this module runs as a script.
df.shape
df.head()
# Missing data
df.count()
Counter(df.ContractType)
Counter(df.ContractTime)
df.SalaryNormalized.median()
feature_registry = {}  # name -> type-checked feature function; populated by @feature_fn


def feature_fn(f):
    """
    Decorator for feature functions.

    Registers the wrapped function in ``feature_registry`` under the original
    function's name, and adds basic type checking of the argument and the
    return value.

    Args:
        f: function mapping a raw-data DataFrame to a DataFrame or Series of
           feature values (indexed like its input).

    Returns:
        The wrapped (type-checked) function.
    """
    @functools.wraps(f)  # preserve __name__/__doc__ on the wrapper
    def decorator(raw_data):
        # isinstance (rather than `type(...) ==`) also accepts DataFrame
        # subclasses, which behave identically for our purposes.
        assert isinstance(raw_data, pd.DataFrame), "Bad argument type"
        res = f(raw_data)
        assert isinstance(res, (pd.DataFrame, pd.Series)), "Bad return value type"
        return res
    feature_registry[f.__name__] = decorator
    return decorator
def design_matrix(raw_data, feature_registry_names):
    """Build the design matrix by concatenating the named features column-wise.

    Every name in ``feature_registry_names`` must be unique and must appear
    in ``feature_registry``.
    """
    unique_names = set(feature_registry_names)
    assert len(unique_names) == len(feature_registry_names), \
        "Duplicate feature names detected"
    unknown = unique_names - set(feature_registry.keys())
    assert not unknown, "Unknown feature fn name %s" % unknown
    feature_columns = [feature_registry[name](raw_data)
                       for name in feature_registry_names]
    return pd.concat(feature_columns, axis=1)
def labels(raw_data):
    """Binary target: 1 when the normalized salary exceeds 30000, else 0."""
    is_high_salary = raw_data['SalaryNormalized'] > 30000
    return is_high_salary.astype(int)
# 70% of the rows for training, 30% for testing (rows are taken in file order,
# which assumes the CSV is not sorted by the target).
# BUG FIX: the split index was hard-coded as int(12200 * 0.7); derive it from
# the actual row count so the split stays correct if the dataset changes size.
train_test_split_index = int(len(df) * 0.7)  # 70% of data for training, 30% of data for testing
train_data = df.iloc[:train_test_split_index]
test_data = df.iloc[train_test_split_index:]

# Median salary per job category, computed on training data only (avoids
# leaking test-set statistics). Categories unseen in training fall back to
# the defaultdict value of 30000.
category_median_salary_dict = defaultdict(lambda: 30000)
category_median_salary_dict.update(
    train_data[['Category', 'SalaryNormalized']]
    .groupby('Category')
    .median()
    .to_dict()['SalaryNormalized'])
@feature_fn
def category_median_salary(raw_data):
    """Feature: training-set median salary for each row's job Category.

    Categories missing from the lookup get the defaultdict fallback (30000).
    """
    # Row-wise iterrows() is very slow; Series.map does one lookup per value
    # while preserving the index and the defaultdict fallback semantics.
    medians = raw_data['Category'].map(lambda c: category_median_salary_dict[c])
    return medians.to_frame('category_median_salary')
# String forms of the ContractType values ('nan' covers missing entries).
CONTRACT_TYPES = ['nan', 'part_time', 'full_time']


@feature_fn
def contract_type_one_hot(df):
    """One-hot encode ContractType into contract_type_* indicator columns."""
    as_text = df.ContractType.astype(str)
    indicator_columns = [
        (as_text == contract_type).astype(int).rename('contract_type_%s' % contract_type)
        for contract_type in CONTRACT_TYPES
    ]
    return pd.concat(indicator_columns, axis=1)
# String forms of the ContractTime values ('nan' covers missing entries).
CONTRACT_TIMES = ['nan', 'permanent', 'contract']


@feature_fn
def contract_time_one_hot(df):
    """One-hot encode ContractTime into contract_time_* indicator columns."""
    result = pd.DataFrame(index=df.index)
    as_text = df.ContractTime.astype(str)
    # (the original loop variable was misleadingly named contract_type)
    for contract_time in CONTRACT_TIMES:
        result['contract_time_%s' % contract_time] = (as_text == contract_time).astype(int)
    return result
# Word frequencies in job titles, split by salary class (training data only).
low_salary_counter = Counter(' '.join(train_data[train_data.SalaryNormalized <= 30000].Title).lower().split())
high_salary_counter = Counter(' '.join(train_data[train_data.SalaryNormalized > 30000].Title).lower().split())
# FIX: dict.iteritems() is Python-2-only; .items() is equivalent here and also
# works on Python 3. Keep only words seen more than 10 times to cut noise.
low_salary_counter_frequent = Counter({k: v for k, v in low_salary_counter.items() if v > 10})
high_salary_counter_frequent = Counter({k: v for k, v in high_salary_counter.items() if v > 10})
title_words = set(low_salary_counter_frequent.keys()).union(set(high_salary_counter_frequent.keys()))

# Relative frequency of each title word in high- vs low-salary postings.
# PERF: the class sizes are loop-invariant — compute them once instead of
# re-scanning train_data for every word.
num_high_salary_rows = float(sum(train_data.SalaryNormalized > 30000))
num_low_salary_rows = float(sum(train_data.SalaryNormalized <= 30000))
rel_freq = {}
for word in title_words:
    # 0.01 smoothing avoids division by zero for words absent from low-salary titles.
    rel_freq[word] = ((high_salary_counter[word] / num_high_salary_rows) /
                      ((low_salary_counter[word] + 0.01) / num_low_salary_rows))

# Words relatively more common in high-salary titles (ratio > 1) vs the rest.
HIGH_SALARY_WORDS = {key for key, value in rel_freq.items() if value > 1}
LOW_SALARY_WORDS = {key for key, value in rel_freq.items() if value <= 1}
def row_num_high_salary_words(row):
    """Count of words in this row's Title that are in HIGH_SALARY_WORDS."""
    return len([word for word in row['Title'].lower().split() if word in HIGH_SALARY_WORDS])


@feature_fn
def num_high_salary_words(df):
    """Feature: per-row count of high-salary title words."""
    # FIX: pd.DataFrame(data=<unnamed Series>, columns=[name]) treats `columns`
    # as a selection and yields an all-NaN column on recent pandas versions;
    # to_frame() names the single column explicitly and is version-safe.
    return df.apply(row_num_high_salary_words, axis=1).to_frame('num_high_salary_words')
def row_num_low_salary_words(row):
    """Count of words in this row's Title that are in LOW_SALARY_WORDS."""
    return len([word for word in row['Title'].lower().split() if word in LOW_SALARY_WORDS])


@feature_fn
def num_low_salary_words(df):
    """Feature: per-row count of low-salary title words."""
    # FIX: pd.DataFrame(data=<unnamed Series>, columns=[name]) treats `columns`
    # as a selection and yields an all-NaN column on recent pandas versions;
    # to_frame() names the single column explicitly and is version-safe.
    return df.apply(row_num_low_salary_words, axis=1).to_frame('num_low_salary_words')
def model_description(model, model_config, raw_data):
    """Summarize a fitted model for audit/logging purposes.

    Args:
        model: a fitted LogisticRegression or RandomForestClassifier.
        model_config: the config dict the model was trained with.
        raw_data: the DataFrame the model was fitted on; its columns are
            paired with the model's coefficients / feature importances.

    Returns:
        dict with keys 'model_config', 'data_sha' (fingerprint of the training
        data summary) and 'description' (per-column weight pairs as a string).

    Raises:
        RuntimeError: if the model type is not supported.
    """
    res = {}
    res['model_config'] = model_config
    # BUG FIX: the original hashed str(raw_data.describe) — the repr of the
    # *bound method*, not the summary statistics. Call describe() and encode
    # to bytes as hashlib requires.
    res['data_sha'] = hashlib.sha256(str(raw_data.describe()).encode('utf-8')).hexdigest()
    # isinstance is preferred over `type(...) ==` and tolerates subclasses.
    if isinstance(model, LogisticRegression):
        # list(zip(...)) so the string is the pairs themselves, not a lazy
        # iterator repr on Python 3.
        res['description'] = str(list(zip(raw_data.columns, list(model.coef_[0]))))
    elif isinstance(model, RandomForestClassifier):
        res['description'] = str(list(zip(raw_data.columns, list(model.feature_importances_))))
    else:
        # Python-2-only `raise E, msg` replaced with the portable call form.
        raise RuntimeError("Unknown model type")
    return res
# Registry mapping config 'model_type' strings to estimator classes.
MODEL_SPEC = {'logistic_regression': LogisticRegression,
              'random_forest': RandomForestClassifier}


def train_model(raw_data, model_config):
    """Train the configured classifier and persist it plus a description.

    Side effects: writes model_descriptions/<timestamp>.txt (requires the
    directory to exist) and model.pkl in the working directory.

    Args:
        raw_data: DataFrame of raw job postings.
        model_config: dict with 'model_type' (a MODEL_SPEC key) and
            'feature_fns' (names registered in feature_registry).
    """
    X = design_matrix(raw_data, model_config['feature_fns'])
    model = MODEL_SPEC[model_config['model_type']]()
    model.fit(X, labels(raw_data))
    # BUG FIX: describe the design matrix the model was actually fitted on —
    # its columns align with model.coef_/feature_importances_, whereas the raw
    # CSV columns do not.
    model.description = model_description(model, model_config, X)
    # Timestamp with filesystem-safe separators.
    now_str = str(datetime.datetime.now()).replace(' ', '_').replace('.', '_').replace(':', '_')
    with open('model_descriptions/%s.txt' % now_str, 'w') as f:
        f.write(str(model.description))
    # BUG FIX: pickle output is binary — the file must be opened in 'wb'
    # (text-mode 'w' corrupts the stream / raises TypeError on Python 3).
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)
if __name__ == '__main__':
    # Train a random forest on the full set of registered features.
    feature_fns = [
        'category_median_salary',
        'contract_type_one_hot',
        'contract_time_one_hot',
        'num_low_salary_words',
        'num_high_salary_words',
    ]
    model_config = {'model_type': 'random_forest', 'feature_fns': feature_fns}
    train_model(df, model_config)