-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscoring.py
127 lines (83 loc) · 3.78 KB
/
scoring.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from load_attributes import Attributes
from feature_engineering import FeatureEngineering
import pickle
import scipy as sci
import pandas as pd
class score_users(Attributes):
    """
    Score users by the distance of their feature vector to a cluster centre.

    The class extends ``Attributes`` (from ``load_attributes``) with a
    pickled preprocessing pipeline and a pickled clustering model. The
    clusters represent our verticals and campaign types.

    NOTE(review): the methods below rely on ``self.data``, ``self.num`` and
    ``self.cat`` plus ``read_yaml_file()`` / ``read_csv()``; these are
    presumably provided by ``Attributes`` -- confirm against that class.
    """

    def __init__(self, path, preprocessor=None, model_pickle=None):
        """
        Args:
            path: forwarded unchanged to ``Attributes.__init__``.
            preprocessor: optional pipeline object; replaced whenever
                ``load_processor`` runs (``score`` always calls it).
            model_pickle: optional model object; replaced whenever
                ``load_model`` runs (``score`` always calls it).
        """
        Attributes.__init__(self, path)
        # Neither attribute is declared by the Attributes initialiser.
        self.preprocessor = preprocessor
        self.model_pickle = model_pickle
        # Maps msisdn -> score; rebuilt on every call to score().
        self.result = dict()

    def print_data_shape(self):
        """
        Function to check if inheritance works properly. Reads the YAML
        file and the CSV, then prints the head and the shape of
        ``self.data``.
        """
        self.read_yaml_file()
        self.read_csv()
        print(self.data.head())
        print("The shape of the dataset is {}".format(self.data.shape))

    def load_processor(self, processor_path):
        """
        Function to load the pipeline pickle file
        Args:
            processor_path: path of the pickled pipeline object
        Returns:
            The pipeline object (also stored on ``self.preprocessor``)
        """
        # SECURITY: pickle.load can execute arbitrary code while
        # deserialising; only point this at trusted artifacts.
        with open(processor_path, 'rb') as file:
            self.preprocessor = pickle.load(file)
        return self.preprocessor

    def load_model(self, model_path):
        """
        Function to load the model pickle file
        Args:
            model_path: path of the pickled model object
        Returns:
            The model object (also stored on ``self.model_pickle``)
        """
        # SECURITY: pickle.load can execute arbitrary code while
        # deserialising; only point this at trusted artifacts.
        with open(model_path, 'rb') as file:
            self.model_pickle = pickle.load(file)
        return self.model_pickle

    def fill_na(self):
        """
        Function handles NaN values in a dataset for both categorical
        and numerical variables.

        Columns selected by ``self.num`` are filled with the column mean;
        columns selected by ``self.cat`` are filled with the most frequent
        value. A column without any non-null value is left unchanged
        because there is nothing to impute from.
        """
        for item in self.data[self.num]:
            # The mean of an all-NaN column is NaN, so such a column
            # simply stays as it is.
            self.data[item] = self.data[item].fillna(self.data[item].mean())
        for item in self.data[self.cat]:
            # value_counts() ignores NaN and sorts by frequency, so the
            # first index entry is the most frequent value.
            counts = self.data[item].value_counts()
            if counts.empty:
                # Indexing [0] here would raise IndexError.
                continue
            self.data[item] = self.data[item].fillna(counts.index[0])

    def score(self, processor_path='pipeline.pkl', model_path='kmeans.pkl'):
        """
        Function to score users based on their distance to a cluster
        centre. Starts by reading the YAML file and the CSV, loads the
        preprocessor and model artifacts, imputes missing values,
        transforms the data and computes one Euclidean distance per row.

        Args:
            processor_path: path of the pickled pipeline
                (default ``'pipeline.pkl'``).
            model_path: path of the pickled model
                (default ``'kmeans.pkl'``).
        Returns:
            DataFrame with the columns ``msisdn`` and ``score``. Because
            the scores are collected in a dict keyed by msisdn, a repeated
            msisdn keeps only the score of its last row.

        The intermediate values stay available on the instance as
        ``msisdn``, ``data_score``, ``predictions``, ``center``, ``result``
        and ``final_result``.
        """
        # Imported explicitly: a bare ``import scipy`` is not guaranteed to
        # expose the ``spatial`` submodule on older SciPy releases.
        from scipy.spatial import distance

        self.read_yaml_file()
        self.read_csv()
        self.load_processor(processor_path)
        self.load_model(model_path)

        self.msisdn = self.data['msisdn']
        self.fill_na()
        # The identifier is not a model feature.
        self.data.drop(['msisdn'], axis=1, inplace=True)

        transformed = self.preprocessor.transform(self.data)
        # Some pipelines return a sparse matrix, others a dense array;
        # densify only when the result offers toarray().
        if hasattr(transformed, 'toarray'):
            transformed = transformed.toarray()
        self.data_score = transformed

        # Stored for inspection; not part of the returned frame.
        self.predictions = self.model_pickle.predict(self.data_score)
        # NOTE(review): only the first cluster centre is used although the
        # documentation speaks of distance to "clusters" -- confirm that
        # this is intended.
        self.center = self.model_pickle.cluster_centers_[0]

        # Start from an empty mapping so that a repeated call does not
        # return entries left over from a previous run.
        self.result = dict()
        for features, msisdn in zip(self.data_score, self.msisdn):
            self.result[msisdn] = distance.euclidean(self.center, features)

        # list(...) because a dict view is not accepted as DataFrame input
        # by every pandas version.
        self.final_result = pd.DataFrame(
            list(self.result.items()), columns=['msisdn', 'score']
        )
        return self.final_result