-
Notifications
You must be signed in to change notification settings - Fork 0
/
ResultsData.py
127 lines (99 loc) · 5.13 KB
/
ResultsData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from LogRegModel import LogRegModel
'''This class builds the dataframe and then contains helper functions which take data from the dataframe
and passes the data to the LogRegModel class'''
class ResultsData(object):
testResultsHeader = ["TestID", "VehicleID", "TestDate", "VehicleClass", "TestType", "TestResult", "Mileage",
"PostcodeArea", "Make", "Model", "Colour", "FuelType", "EngineCapacity", "DateOfManufacture"]
def __init__(self, sourcePath):
self.sourcePath = sourcePath
self.df, self.targets = self.build_dfs()
self.model = None
'''This function takes the file path, builds a dataframe and modifies
the dataframe to enable further analysis'''
def build_dfs(self):
df = pd.read_table(self.sourcePath,
sep="|", header=None, names=ResultsData.testResultsHeader,
dtype={'TestID':np.uint32,
'VehicleID':np.uint32,
'Mileage':np.int32,
'VehicleClass':'category',
'EngineCapacity':np.uint16,
'FuelType':'category',
'TestType':'category',
'TestResult':'category',
'PostcodeArea':'category',
'Make':'category',
'Model':'category',
'Colour':'category'},
parse_dates=[2, 13])
#print(df.info())
df = df[df.Mileage < 250000]
df['TestResult'] = pd.Categorical(df['TestResult'], categories=["P", "F", "PRS", "ABA", "ABR", "R", 2, 1, 0])
df['TestResult'] = df.TestResult.replace(to_replace=["P", "F", "PRS", "ABA", "ABR", "R"],
value=[0, 1, 0, 1, 1, 1])
df['TestResult'] = df['TestResult'].astype(np.uint8)
df['VehicleAge'] = df['TestDate'] - df['DateOfManufacture']
df['VehicleAge'] = df['VehicleAge'].dt.days.astype(np.uint32)
df['MakeModel'] = df.Make.astype(str).str.cat(df.Model.astype(str), sep=' ')
df = df[df.VehicleAge < 12000]
targets = df.as_matrix(['TestResult']).ravel()
return df, targets
'''generate_model is called from the main function and builds a logistic
regression model using the data in the dataframe, based on the features passed as a parameter'''
def generate_model(self, features):
codedFeatures = features[:]
catIndices = []
i = 0
for feature in features:
if str(self.df[feature].dtype) == "category":
codedFeatures[i] = self.add_coded_column(features[i])
catIndices.append(i)
i += 1
featureMatrix = self.df.as_matrix(codedFeatures)
self.model = LogRegModel(features, featureMatrix, self.targets, catIndices)
'''The following three functions can be called after a model has been generated.
They can be used to plot the model function, or to make a single prediction based on the model.'''
def plot_continuous_only(self):
self.model.continuous_only_plots()
def plot_with_categorical(self, catsToPlot):
codeMap = self.get_code_map(self.model.features[-1])
self.model.plot_for_categories(catsToPlot, codeMap)
def calculate_one_probability(self, predictor, resultType):
codedPredictor = predictor[:]
i = 0
for feature in self.model.features:
if str(self.df[feature].dtype) == "category":
codedPredictor[i] = self.get_code(feature, predictor[i])
i += 1
result = self.model.getSingleProbability(codedPredictor, resultType)
for i in range(len(self.model.features)):
print("%s: %s" % (self.model.features[i], predictor[i]), end="\t")
print("\nPass rate = %.2f%%" % (result * 100))
'''The functions below are used by the functions above to help
keep track of categorical feature names and codes'''
def add_coded_column(self, category):
codedColumnName = "%sCode" % category
self.df[codedColumnName] = self.df[category].cat.codes
return codedColumnName
def get_code(self, category, value):
codeMap = self.get_code_map(category)
return codeMap.get(value)
def get_code_map(self, category):
codeMap = dict()
makeGroups = self.df.groupby([category, ("%sCode" % category)])
for group, result in makeGroups:
codeMap[group[0]] = group[1]
return codeMap
'''The function below was used to help visualise
the data but is not referenced in the report'''
def simple_scatter(self, feature, maximum):
self.df = self.df[self.df[feature] < maximum]
targets = self.df.as_matrix(['TestResult']).ravel()
featureArray = self.df.as_matrix([feature])
plt.scatter(featureArray, targets.transpose(), s=5, marker='x')
plt.xlabel(feature)
plt.ylabel("0 = Pass, 1 = Fail")
plt.show()