import pickle
import numpy as np
import pandas as pd
from scipy.stats import spearmanr, pearsonr, zscore
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Read the data from the CSV file.
dataframe = pd.read_csv("breastcancerdata.csv")
# Explore the data: column types, non-null counts, and memory usage.
dataframe.info()
# Clean the data: drop the empty "Unnamed: 32" column and the "id" column,
# then encode the diagnosis label (M = malignant -> 1, B = benign -> 0).
data_modified = dataframe.drop(labels=["Unnamed: 32", "id"], axis=1)
data_modified['diagnosis'] = data_modified['diagnosis'].map({'M': 1, 'B': 0})
# Remove outliers: keep only the rows where every column's z-score is within
# 3 standard deviations of its mean.
z_scores = zscore(data_modified)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
data_modified = data_modified[filtered_entries]
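# Illustrative check (not in the original script): report how many rows the
# z-score filter kept, to make the effect of outlier removal visible.
print("Rows kept after outlier removal: {} of {}".format(len(data_modified), len(dataframe)))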
# Split the data into features and outcome.
x = data_modified.drop("diagnosis", axis=1)
y = data_modified["diagnosis"]
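# Illustrative check (added): class balance of the outcome, malignant (1)
# vs. benign (0), to confirm the dataset is not heavily skewed.
print(y.value_counts())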
# A function to select the best features by checking their correlations.
def manualFeatureSelection(features, outcome, r=0.76, pro=0.01, corrType='p'):
    # First filter:
    # keep only the features that correlate strongly (|rho| > r) and
    # significantly (p < pro) with the outcome. corrType 's' uses Spearman,
    # anything else falls back to Pearson.
    x1 = []
    for i in features:
        if corrType == 's':
            rho, p = spearmanr(features[i], outcome)
        else:
            rho, p = pearsonr(features[i], outcome)
        if (abs(rho) > r) and (p < pro):
            x1.append(i)
    x2 = features[x1]
    # -----------------------------------------------------------------------------
    # Second filter:
    # check the correlation between the remaining features and each other, and
    # collect the redundant ones (features highly correlated with an earlier one).
    corrColumns = set()  # Set of all the names of redundant columns.
    if corrType == 's':
        corr_matrix = x2.corr(method='spearman')
    else:
        corr_matrix = x2.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > r:  # We compare absolute coefficients.
                corrColumns.add(corr_matrix.columns[i])  # Name of the column to drop.
    # Keep the outcome-correlated features, minus the redundant ones.
    return [col for col in x1 if col not in corrColumns]
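# Example call (illustrative, not part of the original experiments): run the
# selector with its defaults on our feature matrix and print what it keeps.
print("Features kept by manualFeatureSelection:", manualFeatureSelection(x, y))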
# The best manually selected features after many experiments were:
# ['perimeter_worst', 'area_worst', 'radius_worst', 'concave points_worst']
x_featuresManual = ['perimeter_worst', 'area_worst', 'radius_worst', 'concave points_worst']
# Print the mean of each manually selected feature.
print(x[x_featuresManual].mean())
# Split the data into 70% training data and 30% test data.
# random_state is an added assumption to make the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
# The best model found after many experiments was a random forest trained on
# the manually selected features, with its hyperparameters tuned via
# GridSearchCV to avoid overfitting.
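# Sketch of the tuning step referenced above. The exact grid from the original
# experiments is not shown in this file, so the grid below is an assumption
# chosen to include the final values; it is left commented out because a full
# search is slow.
# from sklearn.model_selection import GridSearchCV
# param_grid = {"n_estimators": [100, 500],
#               "max_depth": [2, 3, 5],
#               "max_features": [2, 3, 4],
#               "min_samples_split": [10, 25, 50]}
# search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
# search.fit(x_train[x_featuresManual], y_train)
# print(search.best_params_)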
breastCancerModel = RandomForestClassifier(max_depth=3,
                                           max_features=2,
                                           min_samples_split=25,
                                           n_estimators=500)
breastCancerModel.fit(x_train[x_featuresManual], y_train)
# Accuracy on the held-out test set.
print(accuracy_score(y_test, breastCancerModel.predict(x_test[x_featuresManual])))
# Cross-validate on the same manually selected features the model is fit on.
score = cross_val_score(breastCancerModel, x[x_featuresManual], y, cv=KFold(n_splits=5))
print("Cross validation scores are {}".format(score))
print("Average cross validation score: {}".format(score.mean()))
# Save the trained model to a pickle file.
with open("model.pkl", "wb") as f:
    pickle.dump(breastCancerModel, f)
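# Illustrative round-trip check (added): reload the pickled model and predict
# on a few test rows to confirm the saved artifact works.
with open("model.pkl", "rb") as f:
    reloaded_model = pickle.load(f)
print(reloaded_model.predict(x_test[x_featuresManual].head()))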