# coding: utf-8
# In[161]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing
# read files into pandas dataframes
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
event_type = pd.read_csv('event_type.csv')
log_feature = pd.read_csv('log_feature.csv')
resource_type = pd.read_csv('resource_type.csv')
severity_type = pd.read_csv('severity_type.csv')
# create categorical variables out of locations
train_crosstab = pd.merge(train, pd.crosstab(train.id, train.location), how='left', left_on='id', right_index=True)
train_crosstab = train_crosstab.drop(['location'], axis=1).drop_duplicates()
# create categorical variables out of resource types
resource_type_crosstab = pd.merge(resource_type, pd.crosstab(resource_type.id, resource_type.resource_type), how='left', left_on='id', right_index=True)
resource_type_crosstab = resource_type_crosstab.drop(['resource_type'], axis=1).drop_duplicates()
# create categorical variables out of severity types
severity_type_crosstab = pd.merge(severity_type, pd.crosstab(severity_type.id, severity_type.severity_type), how='left', left_on='id', right_index=True)
severity_type_crosstab = severity_type_crosstab.drop(['severity_type'], axis=1).drop_duplicates()
# create categorical variables out of event types
event_type_crosstab = pd.merge(event_type, pd.crosstab(event_type.id, event_type.event_type), how='left', left_on='id', right_index=True)
event_type_crosstab = event_type_crosstab.drop(['event_type'], axis=1).drop_duplicates()
# pivot the log feature data to create a variable for each log feature; log feature volume is the scalar value
log_feature_pivot = pd.merge(log_feature, log_feature.pivot(index='id', columns='log_feature', values='volume'), how='left', left_on='id', right_index=True)
log_feature_pivot = log_feature_pivot.drop(['log_feature','volume'], axis=1).drop_duplicates()
# merge dataframes into one
merge1 = pd.merge(train_crosstab, log_feature_pivot, on='id', how='left')
merge2 = pd.merge(merge1, event_type_crosstab, on='id', how='left')
merge3 = pd.merge(merge2, resource_type_crosstab, on='id', how='left')
merge4 = pd.merge(merge3, severity_type_crosstab, on='id', how='left')
# prepare the test file in the same format, with the locations as categorical variables
test_crosstab = pd.merge(test, pd.crosstab(test.id, test.location), how='left', left_on='id', right_index=True)
test_crosstab = test_crosstab.drop(['location'], axis=1).drop_duplicates()
# merge the test dataframes into one
merge5 = pd.merge(test_crosstab, log_feature_pivot, on='id', how='left')
merge6 = pd.merge(merge5, event_type_crosstab, on='id', how='left')
merge7 = pd.merge(merge6, resource_type_crosstab, on='id', how='left')
merge8 = pd.merge(merge7, severity_type_crosstab, on='id', how='left')
# concatenate the merged train and test dataframes to standardize columns across datasets
concatenated_data = pd.concat([merge4, merge8], axis=0)
# separate the train and test data back
train_data = concatenated_data[pd.notnull(concatenated_data['fault_severity'])]
test_data = concatenated_data[pd.isnull(concatenated_data['fault_severity'])]
# replace NaNs with zeros
train_data = train_data.fillna(value=0)
test_data = test_data.fillna(value=0)
# split the train dataset by columns into training columns and the target column
cols = [col for col in train_data.columns if col not in ['id', 'fault_severity']]
train_columns = train_data[cols]
target_columns = train_data['fault_severity']
train_columns.to_csv('train_columns.csv')
# create Gradient Boosting Classifier object
gbc = GradientBoostingClassifier(n_estimators=200, max_depth=7)
# further split train dataset into train and test for cross-validation
X_train, X_test, Y_train, Y_test = train_test_split(train_columns, target_columns, test_size=0.2, random_state=3)
# train the model
gbc.fit(X_train, Y_train)
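# Optional (not in the original script): inspect which of the engineered
# columns the fitted model leans on most. feature_importances_ is a standard
# attribute of a fitted GradientBoostingClassifier.
feature_importances = Series(gbc.feature_importances_, index=train_columns.columns)
print(feature_importances.sort_values(ascending=False).head(20))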
# get predictions from the model
Y_pred = gbc.predict(X_test)
# show model accuracy
print(metrics.accuracy_score(Y_test,Y_pred))
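# The Telstra competition is scored on multi-class log loss rather than accuracy,
# so it is also worth checking predicted probabilities on the held-out split.
# This is an optional sketch (not in the original script) reusing the
# X_test/Y_test split from above.
Y_pred_proba = gbc.predict_proba(X_test)
print(metrics.log_loss(Y_test, Y_pred_proba, labels=gbc.classes_))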
# In[162]:
# take only the attribute columns from the test dataset
cols = [col for col in test_data.columns if col not in ['id', 'fault_severity']]
test_columns = test_data[cols]
# get predictions on the test dataset
Y_pred_test = gbc.predict(test_columns)
# add the predictions to the original file and select only id and Predictions columns
predictions = pd.DataFrame(Y_pred_test, columns=['Predictions'])
test_w_predictions = test_data.join(predictions)[['id', 'Predictions']]
# format predictions according to the requested submission format
test_w_predictions = test_w_predictions.pivot(index='id', columns='Predictions', values='Predictions')
test_w_predictions = test_w_predictions.notnull().astype(int)
test_w_predictions = test_w_predictions.rename(columns={0:'predict_0',1:'predict_1',2:'predict_2'})
test_w_predictions.to_csv('predictions.csv')
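# Note: the file above contains hard 0/1 indicators derived from predict().
# Competitions scored with multi-class log loss usually expect class
# probabilities instead; the sketch below (not part of the original script)
# builds such a file from predict_proba, naming the columns after gbc.classes_.
proba = gbc.predict_proba(test_columns)
proba_columns = ['predict_%d' % int(c) for c in gbc.classes_]
submission = pd.DataFrame(proba, columns=proba_columns)
submission.insert(0, 'id', test_data['id'].values)
submission.to_csv('predictions_proba.csv', index=False)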