-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTitanic_HGB_1.0.py
128 lines (78 loc) · 4.04 KB
/
Titanic_HGB_1.0.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# 1. Read, Skim and Pre-process data
# 1.0 Initial Codes given from Kaggle
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
# for filename in filenames:
# print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.inspection import permutation_importance
# 1.1 Read and skim the training data
df = pd.read_csv('Data/Train.csv')
print(df.head())
df.info()
df.describe()
# 1.2 Find where to pre-process
print(df["Embarked"].unique())       # ['S' 'C' 'Q' nan]
print(df["Embarked"].value_counts()) # mode : 'S' (644/891)
# Remove : 1 PassengerId, 3 Name, 8 Ticket (useless) / 10 Cabin (too many NaN)
# Replace : 4 Sex(categorical) 5 Age(fill NaN) 11 Embarked(some NaN, categorical)
# 1.3 Pre-processing : remove or replace NaN
# Drop columns with no predictive signal (ID, free text) or mostly-NaN (Cabin).
df.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1, inplace=True)
# FIX: chained `df["col"].fillna(..., inplace=True)` is deprecated in pandas 2.x
# and silently modifies a temporary under copy-on-write (pandas 3.0 default);
# assign the result back to the column instead.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Embarked"] = df["Embarked"].fillna("S")  # "S" is the mode (644/891)
# One-hot encode the remaining categorical columns.
df = pd.get_dummies(df, columns=["Embarked", "Sex"])
print(df.head())
df.info()
df.describe()
# 2. HGB (histogram-based gradient boosting)
# 2.1 Split input and target data
# Column 0 is "Survived" (target); every remaining column is a feature.
data = df.iloc[:, 1:].to_numpy()
target = df.iloc[:, 0].to_numpy()
print(len(data))   # 891
print(len(target)) # 891
print(data[:5, ])
print(target[:5])
# 2.2 Train and evaluate the classifier
train_input, valid_input, train_target, valid_target = train_test_split(
    data, target, test_size=0.2, random_state=604)
hgb = HistGradientBoostingClassifier(random_state=604)
# Cross-validate on the training split; return_train_score reveals over-fitting.
scores = cross_validate(hgb, train_input, train_target,
                        return_train_score=True, n_jobs=-1)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))
hgb.fit(train_input, train_target)
# Permutation importance on the training set.
result = permutation_importance(hgb, train_input, train_target,
                                n_repeats=100, random_state=604, n_jobs=-1)
print(result.importances_mean)
# Permutation importance on the validation set.
# FIX: was random_state=42 — inconsistent with the seed 604 used everywhere
# else in this script; unified for reproducibility.
result = permutation_importance(hgb, valid_input, valid_target,
                                n_repeats=100, random_state=604, n_jobs=-1)
print(result.importances_mean)
print(hgb.score(valid_input, valid_target))
# 3. Submit
# 3.1 Read and pre-process the test data with the same steps as training
test = pd.read_csv('Data/Test.csv')
# "PassengerId" must remain: the submission file needs it.
test.drop(["Name", "Ticket", "Cabin"], axis=1, inplace=True)
# FIX: replace deprecated chained `fillna(..., inplace=True)` (no-op under
# pandas copy-on-write) with assignment back to the column.
# NOTE(review): this imputes with the *test* set's own means; strictly the
# training-set statistics should be reused — confirm before changing scores.
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())  # test set has a missing Fare
test["Embarked"] = test["Embarked"].fillna("S")
# NOTE(review): get_dummies on the test frame alone assumes every category
# (S/C/Q, male/female) appears in the test data; otherwise the resulting
# columns would misalign with what the model was trained on.
test = pd.get_dummies(test, columns=["Embarked", "Sex"])
print(test.head())
test.info()
test_input = test.iloc[:, 1:].to_numpy()  # drop PassengerId (column 0)
# 3.2 Generate the submission file
test_id = test["PassengerId"]
test_output = hgb.predict(test_input)
submission = pd.DataFrame({"PassengerId": test_id, "Survived": test_output})
submission.to_csv("Submission/Submission_HGB_01.csv", index=False)
submission.head()