-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataWrangler.py
92 lines (78 loc) · 3.78 KB
/
DataWrangler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import numpy as np
class DataWrangler:
def wrangle(self, dataframe):
# Let's engineer two new features, FamilySize and if the passenger was alone or not (could influence how easy it was
# to find a place on a boat for everyone)
dataframe["FamilySize"] = dataframe["SibSp"] + dataframe["Parch"]
# IsAlone is a bit more complicated since a child could be accompanied by a nanny or an aunt but still have a
# FamilySize of 0, we'll assume that everyone under the age of 18 was accompanied
dataframe["IsAlone"] = dataframe["FamilySize"].apply(lambda fam_size: 0 if fam_size > 0 else 1)
is_old_enough = dataframe["Age"].apply(lambda age: 0 if age < 18 else 1)
dataframe["IsAlone"] = dataframe["IsAlone"] * is_old_enough
# Taken from https://medium.com/i-like-big-data-and-i-cannot-lie/how-i-scored-in-the-top-9-of-kaggles-titanic-machine-learning-challenge-243b5f45c8e9
# but I don't really like it. Too many assumptions. Same with normalized titles, what if a title is not in our map...
dataframe["Title"] = dataframe["Name"].apply(
lambda name: name.split(',')[1].split('.')[0].strip())
normalized_titles = {
"Capt": "Officer",
"Col": "Officer",
"Major": "Officer",
"Jonkheer": "Royalty",
"Don": "Royalty",
"Sir": "Royalty",
"Dr": "Officer",
"Rev": "Officer",
"the Countess": "Royalty",
"Dona": "Royalty",
"Mme": "Mrs",
"Mlle": "Miss",
"Ms": "Mrs",
"Mr": "Mr",
"Mrs": "Mrs",
"Miss": "Miss",
"Master": "Master",
"Lady": "Royalty"
}
dataframe["Title"] = dataframe["Title"].map(normalized_titles)
grouped = dataframe.groupby(["Sex", "Pclass", "Title"])
dataframe["Age"] = grouped["Age"].apply(lambda x: x.fillna(x.median()))
# Also, let's transform them into age categories. It makes more sense that way
# ToDo : After careful data analysis try to find best age categories
dataframe["Age"] = dataframe["Age"].astype(np.int64)
dataframe["Age"] = dataframe["Age"].apply(
lambda age:
1 if age < 5 else
2 if 5 <= age < 10 else
3 if 10 <= age < 18 else
4 if 18 <= age < 25 else
5 if 25 <= age < 35 else
6 if 35 <= age < 45 else
7
)
dataframe["Fare"].fillna(dataframe["Fare"].dropna().median(), inplace=True)
dataframe["Fare"] = dataframe["Fare"].apply(
lambda fare:
1 if fare <= 8 else
2 if 8 < fare <= 14 else
3 if 14 < fare <= 31 else
4
)
dataframe["Fare"] = dataframe["Fare"].astype(np.int64)
# Some Embarked entries are missing, let's try and replace them with where most of the people embarked from
# ToDo : Would it be possible to do something similar to Age?
dataframe["Embarked"].fillna(
dataframe["Embarked"].value_counts().index[0],
inplace=True
)
# The cabin (first letter) can be indicative of where someone was situated, close to a life boat maybe?
# We fill with U for unknown
dataframe["Cabin"].fillna("U", inplace=True)
dataframe["Cabin"] = dataframe["Cabin"].apply(lambda cabin: cabin[0])
# Transformation for the benefit of Tensorflow
dataframe["Pclass"].apply(np.int64)
dataframe = dataframe.drop("SibSp", axis=1)
dataframe = dataframe.drop("Parch", axis=1)
dataframe = dataframe.drop("Name", axis=1)
dataframe = dataframe.drop("Ticket", axis=1)
dataframe = dataframe.drop("PassengerId", axis=1)
return dataframe