-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_encoding.py
56 lines (35 loc) · 1.46 KB
/
data_encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
from sklearn.preprocessing import LabelEncoder
import pandas as pd
def l_encod(data: pd.DataFrame, column) -> pd.DataFrame:
    """Label-encode one column of *data* in place, keeping missing values missing.

    Only the non-missing entries are cast to ``str`` and passed to a fresh
    ``LabelEncoder``. Fitting on the present values alone keeps the NaN
    placeholder from becoming a class of its own, which would shift the code
    of every real label that sorts after it.

    Args:
        data: Frame holding the column; it is modified in place.
        column: Label of the column to encode.

    Returns:
        The same ``data`` object. The column holds integer codes when nothing
        was missing; otherwise it is ``float64`` with NaN in the rows that
        were missing on input.
    """
    missing = data[column].isna().to_numpy()
    present = ~missing

    if not present.any():
        # Nothing to fit on (empty or entirely missing column): the result is
        # simply an all-NaN numeric column.
        data[column] = pd.Series(
            float("nan"), index=data.index, dtype="float64"
        ).to_numpy()
        return data

    label_encoder = LabelEncoder()
    codes = label_encoder.fit_transform(data.loc[present, column].astype(str))

    if not missing.any():
        # No gaps to preserve, so the integer codes can be stored as they are.
        data[column] = codes
        return data

    # Integer columns cannot hold NaN, so build the float column explicitly
    # instead of relying on an implicit upcast when writing None into ints.
    encoded = pd.Series(float("nan"), index=data.index, dtype="float64")
    encoded.loc[present] = codes
    # Assign positionally so a non-unique index cannot trigger realignment.
    data[column] = encoded.to_numpy()
    return data
def to_bool(data: pd.DataFrame, column) -> pd.DataFrame:
    """Convert one column of *data* to 0/1 in place, keeping missing values missing.

    Present values are converted by truthiness (``astype(bool)``) and then to
    integers. Missing entries are left as NaN rather than being converted,
    because ``astype(bool)`` would turn NaN/None into ``True`` and silently
    record an unknown flag as 1.

    Args:
        data: Frame holding the column; it is modified in place.
        column: Label of the column to convert.

    Returns:
        The same ``data`` object. The column holds integers when nothing was
        missing; otherwise it is ``float64`` with NaN in the rows that were
        missing on input.
    """
    values = data[column]
    present = values.notna().to_numpy()

    if present.all():
        # No gaps: plain truthiness conversion, integer result.
        data[column] = values.astype(bool).astype(int)
        return data

    # Integer columns cannot hold NaN, so the result is a float column whose
    # missing rows stay NaN.
    encoded = pd.Series(float("nan"), index=data.index, dtype="float64")
    if present.any():
        encoded.loc[present] = values[present].astype(bool).astype(int).to_numpy()
    # Assign positionally so a non-unique index cannot trigger realignment.
    data[column] = encoded.to_numpy()
    return data
def main():
    """Read the preprocessed CSV, encode its columns, and write the result.

    Reads ``data/Preprocessed.csv``, drops the columns listed below, passes
    each categorical column through ``l_encod`` and each flag column through
    ``to_bool``, then writes ``data/EncodedAllData.csv`` without the index.
    Finally prints the per-column missing-value counts and the first 7 rows.
    """
    # Columns removed before encoding
    columns_to_drop = [
        "PassengerId",
        "Cabin",
        "Name",
        "FirstName",
        "Surname",
        "GroupId",
        "PassengerNum",
    ]
    # Columns handed to l_encod (LabelEncoder)
    categorical_columns = [
        "HomePlanet",
        "Destination",
        "deck",
        "side",
        "AgeClass",
        "Movement",
    ]
    # Columns handed to to_bool
    flag_columns = ["CryoSleep", "VIP", "Transported"]

    frame = pd.read_csv("data/Preprocessed.csv").drop(columns=columns_to_drop)

    for name in categorical_columns:
        frame = l_encod(frame, name)

    for name in flag_columns:
        frame = to_bool(frame, name)

    frame.to_csv("data/EncodedAllData.csv", index=False)

    # Quick summary of what was written
    missing_per_column = frame.isna().sum()
    print(missing_per_column)
    print(frame.head(7))
# Run the encoding pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()