-
Notifications
You must be signed in to change notification settings - Fork 2
/
preprocess.py
35 lines (28 loc) · 1.1 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# Load the transactions and balance the target classes: keep every
# isFraud == 1 row and draw the same number of isFraud == 0 rows.
# NOTE(review): sample() raises if there are fewer isFraud == 0 rows than
# isFraud == 1 rows — presumably never the case for this dataset; confirm.
df = pd.read_csv("data/train_transaction.csv")
fraud_rows = df[df.isFraud == 1]
# Seed the draw so repeated runs produce the same balanced dataset; without
# it the seeded train/test split further down is still not reproducible.
legit_rows = df[df.isFraud == 0].sample(n=len(fraud_rows), random_state=42)
# Rows keep their original index labels; non-fraud rows come first.
df = pd.concat([legit_rows, fraud_rows], axis=0)
# Pick the categorical predictor columns and the fraud label
categorical_columns = [
    "ProductCD",
    "P_emaildomain",
    "R_emaildomain",
    "card4",
    "M1",
    "M2",
    "M3",
]
y = df["isFraud"]

# One-hot encode the categorical predictors and expand the sparse result
# into a dense frame whose columns are named by the encoder
enc = OneHotEncoder(handle_unknown="ignore")
encoded = enc.fit(df[categorical_columns]).transform(df[categorical_columns])
X = pd.DataFrame(encoded.toarray(), columns=enc.get_feature_names_out())

# X has a fresh default index while df keeps its original row labels, so the
# amount column is attached positionally through a plain array
X["TransactionAmt"] = df["TransactionAmt"].to_numpy()
# Split the dataset into train/test partitions (80/20, fixed seed) and write
# each partition to feature_sets/ as CSV. No model is trained in this script.
from pathlib import Path

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing into it.
Path("feature_sets").mkdir(parents=True, exist_ok=True)

# index=False: row labels are not written, only the column data.
X_train.to_csv("feature_sets/X_train.csv", index=False)
X_test.to_csv("feature_sets/X_test.csv", index=False)
y_train.to_csv("feature_sets/y_train.csv", index=False)
y_test.to_csv("feature_sets/y_test.csv", index=False)