Commit
feat: add a new template for kaggle (#289)
* edit prompts & model_rf

* init for sf-crime template

* CI issues

* discard change in prompts

* CI issues

* fix a bug

* fix some bugs

* fix a ci bug

---------

Co-authored-by: WinstonLiye <1957922024@qq.com>
TPLin22 and WinstonLiyt authored Sep 21, 2024
1 parent 5b124d7 commit eee3ab5
Showing 6 changed files with 362 additions and 1 deletion.
1 change: 0 additions & 1 deletion rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
@@ -6,7 +6,6 @@
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import accuracy_score, matthews_corrcoef
-from sklearn.preprocessing import LabelEncoder

# Set random seed for reproducibility
SEED = 42
@@ -0,0 +1,125 @@
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


def prepreprocess():
    """
    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
    """
    # Load and preprocess the data
    train = pd.read_csv(
        "/kaggle/input/train.csv",
        parse_dates=["Dates"],
        index_col=False,
    )
    train = train.drop(["Descript", "Resolution", "Address"], axis=1)

    test = pd.read_csv(
        "/kaggle/input/test.csv",
        parse_dates=["Dates"],
        index_col=False,
    )
    test_ids = test["Id"]
    test = test.drop(["Address"], axis=1)

    # Feature engineering
    def feature_engineering(data):
        data["Day"] = data["Dates"].dt.day
        data["Month"] = data["Dates"].dt.month
        data["Year"] = data["Dates"].dt.year
        data["Hour"] = data["Dates"].dt.hour
        data["Minute"] = data["Dates"].dt.minute
        data["DayOfWeek"] = data["Dates"].dt.dayofweek
        data["WeekOfYear"] = data["Dates"].dt.isocalendar().week
        return data

    train = feature_engineering(train)
    test = feature_engineering(test)

    # Encoding 'PdDistrict'
    enc = LabelEncoder()
    train["PdDistrict"] = enc.fit_transform(train["PdDistrict"])
    test["PdDistrict"] = enc.transform(test["PdDistrict"])

    # Encoding 'Category' in train set
    category_encoder = LabelEncoder()
    category_encoder.fit(train["Category"])
    train["CategoryEncoded"] = category_encoder.transform(train["Category"])

    # Selecting feature columns for modeling
    x_cols = list(train.columns[2:12].values)
    x_cols.remove("Minute")  # Exclude the 'Minute' column
    X = train[x_cols]
    y = train["CategoryEncoded"]
    X_test = test[x_cols]

    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
    print(X.shape, y.shape, X_test.shape)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids


def preprocess_fit(X_train: pd.DataFrame):
    """
    Fits the preprocessor on the training data and returns the fitted preprocessor.
    """
    # Identify numerical features
    numerical_cols = X_train.columns  # All columns are numerical

    # Define preprocessor for numerical features
    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, numerical_cols)])

    # Fit the preprocessor on the training data
    preprocessor.fit(X_train)

    return preprocessor


def preprocess_transform(X: pd.DataFrame, preprocessor):
    """
    Transforms the given DataFrame using the fitted preprocessor.
    """
    # Transform the data using the fitted preprocessor
    X_array = preprocessor.transform(X)

    # Convert arrays back to DataFrames
    X_transformed = pd.DataFrame(X_array, columns=X.columns, index=X.index)

    return X_transformed


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    if os.path.exists("X_train.pkl"):
        X_train = pd.read_pickle("X_train.pkl")
        X_valid = pd.read_pickle("X_valid.pkl")
        y_train = pd.read_pickle("y_train.pkl")
        y_valid = pd.read_pickle("y_valid.pkl")
        X_test = pd.read_pickle("X_test.pkl")
        # NOTE: this cached branch returns 5 values, while the cold path below
        # returns 7 (category_encoder and test_ids are not cached), so callers
        # that unpack 7 values will fail when the cache exists.
        return X_train, X_valid, y_train, y_valid, X_test

    X_train, X_valid, y_train, y_valid, test, category_encoder, test_ids = prepreprocess()

    # Fit the preprocessor on the training data
    preprocessor = preprocess_fit(X_train)

    # Preprocess the train and validation data
    X_train = preprocess_transform(X_train, preprocessor)
    X_valid = preprocess_transform(X_valid, preprocessor)

    # Preprocess the test data
    X_test = preprocess_transform(test, preprocessor)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids
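
For reference, a minimal usage sketch of preprocess_script (the caller below is an assumption for illustration, not part of this commit). It shows the intended fit-on-train, transform-everywhere pattern:

from fea_share_preprocess import preprocess_script

# Either loads cached pickles or runs prepreprocess -> preprocess_fit -> preprocess_transform.
X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids = preprocess_script()

# The imputer statistics were fit on X_train only, so the validation and
# test splits are transformed with the same training-set statistics.
print(X_train.shape, X_valid.shape, X_test.shape)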
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
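
A hypothetical custom feature class following the same fit/transform contract (illustrative only; FrequencyFeature is not part of this commit):

class FrequencyFeature:
    def fit(self, train_df: pd.DataFrame):
        # Learn per-column value frequencies from the training split only
        self.freq_ = {col: train_df[col].value_counts(normalize=True) for col in train_df.columns}

    def transform(self, X: pd.DataFrame):
        # Map each value to its training-set frequency; unseen values become 0
        out = X.copy()
        for col in X.columns:
            out[col + "_freq"] = X[col].map(self.freq_[col]).fillna(0.0)
        return out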
@@ -0,0 +1,54 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection's consistency and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict class probabilities using the trained model
    y_pred_prob = model.predict_proba(X_selected)

    # Return per-class probabilities (not thresholded labels); the downstream
    # ensemble and log-loss metric expect probabilities
    return y_pred_prob
@@ -0,0 +1,44 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Define and train the model. Merge feature_select."""
    X_train = select(X_train)
    X_valid = select(X_valid)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    num_classes = len(np.unique(y_train))

    # TODO: for quick running....
    params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
        "nthread": -1,
    }
    num_round = 100

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst


def predict(model, X):
    """
    Keep feature select's consistency.
    """
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
    return y_pred_prob
116 changes: 116 additions & 0 deletions rdagent/scenarios/kaggle/experiment/sf-crime_template/train.py
@@ -0,0 +1,116 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import log_loss

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


# Support various methods for metrics calculation
def compute_metrics_for_classification(y_true, y_pred):
    """Compute log loss for classification."""
    all_classes = np.unique(y_true)
    logloss = log_loss(y_true, y_pred, labels=all_classes)
    return logloss
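
# Illustrative check (assumed values, not part of this commit): for
# y_true = [0, 1] and y_pred = [[0.9, 0.1], [0.2, 0.8]],
# log_loss = -(ln 0.9 + ln 0.8) / 2 ≈ 0.164; confident wrong predictions
# are penalized much more heavily.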


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids = preprocess_script()


# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only if it yields the same number of columns on all splits
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)


# Handle inf and -inf values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Remove duplicate columns
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

print(X_train.shape, X_valid.shape, X_test.shape)

# 3) Train the model
model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the models on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    y_valid_pred = predict_func(model, X_valid)
    y_valid_pred_l.append(y_valid_pred)
    print(y_valid_pred)
    print(y_valid_pred.shape)

# 5) Ensemble

# average probabilities ensemble
y_valid_pred_proba = np.mean(y_valid_pred_l, axis=0)

# Compute metrics
logloss = compute_metrics_for_classification(y_valid, y_valid_pred_proba)
print(f"final log_loss on valid set: {logloss}")

# 6) Save the validation metrics
pd.Series(data=[logloss], index=["log_loss"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))

# Average the predicted class probabilities across models
y_test_pred_proba = np.mean(y_test_pred_l, axis=0)

class_labels = category_encoder.classes_

submission_result = pd.DataFrame(y_test_pred_proba, columns=class_labels)
submission_result.insert(0, "Id", test_ids)

submission_result.to_csv("submission.csv", index=False)
