-
-
Notifications
You must be signed in to change notification settings - Fork 124
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add a new template for kaggle (#289)
* edit prompts & model_rf * init for sf-crime template * CI issues * discard change in prompts * CI issues * fix a bug * fix some bugs * fix a ci bug --------- Co-authored-by: WinstonLiye <1957922024@qq.com>
- Loading branch information
1 parent
5b124d7
commit eee3ab5
Showing
6 changed files
with
362 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
125 changes: 125 additions & 0 deletions
125
rdagent/scenarios/kaggle/experiment/sf-crime_template/fea_share_preprocess.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
import os | ||
|
||
import pandas as pd | ||
from sklearn.compose import ColumnTransformer | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.pipeline import Pipeline | ||
from sklearn.preprocessing import LabelEncoder | ||
|
||
|
||
def prepreprocess():
    """
    Load the raw competition CSVs and turn them into model-ready splits.

    Free-text columns are dropped, the ``Dates`` timestamp is expanded into
    calendar/time-of-day columns, ``PdDistrict`` and the ``Category`` target
    are label-encoded, and the training rows are split 80/20 into train and
    validation parts.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid, X_test,
        category_encoder, test_ids)`` where ``category_encoder`` is the
        ``LabelEncoder`` fitted on the training ``Category`` column and
        ``test_ids`` is the ``Id`` column of the test file.
    """

    def feature_engineering(data):
        """Add date-derived columns to ``data`` in place and return it."""
        stamps = data["Dates"].dt
        # Assignment order matters: new columns are appended in this order and
        # the feature slice below is positional.
        for column, attribute in (
            ("Day", "day"),
            ("Month", "month"),
            ("Year", "year"),
            ("Hour", "hour"),
            ("Minute", "minute"),
            ("DayOfWeek", "dayofweek"),
        ):
            data[column] = getattr(stamps, attribute)
        data["WeekOfYear"] = stamps.isocalendar().week
        return data

    # Training data: parse timestamps and discard the text columns.
    raw_train = pd.read_csv("/kaggle/input/train.csv", parse_dates=["Dates"], index_col=False)
    train = raw_train.drop(columns=["Descript", "Resolution", "Address"])

    # Test data: remember the submission ids before trimming columns.
    raw_test = pd.read_csv("/kaggle/input/test.csv", parse_dates=["Dates"], index_col=False)
    test_ids = raw_test["Id"]
    test = raw_test.drop(columns=["Address"])

    train = feature_engineering(train)
    test = feature_engineering(test)

    # One encoder, fitted on train only, is shared by both frames for 'PdDistrict'.
    district_encoder = LabelEncoder().fit(train["PdDistrict"])
    train["PdDistrict"] = district_encoder.transform(train["PdDistrict"])
    test["PdDistrict"] = district_encoder.transform(test["PdDistrict"])

    # The target gets its own encoder, returned so predictions can be mapped back to names.
    category_encoder = LabelEncoder()
    train["CategoryEncoded"] = category_encoder.fit_transform(train["Category"])

    # Positional slice of the train columns, minus 'Minute'.
    # NOTE(review): which columns land in positions 2..11 depends on the CSV's
    # column order -- confirm against the actual file if the schema changes.
    feature_columns = train.columns[2:12].tolist()
    feature_columns.remove("Minute")

    X = train[feature_columns]
    y = train["CategoryEncoded"]
    X_test = test[feature_columns]

    # Hold out 20% of the training rows for validation (fixed seed).
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
    print(X.shape, y.shape, X_test.shape)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids
|
||
|
||
def preprocess_fit(X_train: pd.DataFrame):
    """
    Build and fit the column preprocessor on the training features.

    Every column of ``X_train`` is treated as numerical and routed through a
    mean-value imputer.

    Returns:
        The fitted ``ColumnTransformer``.
    """
    # Single-step pipeline: fill missing values with the column mean.
    mean_imputation = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Apply that pipeline to all columns of the training frame.
    preprocessor = ColumnTransformer(transformers=[("num", mean_imputation, X_train.columns)])

    # ``fit`` returns the transformer itself.
    return preprocessor.fit(X_train)
|
||
|
||
def preprocess_transform(X: pd.DataFrame, preprocessor):
    """
    Run ``X`` through an already-fitted preprocessor.

    The preprocessor's array output is wrapped back into a DataFrame that
    reuses the column labels and index of ``X``.
    """
    return pd.DataFrame(preprocessor.transform(X), columns=X.columns, index=X.index)
|
||
|
||
def preprocess_script():
    """
    Produce the preprocessed train, validation and test data.

    If ``X_train.pkl`` exists in the working directory, the five cached
    frames are loaded from their pickles instead of being recomputed.
    Otherwise the raw CSVs are processed via ``prepreprocess`` and imputed
    with a preprocessor fitted on the training split.

    Both paths return the same 7-tuple. The cached path previously returned
    only five values, which broke callers that unpack the label encoder and
    the test ids.

    Returns:
        tuple: ``(X_train, X_valid, y_train, y_valid, X_test,
        category_encoder, test_ids)``.
    """
    if os.path.exists("X_train.pkl"):
        X_train, X_valid, y_train, y_valid, X_test = (
            pd.read_pickle(f"{name}.pkl") for name in ("X_train", "X_valid", "y_train", "y_valid", "X_test")
        )

        # The encoder and the ids are not part of the cache. Rebuild them from
        # single-column reads of the raw files, exactly as `prepreprocess`
        # derives them, so the return shape matches the uncached path.
        category_encoder = LabelEncoder().fit(
            pd.read_csv("/kaggle/input/train.csv", usecols=["Category"], index_col=False)["Category"]
        )
        test_ids = pd.read_csv("/kaggle/input/test.csv", usecols=["Id"], index_col=False)["Id"]

        return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids

    X_train, X_valid, y_train, y_valid, test, category_encoder, test_ids = prepreprocess()

    # Fit the preprocessor on the training data only.
    preprocessor = preprocess_fit(X_train)

    # Apply the fitted preprocessor to every split.
    X_train = preprocess_transform(X_train, preprocessor)
    X_valid = preprocess_transform(X_valid, preprocessor)
    X_test = preprocess_transform(test, preprocessor)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids
23 changes: 23 additions & 0 deletions
23
rdagent/scenarios/kaggle/experiment/sf-crime_template/feature/feature.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pandas as pd | ||
|
||
""" | ||
Here is the feature engineering code for each task, with a class that has a fit and transform method. | ||
Remember | ||
""" | ||
|
||
|
||
class IdentityFeature:
    """Pass-through feature generator: learns nothing and returns its input as-is."""

    def fit(self, train_df: pd.DataFrame):
        """
        Accept the training data without learning anything from it.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Return ``X`` itself, unmodified.
        """
        return X


# Name looked up by the training script to instantiate this module's feature class.
feature_engineering_cls = IdentityFeature
54 changes: 54 additions & 0 deletions
54
rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_randomforest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
""" | ||
Motivation of the model: | ||
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. | ||
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good | ||
baseline model for many classification tasks. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn.ensemble import RandomForestClassifier | ||
from sklearn.metrics import accuracy_score | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Feature-selection hook shared by ``fit`` and ``predict``.

    Currently keeps every column, so ``X`` is returned as-is; replace the body
    with real selection logic when needed.
    """
    return X
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Random Forest classifier and report its validation accuracy.

    Both feature frames pass through ``select`` first so training and
    validation see the same columns. The accuracy is only printed; the
    fitted model is returned.
    """
    train_features = select(X_train)
    valid_features = select(X_valid)

    # Small forest with a fixed seed, using all available cores.
    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)
    model.fit(train_features, y_train)

    # Report hold-out accuracy.
    accuracy = accuracy_score(y_valid, model.predict(valid_features))
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model
|
||
|
||
def predict(model, X):
    """
    Return the model's class probabilities for ``X``.

    ``X`` goes through the same ``select`` step used during training; the
    output of ``model.predict_proba`` is returned unchanged (no thresholding).
    """
    return model.predict_proba(select(X))
44 changes: 44 additions & 0 deletions
44
rdagent/scenarios/kaggle/experiment/sf-crime_template/model/model_xgboost.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
""" | ||
motivation of the model | ||
""" | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import xgboost as xgb | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """Feature-selection hook; no selection is applied, so ``X`` is returned as-is."""
    return X
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """
    Train a multi-class XGBoost booster and return it.

    Both feature frames pass through ``select`` before being wrapped in
    ``DMatrix`` objects. The number of classes is taken from the distinct
    values in ``y_train``. Train and validation metrics are reported by
    ``xgb.train`` through the evaluation list.

    Returns:
        The trained booster returned by ``xgb.train``; with the
        ``multi:softprob`` objective its predictions are per-class
        probabilities.
    """
    X_train = select(X_train)
    X_valid = select(X_valid)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    num_classes = len(np.unique(y_train))

    # TODO: for quick running....
    params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
        # Was misspelled "nthred", which XGBoost does not recognise, so the
        # thread setting was never applied.
        "nthread": -1,
    }
    num_round = 100

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst
|
||
|
||
def predict(model, X):
    """
    Return the booster's predictions for ``X``.

    ``X`` goes through the same ``select`` step used in ``fit`` and is wrapped
    in a ``DMatrix`` before being passed to ``model.predict``.
    """
    features = xgb.DMatrix(select(X))
    return model.predict(features)
116 changes: 116 additions & 0 deletions
116
rdagent/scenarios/kaggle/experiment/sf-crime_template/train.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import importlib.util | ||
import random | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from fea_share_preprocess import preprocess_script | ||
from sklearn.metrics import log_loss | ||
|
||
# Fix the seeds of both Python's and NumPy's global RNGs for reproducible runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Directory containing this script; feature/ and model/ modules are discovered relative to it.
_THIS_FILE = Path(__file__).absolute()
DIRNAME = _THIS_FILE.resolve().parent
|
||
|
||
# Support various method for metrics calculation | ||
def compute_metrics_for_classification(y_true, y_pred):
    """
    Compute the multi-class log loss of ``y_pred`` against ``y_true``.

    Args:
        y_true: True class labels.
        y_pred: Predicted class probabilities, one column per class.

    Returns:
        The log loss as returned by ``sklearn.metrics.log_loss``.

    The label set is normally the distinct values of ``y_true``. If the
    probability matrix has a different number of columns (for example, a rare
    class is missing from the evaluated split), the labels fall back to
    ``0 .. n_columns - 1``. This assumes the targets are consecutive integer
    codes such as those produced by ``LabelEncoder``.
    """
    all_classes = np.unique(y_true)

    # Without this fallback, log_loss raises on a label/column count mismatch.
    if np.ndim(y_pred) == 2:
        n_columns = np.shape(y_pred)[1]
        if n_columns != len(all_classes):
            all_classes = np.arange(n_columns)

    return log_loss(y_true, y_pred, labels=all_classes)
|
||
|
||
def import_module_from_path(module_name, module_path):
    """
    Load and execute a Python source file as a module.

    Args:
        module_name: Name to give the loaded module.
        module_path: Filesystem location of the source file.

    Returns:
        The executed module object. It is not registered in ``sys.modules``.

    Raises:
        ImportError: If no import spec or loader can be created for
            ``module_path`` (for example, an unrecognised file suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # `spec_from_file_location` returns None when it cannot pick a loader.
    # Fail with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||
|
||
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

# Sorted so the resulting column order does not depend on filesystem listing order.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only if it yields the same number of columns on all three splits.
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)

# Handle inf and -inf values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

# Mean-impute missing values; the imputer is fitted on the training split only.
imputer = SimpleImputer(strategy="mean")

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Remove duplicate columns
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

print(X_train.shape, X_valid.shape, X_test.shape)

# 3) Train the model
model_l = []  # list[tuple[model, predict_func]]
for f in sorted(DIRNAME.glob("model/model*.py")):
    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the model on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    # Predict once per model and reuse the result for the ensemble and the debug output.
    y_valid_pred = predict_func(model, X_valid)
    y_valid_pred_l.append(y_valid_pred)
    print(y_valid_pred)
    print(y_valid_pred.shape)

# 5) Ensemble
from scipy import stats

# average probabilities ensemble
y_valid_pred_proba = np.mean(y_valid_pred_l, axis=0)

# Compute metrics
logloss = compute_metrics_for_classification(y_valid, y_valid_pred_proba)
print(f"final log_loss on valid set: {logloss}")

# 6) Save the validation metrics
pd.Series(data=[logloss], index=["log_loss"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))

# Same ensemble rule as on the validation set: average the per-class probabilities across models.
y_test_pred_proba = np.mean(y_test_pred_l, axis=0)

# One probability column per original category name, in encoder order.
class_labels = category_encoder.classes_

submission_result = pd.DataFrame(y_test_pred_proba, columns=class_labels)
submission_result.insert(0, "Id", test_ids)

submission_result.to_csv("submission.csv", index=False)