Commit
feat: add a new template for kaggle (#289)
* edit prompts & model_rf

* init for sf-crime template

* CI issues

* discard change in prompts

* CI issues

* fix a bug

* fix some bugs

* fix a ci bug

---------

Co-authored-by: WinstonLiye <1957922024@qq.com>
TPLin22 and WinstonLiyt authored Sep 21, 2024
1 parent 5b124d7 commit eee3ab5
Showing 6 changed files with 362 additions and 1 deletion.
1 change: 0 additions & 1 deletion rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
@@ -6,7 +6,6 @@
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import accuracy_score, matthews_corrcoef
-from sklearn.preprocessing import LabelEncoder

# Set random seed for reproducibility
SEED = 42
@@ -0,0 +1,125 @@
import os

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


def prepreprocess():
    """
    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
    """
    # Load and preprocess the data
    train = pd.read_csv(
        "/kaggle/input/train.csv",
        parse_dates=["Dates"],
        index_col=False,
    )
    train = train.drop(["Descript", "Resolution", "Address"], axis=1)

    test = pd.read_csv(
        "/kaggle/input/test.csv",
        parse_dates=["Dates"],
        index_col=False,
    )
    test_ids = test["Id"]
    test = test.drop(["Address"], axis=1)

    # Feature engineering
    def feature_engineering(data):
        data["Day"] = data["Dates"].dt.day
        data["Month"] = data["Dates"].dt.month
        data["Year"] = data["Dates"].dt.year
        data["Hour"] = data["Dates"].dt.hour
        data["Minute"] = data["Dates"].dt.minute
        data["DayOfWeek"] = data["Dates"].dt.dayofweek
        data["WeekOfYear"] = data["Dates"].dt.isocalendar().week
        return data

    train = feature_engineering(train)
    test = feature_engineering(test)

    # Encoding 'PdDistrict'
    enc = LabelEncoder()
    train["PdDistrict"] = enc.fit_transform(train["PdDistrict"])
    test["PdDistrict"] = enc.transform(test["PdDistrict"])

    # Encoding 'Category' in train set
    category_encoder = LabelEncoder()
    category_encoder.fit(train["Category"])
    train["CategoryEncoded"] = category_encoder.transform(train["Category"])

    # Selecting feature columns for modeling
    x_cols = list(train.columns[2:12].values)
    x_cols.remove("Minute")  # Exclude the 'Minute' column
    X = train[x_cols]
    y = train["CategoryEncoded"]
    X_test = test[x_cols]

    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
    print(X.shape, y.shape, X_test.shape)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids


def preprocess_fit(X_train: pd.DataFrame):
    """
    Fits the preprocessor on the training data and returns the fitted preprocessor.
    """
    # Identify numerical features
    numerical_cols = X_train.columns  # All columns are numerical

    # Define preprocessor for numerical features
    numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

    # Combine preprocessing steps
    preprocessor = ColumnTransformer(transformers=[("num", numerical_transformer, numerical_cols)])

    # Fit the preprocessor on the training data
    preprocessor.fit(X_train)

    return preprocessor


def preprocess_transform(X: pd.DataFrame, preprocessor):
    """
    Transforms the given DataFrame using the fitted preprocessor.
    """
    # Transform the data using the fitted preprocessor
    X_array = preprocessor.transform(X)

    # Convert arrays back to DataFrames
    X_transformed = pd.DataFrame(X_array, columns=X.columns, index=X.index)

    return X_transformed


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    if os.path.exists("X_train.pkl"):
        X_train = pd.read_pickle("X_train.pkl")
        X_valid = pd.read_pickle("X_valid.pkl")
        y_train = pd.read_pickle("y_train.pkl")
        y_valid = pd.read_pickle("y_valid.pkl")
        X_test = pd.read_pickle("X_test.pkl")
        # NOTE: this cached branch returns 5 values, while the cold path below
        # returns 7 (category_encoder and test_ids are not cached), so callers
        # that unpack 7 values will fail when the cache exists.
        return X_train, X_valid, y_train, y_valid, X_test

    X_train, X_valid, y_train, y_valid, test, category_encoder, test_ids = prepreprocess()

    # Fit the preprocessor on the training data
    preprocessor = preprocess_fit(X_train)

    # Preprocess the train and validation data
    X_train = preprocess_transform(X_train, preprocessor)
    X_valid = preprocess_transform(X_valid, preprocessor)

    # Preprocess the test data
    X_test = preprocess_transform(test, preprocessor)

    return X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids
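
For reference, a minimal usage sketch of preprocess_script (the caller below is an assumption for illustration, not part of this commit). It shows the intended fit-on-train, transform-everywhere pattern:

from fea_share_preprocess import preprocess_script

# Either loads cached pickles or runs prepreprocess -> preprocess_fit -> preprocess_transform.
X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids = preprocess_script()

# The imputer statistics were fit on X_train only, so the validation and
# test splits are transformed with the same training-set statistics.
print(X_train.shape, X_valid.shape, X_test.shape)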
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
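
A hypothetical custom feature class following the same fit/transform contract (illustrative only; FrequencyFeature is not part of this commit):

class FrequencyFeature:
    def fit(self, train_df: pd.DataFrame):
        # Learn per-column value frequencies from the training split only
        self.freq_ = {col: train_df[col].value_counts(normalize=True) for col in train_df.columns}

    def transform(self, X: pd.DataFrame):
        # Map each value to its training-set frequency; unseen values become 0
        out = X.copy()
        for col in X.columns:
            out[col + "_freq"] = X[col].map(self.freq_[col]).fillna(0.0)
        return out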
@@ -0,0 +1,54 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    return X


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Define and train the Random Forest model. Merge feature selection into the pipeline.
    """
    # Initialize the Random Forest model
    model = RandomForestClassifier(n_estimators=10, random_state=32, n_jobs=-1)

    # Select features (if any feature selection is needed)
    X_train_selected = select(X_train)
    X_valid_selected = select(X_valid)

    # Fit the model
    model.fit(X_train_selected, y_train)

    # Validate the model
    y_valid_pred = model.predict(X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    return model


def predict(model, X):
    """
    Keep feature selection's consistency and make predictions.
    """
    # Select features (if any feature selection is needed)
    X_selected = select(X)

    # Predict class probabilities using the trained model
    y_pred_prob = model.predict_proba(X_selected)

    # Return per-class probabilities (not thresholded labels); the downstream
    # ensemble and log-loss metric expect probabilities
    return y_pred_prob
@@ -0,0 +1,44 @@
"""
motivation of the model
"""

import numpy as np
import pandas as pd
import xgboost as xgb


def select(X: pd.DataFrame) -> pd.DataFrame:
    # Ignore feature selection logic
    return X


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Define and train the model. Merge feature_select."""
    X_train = select(X_train)
    X_valid = select(X_valid)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    num_classes = len(np.unique(y_train))

    # TODO: for quick running....
    params = {
        "objective": "multi:softprob",
        "num_class": num_classes,
        "nthread": -1,
    }
    num_round = 100

    evallist = [(dtrain, "train"), (dvalid, "eval")]
    bst = xgb.train(params, dtrain, num_round, evallist)

    return bst


def predict(model, X):
    """
    Keep feature select's consistency.
    """
    X = select(X)
    dtest = xgb.DMatrix(X)
    y_pred_prob = model.predict(dtest)
    return y_pred_prob
116 changes: 116 additions & 0 deletions rdagent/scenarios/kaggle/experiment/sf-crime_template/train.py
@@ -0,0 +1,116 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import log_loss

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


# Support various methods for metrics calculation
def compute_metrics_for_classification(y_true, y_pred):
    """Compute log loss for classification."""
    all_classes = np.unique(y_true)
    logloss = log_loss(y_true, y_pred, labels=all_classes)
    return logloss
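
# Illustrative check (assumed values, not part of this commit): for
# y_true = [0, 1] and y_pred = [[0.9, 0.1], [0.2, 0.8]],
# log_loss = -(ln 0.9 + ln 0.8) / 2 ≈ 0.164; confident wrong predictions
# are penalized much more heavily.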


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, category_encoder, test_ids = preprocess_script()


# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only if it yields the same number of columns on all splits
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1)
X_valid = pd.concat(X_valid_l, axis=1)
X_test = pd.concat(X_test_l, axis=1)


# Handle inf and -inf values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")

X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Remove duplicate columns
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]

print(X_train.shape, X_valid.shape, X_test.shape)

# 3) Train the model
model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train, y_train, X_valid, y_valid), m.predict))

# 4) Evaluate the models on the validation set
y_valid_pred_l = []
for model, predict_func in model_l:
    y_valid_pred = predict_func(model, X_valid)
    y_valid_pred_l.append(y_valid_pred)
    print(y_valid_pred)
    print(y_valid_pred.shape)

# 5) Ensemble

# average probabilities ensemble
y_valid_pred_proba = np.mean(y_valid_pred_l, axis=0)

# Compute metrics
logloss = compute_metrics_for_classification(y_valid, y_valid_pred_proba)
print(f"final log_loss on valid set: {logloss}")

# 6) Save the validation metrics
pd.Series(data=[logloss], index=["log_loss"]).to_csv("submission_score.csv")

# 7) Make predictions on the test set and save them
y_test_pred_l = []
for model, predict_func in model_l:
    y_test_pred_l.append(predict_func(model, X_test))

# Average the predicted class probabilities across models
y_test_pred_proba = np.mean(y_test_pred_l, axis=0)

class_labels = category_encoder.classes_

submission_result = pd.DataFrame(y_test_pred_proba, columns=class_labels)
submission_result.insert(0, "Id", test_ids)

submission_result.to_csv("submission.csv", index=False)
