fix: update new feature engineering code format (#272)

* update new feature engineering code format * fix CI
microsoft · Sep 19, 2024 · 7850b80 · 7850b80
1 parent c4895de
commit 7850b80
Show file tree

Hide file tree

Showing 4 changed files with 51 additions and 24 deletions.
diff --git a/rdagent/components/coder/factor_coder/factor_execution_template.txt b/rdagent/components/coder/factor_coder/factor_execution_template.txt
@@ -2,12 +2,14 @@ import os
 
 import numpy as np
 import pandas as pd
-from factor import feat_eng
+from factor import feature_engineering_cls
 
 if os.path.exists("valid.pkl"):
     valid_df = pd.read_pickle("valid.pkl")
 else:
     raise FileNotFoundError("No valid data found.")
 
-new_feat = feat_eng(valid_df)
+cls = feature_engineering_cls()
+cls.fit(valid_df)
+new_feat = cls.transform(valid_df)
 new_feat.to_hdf("result.h5", key="data", mode="w")
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/feature/feature.py
@@ -1,13 +1,23 @@
 import pandas as pd
 
 """
-Here is the feature engineering code for each task, with the function name specified as feat_eng. 
-The file name should start with feat_, followed by the specific task name.
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Remember to assign the class to the module-level variable `feature_engineering_cls`.
 """
 
 
-def feat_eng(X: pd.DataFrame):
-    """
-    return the selected features
-    """
-    return X
+class IdentityFeature:
+    def fit(self, train_df: pd.DataFrame):
+        """
+        Fit the feature engineering model to the training data.
+        """
+        pass
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Transform the input data.
+        """
+        return X
+
+
+feature_engineering_cls = IdentityFeature
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/train.py
@@ -44,10 +44,11 @@ def import_module_from_path(module_name, module_path):
 X_test_l = []
 
 for f in DIRNAME.glob("feature/feat*.py"):
-    m = import_module_from_path(f.stem, f)
-    X_train_f = m.feat_eng(X_train)
-    X_valid_f = m.feat_eng(X_valid)
-    X_test_f = m.feat_eng(X_test)
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
 
     X_train_l.append(X_train_f)
     X_valid_l.append(X_valid_f)

diff --git a/rdagent/scenarios/kaggle/experiment/prompts.yaml b/rdagent/scenarios/kaggle/experiment/prompts.yaml
@@ -66,11 +66,13 @@ kg_background: |-
 kg_feature_interface: |-
   Your code should contain several parts:
   1. The import part: import the necessary libraries.
-  2. A feat_eng() function that handles feature engineering for each task.
-    The function should take the following arguments:
-      - X: The features as a pandas DataFrame.
-    The function should return the new features as a pandas DataFrame.
-  The input to `feat_eng` will be a pandas DataFrame, which should be processed to return a new DataFrame containing only the engineered features. 
+  2. A class that contains the feature engineering logic.
+    The class should have the following methods:
+      - fit: This method should fit the feature engineering model to the training data.
+      - transform: This method should transform the input data and return it.
+    For some tasks like generating new features, the fit method may not be necessary; in that case, implement it as a no-op.
+  3. A variable called feature_engineering_cls that is assigned the class itself (not an instance, and not the class name as a string).
+  The input to 'fit' is the training data as a pandas DataFrame, and the input to 'transform' is the data to be transformed, also as a pandas DataFrame.
   The original columns should be excluded from the returned DataFrame.
 
   Exception handling will be managed externally, so avoid using try-except blocks in your code. The user will handle any exceptions that arise and provide feedback as needed.
@@ -83,12 +85,24 @@ kg_feature_interface: |-
   ```python
   import pandas as pd
 
-  def feat_eng(X: pd.DataFrame):
-      """
-      return the selected features
-      """
-      return X.mean(axis=1).to_frame("mean_feature") # Example feature engineering
-      return X.fillna(0) # Example feature processing
+  class FeatureEngineeringName:
+      def fit(self, train_df: pd.DataFrame):
+          """
+          Fit the feature engineering model to the training data. 
+          For example, for one hot encoding, this would involve fitting the encoder to the training data.
+          For feature scaling, this would involve fitting the scaler to the training data.
+          """
+          return self
+
+      def transform(self, X: pd.DataFrame):
+          """
+          Transform the input data.
+          """
+          return X
+          return X.mean(axis=1).to_frame("mean_feature") # Example feature engineering
+          return X.fillna(0) # Example feature processing
+
+  feature_engineering_cls = FeatureEngineeringName
   ```
 
   To Note: