microsoft · WinstonLiyt · Sep 19, 2024 · Sep 19, 2024 · Sep 19, 2024
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -155,7 +155,7 @@ def clear(self) -> None:
         """
         Clear the workspace
         """
-        shutil.rmtree(self.workspace_path)
+        shutil.rmtree(self.workspace_path, ignore_errors=True)
         self.code_dict = {}
 
     def execute(self) -> object | None:

diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
@@ -7,7 +7,7 @@
 from rdagent.components.coder.factor_coder.factor import FactorTask
 from rdagent.components.runner import CachedRunner
 from rdagent.components.runner.conf import RUNNER_SETTINGS
-from rdagent.core.exception import ModelEmptyError
+from rdagent.core.exception import FactorEmptyError, ModelEmptyError
 from rdagent.core.experiment import ASpecificExp
 from rdagent.oai.llm_utils import md5_hash
 from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
@@ -41,12 +41,20 @@ class KGModelRunner(KGCachedRunner[KGModelExperiment]):
     def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
         self.build_from_SOTA(exp)
         if exp.sub_workspace_list[0].target_task.model_type == "XGBoost":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_xgb.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "RandomForest":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_rf.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "LightGBM":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_lgb.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "NN":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_nn.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)
@@ -113,7 +121,7 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
             exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape))
             current_feature_file_count += 1
         if implemented_factor_count == 0:
-            raise ModelEmptyError("No factor is implemented")
+            raise FactorEmptyError("No factor is implemented")
 
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)

diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -6,13 +6,15 @@
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
 
-def prepreprocess():
+def prepreprocess(debug_mode=False):
     """
     This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
     """
     # Load and preprocess the data
     data_df = pd.read_csv("/kaggle/input/train.csv")
-    data_df = data_df.head(1200)
+    if debug_mode:
+        data_df = data_df.sample(frac=0.1, random_state=42)
+    data_df = data_df
     data_df = data_df.drop(["id"], axis=1)
 
     X = data_df.drop(["class"], axis=1)
@@ -79,11 +81,11 @@ def preprocess_transform(X: pd.DataFrame, preprocessor):
     return X_transformed
 
 
-def preprocess_script():
+def preprocess_script(debug_mode=False):
     """
     This method applies the preprocessing steps to the training, validation, and test datasets.
     """
-    X_train, X_valid, y_train, y_valid = prepreprocess()
+    X_train, X_valid, y_train, y_valid = prepreprocess(debug_mode=debug_mode)
 
     # Fit the preprocessor on the training data
     preprocessor = preprocess_fit(X_train)
@@ -94,7 +96,8 @@ def preprocess_script():
 
     # Load and preprocess the test data
     submission_df = pd.read_csv("/kaggle/input/test.csv")
-    submission_df = submission_df.head(500)
+    if debug_mode:
+        submission_df = submission_df.sample(frac=0.1, random_state=42)
     passenger_ids = submission_df["id"]
     submission_df = submission_df.drop(["id"], axis=1)
     X_test = preprocess_transform(submission_df, preprocessor)

diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -13,7 +13,7 @@
 
 from fea_share_preprocess import preprocess_script
 
-X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script()
+X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script(debug_mode=True)
 
 pickle.dump(X_train, open("X_train.pkl", "wb"))
 pickle.dump(X_valid, open("X_valid.pkl", "wb"))

diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -93,7 +93,7 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
 
         context_dict = {
             "hypothesis_and_feedback": hypothesis_feedback,
-            "RAG": None,
+            "RAG": rag_content,
             "hypothesis_output_format": prompt_dict["hypothesis_output_format"],
             "hypothesis_specification": None,
         }