diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
index 1f9f3f40..f6619876 100644
--- a/rdagent/core/experiment.py
+++ b/rdagent/core/experiment.py
@@ -155,7 +155,7 @@ def clear(self) -> None:
         """
         Clear the workspace
         """
-        shutil.rmtree(self.workspace_path)
+        shutil.rmtree(self.workspace_path, ignore_errors=True)
         self.code_dict = {}
 
     def execute(self) -> object | None:
diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
index 7a56de0a..7fa6242a 100644
--- a/rdagent/scenarios/kaggle/developer/runner.py
+++ b/rdagent/scenarios/kaggle/developer/runner.py
@@ -7,7 +7,7 @@
 from rdagent.components.coder.factor_coder.factor import FactorTask
 from rdagent.components.runner import CachedRunner
 from rdagent.components.runner.conf import RUNNER_SETTINGS
-from rdagent.core.exception import ModelEmptyError
+from rdagent.core.exception import FactorEmptyError, ModelEmptyError
 from rdagent.core.experiment import ASpecificExp
 from rdagent.oai.llm_utils import md5_hash
 from rdagent.scenarios.kaggle.experiment.kaggle_experiment import (
@@ -41,12 +41,20 @@ class KGModelRunner(KGCachedRunner[KGModelExperiment]):
     def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
         self.build_from_SOTA(exp)
         if exp.sub_workspace_list[0].target_task.model_type == "XGBoost":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_xgb.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "RandomForest":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_rf.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "LightGBM":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_lgb.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         elif exp.sub_workspace_list[0].target_task.model_type == "NN":
+            if exp.sub_workspace_list[0].code_dict == {}:
+                raise ModelEmptyError("No model is implemented")
             exp.experiment_workspace.inject_code(**{"model_nn.py": exp.sub_workspace_list[0].code_dict["model.py"]})
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)
@@ -113,7 +121,7 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
                 exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape))
                 current_feature_file_count += 1
         if implemented_factor_count == 0:
-            raise ModelEmptyError("No factor is implemented")
+            raise FactorEmptyError("No factor is implemented")
 
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)
diff --git a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
index 3bbd7bf2..b1121f73 100644
--- a/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
+++ b/rdagent/scenarios/kaggle/experiment/meta_tpl/fea_share_preprocess.py
@@ -6,13 +6,15 @@
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
 
-def prepreprocess():
+def prepreprocess(debug_mode=False):
     """
     This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
""" # Load and preprocess the data data_df = pd.read_csv("/kaggle/input/train.csv") - data_df = data_df.head(1200) + if debug_mode: + data_df = data_df.sample(frac=0.1, random_state=42) + data_df = data_df data_df = data_df.drop(["id"], axis=1) X = data_df.drop(["class"], axis=1) @@ -79,11 +81,11 @@ def preprocess_transform(X: pd.DataFrame, preprocessor): return X_transformed -def preprocess_script(): +def preprocess_script(debug_mode=False): """ This method applies the preprocessing steps to the training, validation, and test datasets. """ - X_train, X_valid, y_train, y_valid = prepreprocess() + X_train, X_valid, y_train, y_valid = prepreprocess(debug_mode=debug_mode) # Fit the preprocessor on the training data preprocessor = preprocess_fit(X_train) @@ -94,7 +96,8 @@ def preprocess_script(): # Load and preprocess the test data submission_df = pd.read_csv("/kaggle/input/test.csv") - submission_df = submission_df.head(500) + if debug_mode: + data_df = data_df.sample(frac=0.1, random_state=42) passenger_ids = submission_df["id"] submission_df = submission_df.drop(["id"], axis=1) X_test = preprocess_transform(submission_df, preprocessor) diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py index 42388785..64faa6dc 100644 --- a/rdagent/scenarios/kaggle/experiment/workspace.py +++ b/rdagent/scenarios/kaggle/experiment/workspace.py @@ -13,7 +13,7 @@ from fea_share_preprocess import preprocess_script -X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script() +X_train, X_valid, y_train, y_valid, X_test, passenger_ids = preprocess_script(debug_mode=True) pickle.dump(X_train, open("X_train.pkl", "wb")) pickle.dump(X_valid, open("X_valid.pkl", "wb")) diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 9883b7f7..0851e8d1 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -93,7 +93,7 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: context_dict = { "hypothesis_and_feedback": hypothesis_feedback, - "RAG": None, + "RAG": rag_content, "hypothesis_output_format": prompt_dict["hypothesis_output_format"], "hypothesis_specification": None, }