diff --git a/rdagent/app/kaggle/conf.py b/rdagent/app/kaggle/conf.py index 0a6059b8..da6c856d 100644 --- a/rdagent/app/kaggle/conf.py +++ b/rdagent/app/kaggle/conf.py @@ -51,12 +51,18 @@ class Config: local_data_path: str = "/data/userdata/share/kaggle" + domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge" + rag_path: str = "git_ignore_folder/rag" if_action_choosing_based_on_UCB: bool = False if_using_feature_selection: bool = False + if_using_graph_rag: bool = False + + if_using_vector_rag: bool = False + auto_submit: bool = True diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py index 3c185958..5f4baf1c 100644 --- a/rdagent/scenarios/kaggle/developer/feedback.py +++ b/rdagent/scenarios/kaggle/developer/feedback.py @@ -14,9 +14,6 @@ ) from rdagent.log import rdagent_logger as logger from rdagent.oai.llm_utils import APIBackend -from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import ( - extract_knowledge_from_feedback, -) from rdagent.utils import convert2bool prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml") @@ -155,13 +152,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac experiment_feedback = { "hypothesis_text": hypothesis_text, "current_result": current_result, - "tasks_factors": tasks_factors, + "model_code": model_code, + "available_features": available_features, "observations": observations, "hypothesis_evaluation": hypothesis_evaluation, "reason": reason, } - # self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) + if self.scen.if_using_vector_rag: + self.scen.vector_base.add_experience_to_vector_base(experiment_feedback) + elif self.scen.if_using_graph_rag: + self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen) return HypothesisFeedback( observations=observations, diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py index 251a645e..659b048c 100644 --- a/rdagent/scenarios/kaggle/experiment/scenario.py +++ b/rdagent/scenarios/kaggle/experiment/scenario.py @@ -35,9 +35,16 @@ def __init__(self, competition: str) -> None: self.model_output_channel = None self.evaluation_desc = None self.evaluation_metric_direction = None + self.vector_base = None self._analysis_competition_description() self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection + self.if_using_graph_rag = KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag + self.if_using_vector_rag = KAGGLE_IMPLEMENT_SETTING.if_using_vector_rag + + if self.if_using_vector_rag and KAGGLE_IMPLEMENT_SETTING.rag_path: + self.vector_base = KaggleExperienceBase() + self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path) self._output_format = self.output_format self._interface = self.interface diff --git a/rdagent/scenarios/kaggle/knowledge_management/graph.py b/rdagent/scenarios/kaggle/knowledge_management/graph.py index 614d8b09..9b002aca 100644 --- a/rdagent/scenarios/kaggle/knowledge_management/graph.py +++ b/rdagent/scenarios/kaggle/knowledge_management/graph.py @@ -1,4 +1,5 @@ import json +from datetime import datetime, timezone from pathlib import Path from typing import List @@ -20,21 +21,31 @@ class KGKnowledgeGraph(UndirectedGraph): - def __init__(self, path: str | Path | None, scenario: KGScenario) -> None: + def __init__(self, path: str | Path | None, scenario: KGScenario | None) -> None: super().__init__(path) - if path is not None and not Path(path).exists(): + if path is not None and Path(path).exists(): + self.load() + self.path = Path(path).parent / ( + datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl" + ) + else: documents = [] - for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / "domain_knowledge").glob("*.case"): + print(Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path)) + for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path)).rglob("*.case"): with open(file_path, "r") as f: documents.append(f.read()) self.load_from_documents(documents=documents, scenario=scenario) self.dump() - def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list: + def add_document(self, document_content: str, scenario: KGScenario | None) -> None: + self.load_from_documents([document_content], scenario) + self.dump() # Each valid experiment will overwrite this file once again. + + def analyze_one_document(self, document_content: str, scenario: KGScenario | None) -> list: session_system_prompt = ( Environment(undefined=StrictUndefined) .from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["system"]) - .render(scenario=scenario.get_scenario_all_desc()) + .render(scenario=scenario.get_scenario_all_desc() if scenario is not None else "") ) session = APIBackend().build_chat_session( @@ -53,7 +64,7 @@ def analyze_one_document(self, document_content: str, scenario: KGScenario) -> l user_prompt = "Continue from the last step please. Don't extract the same knowledge again." return knowledge_list - def load_from_documents(self, documents: List[str], scenario: KGScenario): + def load_from_documents(self, documents: List[str], scenario: KGScenario | None) -> None: knowledge_list_list = multiprocessing_wrapper( [ ( @@ -105,3 +116,7 @@ def load_from_documents(self, documents: List[str], scenario: KGScenario): node_list = self.batch_embedding(node_list) for node_pair in node_pairs: self.add_node(node_pair[0], node_pair[1]) + + +if __name__ == "__main__": + graph = KGKnowledgeGraph(path="git_ignore_folder/kg_graph.pkl", scenario=None) diff --git a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml index 7f04a69f..61fdc0f0 100644 --- a/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml +++ b/rdagent/scenarios/kaggle/knowledge_management/prompts.yaml @@ -41,11 +41,14 @@ extract_kaggle_knowledge_from_feedback_prompts: extract_knowledge_graph_from_document: system: |- - You are helping user to extract knowledge from a document. - The user is working on data science competitions in Kaggle in the following scenario: - {{ scenario }} + You are helping the user extract knowledge from a document. + {% if scenario %} + The user is working on data science competitions in Kaggle, with the following scenario: {{ scenario }} + {% else %} + The user is working on general data science competitions on Kaggle. + {% endif %} - The user has found some possible high value documents from other experts, and they need your help to extract some knowledge from these documents. + The user has identified valuable documents from other experts and requires your help to extract meaningful insights from them. Considering each document might contain several valuable insights, you need to extract them one by one and organize them in a structured format. @@ -58,13 +61,13 @@ extract_knowledge_graph_from_document: Please provide the analysis in the following JSON format: { - "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features", + "competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features (If no specific competition name or other fields are found, leave them blank).", "hypothesis": { "type": "one of the hypothesis types from ['Feature engineering', 'Feature processing', 'Model feature selection', 'Model tuning']", "explanation": "(Plain text) extracted detailed explanation to the hypothesis" }, - "experiments": "(Plain text) extracted experiments details. You can list them in bullet points.", + "experiments": "(Plain text) Detailed descriptions of the experiments conducted in the document, which can be listed in bullet points.", "code": "extracted code snippets if available", "conclusion": { diff --git a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py index 4ad55c3f..0b7cfd49 100644 --- a/rdagent/scenarios/kaggle/knowledge_management/vector_base.py +++ b/rdagent/scenarios/kaggle/knowledge_management/vector_base.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone from pathlib import Path from typing import List, Union @@ -107,7 +108,7 @@ class KaggleExperienceBase(PDVectorBase): Class for handling Kaggle competition experience posts and organizing them for reference """ - def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None): + def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None): """ Initialize the KaggleExperienceBase class @@ -118,12 +119,14 @@ def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[ kaggle_experience_path: str or Path, optional Path to the Kaggle experience post data. """ - super().__init__(path) + super().__init__(vector_df_path) self.kaggle_experience_path = kaggle_experience_path self.kaggle_experience_data = [] - - if kaggle_experience_path: - self.load_kaggle_experience(kaggle_experience_path) + # if path is not None and Path(path).exists(): + # self.load_kaggle_experience(kaggle_experience_path) + # self.path = Path(path).parent / (datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl") + # else: + # pass def add(self, document: Union[KGDocument, List[KGDocument]]): document.split_into_trunk() @@ -258,7 +261,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f kaggle_base.add_experience_to_vector_base() - kaggle_base.save() + kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl") print(f"There are {kaggle_base.shape()[0]} records in the vector base.") diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py index 81bd7a39..c19e8da7 100644 --- a/rdagent/scenarios/kaggle/proposal/proposal.py +++ b/rdagent/scenarios/kaggle/proposal/proposal.py @@ -94,7 +94,7 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]: self.initial_performance = 0.0 def generate_RAG_content(self, trace: Trace) -> str: - if trace.knowledge_base is None: + if self.scen.if_using_graph_rag is False or trace.knowledge_base is None: return None same_competition_node = trace.knowledge_base.get_node_by_content(trace.scen.get_competition_full_desc()) if same_competition_node is not None: