feat: Added support for loading and storing RAG in Kaggle scenarios. (#269)

* init a scenario for kaggle feature engineering

* Added support for loading and storing RAG in Kaggle scenarios.

* fix a ci bug

* Add RAG after each experiment's feedback.

* add a prompt

* fix a bug

* fix a bug

* add a readme

* refine the code in knowledge loading
WinstonLiyt authored Sep 19, 2024
1 parent 0938394 commit c4895de
Showing 9 changed files with 140 additions and 13 deletions.
4 changes: 3 additions & 1 deletion rdagent/app/kaggle/conf.py
@@ -10,7 +10,7 @@ class Config:
env_prefix = "KG_"
"""Use `KG_` as prefix for environment variables"""
protected_namespaces = ()
"""Add 'model_' to the protected namespaces"""
"""Do not allow overriding of these namespaces"""

# 1) overriding the default
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGScenario"
@@ -42,5 +42,7 @@ class Config:

competition: str = ""

rag_path: str = "git_ignore_folder/rag"


KAGGLE_IMPLEMENT_SETTING = KaggleBasePropSetting()
7 changes: 7 additions & 0 deletions rdagent/app/kaggle/loop.py
@@ -17,6 +17,9 @@
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)
from rdagent.scenarios.kaggle.proposal.proposal import (
KG_ACTION_FEATURE_ENGINEERING,
KG_ACTION_FEATURE_PROCESSING,
@@ -29,6 +32,10 @@ def __init__(self, PROP_SETTING: BasePropSetting):
scen: Scenario = import_class(PROP_SETTING.scen)(PROP_SETTING.competition)
logger.log_object(scen, tag="scenario")

self.vector_base = KaggleExperienceBase()
if KAGGLE_IMPLEMENT_SETTING.rag_path:
self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)

self.hypothesis_gen: HypothesisGen = import_class(PROP_SETTING.hypothesis_gen)(scen)
logger.log_object(self.hypothesis_gen, tag="hypothesis generator")

20 changes: 17 additions & 3 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -14,9 +14,12 @@
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
extract_knowledge_from_feedback,
)
from rdagent.utils import convert2bool

feedback_prompts = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
DIRNAME = Path(__file__).absolute().resolve().parent


@@ -84,7 +87,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
# Generate the system prompt
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(feedback_prompts["factor_feedback_generation"]["system"])
.from_string(prompt_dict["factor_feedback_generation"]["system"])
.render(scenario=self.scen.get_scenario_all_desc())
)

@@ -97,7 +100,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
# Generate the user prompt
usr_prompt = (
Environment(undefined=StrictUndefined)
.from_string(feedback_prompts[prompt_key]["user"])
.from_string(prompt_dict[prompt_key]["user"])
.render(
hypothesis_text=hypothesis_text,
task_details=tasks_factors,
@@ -122,6 +125,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
reason = response_json.get("Reasoning", "No reasoning provided")
decision = convert2bool(response_json.get("Replace Best Result", "no"))

experiment_feedback = {
"hypothesis_text": hypothesis_text,
"current_result": current_result,
"tasks_factors": tasks_factors,
"observations": observations,
"hypothesis_evaluation": hypothesis_evaluation,
"reason": reason,
}

self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)

return HypothesisFeedback(
observations=observations,
hypothesis_evaluation=hypothesis_evaluation,
9 changes: 9 additions & 0 deletions rdagent/scenarios/kaggle/experiment/scenario.py
@@ -4,12 +4,16 @@
import pandas as pd
from jinja2 import Environment, StrictUndefined

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
from rdagent.core.prompts import Prompts
from rdagent.core.scenario import Scenario
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KGFactorExperiment
from rdagent.scenarios.kaggle.kaggle_crawler import crawl_descriptions
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)

prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")

@@ -32,6 +36,11 @@ def __init__(self, competition: str) -> None:

self._background = self.background

# all competitions are based on the same vector base
self.vector_base = KaggleExperienceBase()
if KAGGLE_IMPLEMENT_SETTING.rag_path:
self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)

def _analysis_competition_description(self):
sys_prompt = (
Environment(undefined=StrictUndefined)
8 changes: 8 additions & 0 deletions rdagent/scenarios/kaggle/knowledge_management/README.md
@@ -0,0 +1,8 @@
## Usage

This folder implements a RAG-based knowledge base built on Kaggle competitions.
It lets you store Kaggle competition experience in the knowledge base, as well as experimental experience from RD-Agent; a minimal end-to-end sketch follows the steps below.

1. First, generate a knowledge base (in JSON format) by running the `main` function in `extract_knowledge.py`.
2. Then, create a vector base in `vector_base.py` and save it.
3. Finally, add `KG_RAG_PATH="xxx.pkl"` (the path to the saved vector base) to your `.env` file.
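
A rough end-to-end sketch of these three steps is shown below. The directory and file paths are placeholders, and calling `process_all_case_files` directly stands in for running the `main` function of `extract_knowledge.py`; the class and function names themselves come from this commit.

```python
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
    process_all_case_files,
)
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
    KaggleExperienceBase,
)

# 1) Extract structured knowledge (JSON) from raw competition write-ups.
#    The directory is a placeholder; the function writes
#    kaggle_experience_results.json into that directory.
process_all_case_files("git_ignore_folder/experience/tabular_cases")

# 2) Build a vector base from the extracted JSON and persist it.
kaggle_base = KaggleExperienceBase()
kaggle_base.load_kaggle_experience(
    "git_ignore_folder/experience/tabular_cases/kaggle_experience_results.json"
)
kaggle_base.add_experience_to_vector_base()
kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")

# 3) Point RD-Agent at the saved base, e.g. in .env:
# KG_RAG_PATH="git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl"
```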
rdagent/scenarios/kaggle/knowledge_management/extract_knowledge.py
@@ -10,7 +10,7 @@
prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")


def process_with_gpt(content: str):
def extract_knowledge_from_high_score_answers(content: str):
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_prompts"]["system"])
@@ -37,6 +37,36 @@ def process_with_gpt(content: str):
return response_json_analysis


def extract_knowledge_from_feedback(feedback_response: dict) -> dict:
"""
Extracts knowledge from LLM-generated feedback and structures it.
"""
sys_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_from_feedback_prompts"]["system"])
.render()
)

user_prompt = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["extract_kaggle_knowledge_from_feedback_prompts"]["user"])
.render(experiment_strategy=feedback_response)
)

response_analysis = APIBackend().build_messages_and_create_chat_completion(
user_prompt=user_prompt,
system_prompt=sys_prompt,
json_mode=True,
)

try:
response_json_analysis = json.loads(response_analysis)
except json.JSONDecodeError:
response_json_analysis = {"error": "Failed to parse LLM's response as JSON"}

return response_json_analysis


def process_all_case_files(directory_path: str):
output_file = Path(directory_path) / "kaggle_experience_results.json"
json_output = []
@@ -46,8 +76,8 @@ def process_all_case_files(directory_path: str):

with open(file_path, "r", encoding="utf-8") as file:
content = file.read()
gpt_response = process_with_gpt(content)
json_output.append(gpt_response)
knowledge = extract_knowledge_from_high_score_answers(content)
json_output.append(knowledge)

with open(output_file, "w", encoding="utf-8") as json_file:
json.dump(json_output, json_file, ensure_ascii=False)
22 changes: 21 additions & 1 deletion rdagent/scenarios/kaggle/knowledge_management/prompts.yaml
@@ -16,4 +16,24 @@ extract_kaggle_knowledge_prompts:
}
user: |-
High-ranking Kaggle notebooks or competition strategies: {{ file_content }}
extract_kaggle_knowledge_from_feedback_prompts:
system: |-
You are a Kaggle competition expert with extensive experience in analyzing Kaggle notebooks and competition strategies.
Your task is to summarize or infer key information such as the competition name, task type, and specific techniques employed in the notebook or strategy.
For each provided content, you are expected to extract valuable insights and organize the analysis in the structured format outlined below.
Please provide the analysis in the following JSON format:
{
"content": "all provided content",
"title": "extracted title, if available",
"competition_name": "extracted competition name",
"task_category": "extracted task type, e.g., Classification, Regression",
"field": "field of focus, e.g., Feature Engineering, Modeling",
"ranking": "extracted ranking, if available",
"score": "extracted score or metric, if available"
}
user: |-
Experiment strategy: {{ experiment_strategy }}
31 changes: 29 additions & 2 deletions rdagent/scenarios/kaggle/knowledge_management/vector_base.py
@@ -12,6 +12,9 @@
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
extract_knowledge_from_feedback,
)


class KGKnowledgeMetaData(KnowledgeMetaData):
@@ -178,10 +181,32 @@ def load_kaggle_experience(self, kaggle_experience_path: Union[str, Path]):
logger.error(f"Kaggle experience data not found at {kaggle_experience_path}")
self.kaggle_experience_data = []

def add_experience_to_vector_base(self):
def add_experience_to_vector_base(self, experiment_feedback=None):
"""
Process the Kaggle experience data and add relevant information to the vector base
Process Kaggle experience data or experiment feedback and add relevant information to the vector base.
Args:
experiment_feedback (dict, optional): A dictionary containing experiment feedback.
If provided, this feedback will be processed and added to the vector base.
"""
# If experiment feedback is provided, extract relevant knowledge and add it to the vector base
if experiment_feedback:
extracted_knowledge = extract_knowledge_from_feedback(experiment_feedback)

document = KGKnowledgeMetaData(
content=experiment_feedback.get("hypothesis_text", ""),
label="Experiment Feedback",
competition_name="Experiment Result",
task_category=experiment_feedback.get("tasks_factors", "General Task"),
field="Research Feedback",
ranking=None,
score=experiment_feedback.get("current_result", None),
)
document.create_embedding()
self.add(document)
return

# Process Kaggle experience data
for experience in self.kaggle_experience_data:
content = experience.get("content", "")
label = experience.get("title", "Kaggle Experience")
@@ -238,6 +263,8 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f

kaggle_base.add_experience_to_vector_base()

kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")

print(f"There are {kaggle_base.shape()[0]} records in the vector base.")

search_results, similarities = kaggle_base.search_experience(query="image classification", topk_k=3)
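
To illustrate the feedback path this commit wires up, here is a hypothetical round trip: the feedback dict uses the same keys that `feedback.py` assembles, storing it goes through `extract_knowledge_from_feedback` (an LLM call, so a configured backend is required), and retrieval mirrors what `KGHypothesisGen.prepare_context` does in `proposal.py`. The values and the pickle path are illustrative only.

```python
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
    KaggleExperienceBase,
)

base = KaggleExperienceBase()

# Same keys as the experiment_feedback dict built in feedback.py; the values
# here are made up for illustration.
experiment_feedback = {
    "hypothesis_text": "Target-encoding high-cardinality categoricals improves AUC.",
    "current_result": 0.812,
    "tasks_factors": "Feature Engineering",
    "observations": "Validation AUC rose from 0.801 to 0.812.",
    "hypothesis_evaluation": "Supported on this competition's validation split.",
    "reason": "Target encoding captured signal the baseline one-hot features missed.",
}

# Distills the feedback via the LLM, embeds it, and stores it in the vector base.
base.add_experience_to_vector_base(experiment_feedback=experiment_feedback)
base.save("git_ignore_folder/rag/kaggle_vector_base.pkl")  # path is an assumption

# Later, the hypothesis generator retrieves similar past experience as RAG context.
results, _ = base.search_experience("categorical feature encoding", topk_k=5)
rag_content = "\n".join(doc.content for doc in results)
```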
16 changes: 13 additions & 3 deletions rdagent/scenarios/kaggle/proposal/proposal.py
@@ -4,8 +4,10 @@

from jinja2 import Environment, StrictUndefined

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.components.coder.factor_coder.factor import FactorTask
from rdagent.components.coder.model_coder.model import ModelExperiment, ModelTask
from rdagent.components.knowledge_management.vector_base import VectorBase
from rdagent.components.proposal.model_proposal import (
ModelHypothesis,
ModelHypothesis2Experiment,
@@ -17,6 +19,9 @@
KGFactorExperiment,
KGModelExperiment,
)
from rdagent.scenarios.kaggle.knowledge_management.vector_base import (
KaggleExperienceBase,
)

prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")

@@ -68,22 +73,27 @@ class KGHypothesisGen(ModelHypothesisGen):
.. code-block:: python
class XXXDMModelHypothesisGen(DMModelHypothesisGen):
class KGHypothesisGen(ModelHypothesisGen):
prompts: Prompts = a_specific_prompt_dict
"""

def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
def __init__(self, scen: Scenario, knowledge: VectorBase = None) -> Tuple[dict, bool]:
super().__init__(scen)
self.scen.vector_base.save(KAGGLE_IMPLEMENT_SETTING.rag_path)

def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
hypothesis_feedback = (
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["hypothesis_and_feedback"])
.render(trace=trace)
)

rag_results, _ = self.scen.vector_base.search_experience(hypothesis_feedback, topk_k=5)
rag_content = "\n".join([doc.content for doc in rag_results])

context_dict = {
"hypothesis_and_feedback": hypothesis_feedback,
"RAG": None,
"RAG": rag_content,
"hypothesis_output_format": prompt_dict["hypothesis_output_format"],
"hypothesis_specification": None,
}
