Update Action Graph Solver Version 0.1 #1428

Merged: 5 commits, Aug 1, 2024
Changes from 3 commits
1,319 changes: 1,319 additions & 0 deletions examples/ags/benchmark/data/gsm8k_main_test.jsonl

Large diffs are not rendered by default.

7,473 changes: 7,473 additions & 0 deletions examples/ags/benchmark/data/gsm8k_main_train.jsonl

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions examples/ags/benchmark/data/hotpot.json

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions examples/ags/benchmark/gsm8k.py
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Date :
# @Author : issac
# @Desc : test on gsm8k
4 changes: 4 additions & 0 deletions examples/ags/benchmark/hotpotQA.py
@@ -0,0 +1,4 @@
# -*- coding: utf-8 -*-
# @Date :
# @Author : issac
# @Desc : test on hotpotqa
165 changes: 165 additions & 0 deletions examples/ags/benchmark/humaneval.py
@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
# @Date : 7/7/2024 17:07 PM
# @Author : didi
# @Desc : test on human eval graph

import asyncio
import json
import os
import subprocess
import sys
from typing import Literal, Optional

import aiofiles
from evalplus.data import get_human_eval_plus

from examples.ags.w_action_node.graph import HumanEvalGraph
from examples.ags.w_action_node.operator import GenerateCode, GenerateCodeBlock
from examples.ags.w_action_node.utils import sort_json_by_task_id
from metagpt.llm import LLM
from metagpt.logs import logger
from metagpt.utils.common import add_jsonl_file, read_json_file
from metagpt.utils.exceptions import handle_exception

generate_code = GenerateCode(llm=LLM())
generate_code_block = GenerateCodeBlock(llm=LLM())
solver = HumanEvalGraph(name="solver", llm=LLM(), criteria="correctness, efficiency, readability", vote_count=5)

ModeType = Literal["ags", "alpha_codium", "llm"]


async def llm_generate(id):
case = get_human_eval_plus()[f"{id}"]
solution_result = await generate_code_block(case["prompt"], case["entry_point"])
sample_dict = dict(task_id=case["task_id"], solution=solution_result["code_solution"])
return sample_dict


async def ags_generate(id, ensemble_count: int = 5):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver(case["prompt"], ensemble_count=ensemble_count)
sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"])
return sample_dict


async def alpha_codium_generate(id):
case = get_human_eval_plus()[f"{id}"]
solution_result = await solver.alpha_codium(case["task_id"], case["prompt"], ensemble_count=5)
sample_dict = dict(task_id=case["task_id"], solution=solution_result["final_solution"])
return sample_dict


async def route_generate(mode: ModeType, id: str):
if mode == "ags":
sample_dict = await ags_generate(id)
elif mode == "alpha_codium":
sample_dict = await alpha_codium_generate(id)
elif mode == "llm":
sample_dict = await llm_generate(id)
else:
raise ValueError(f"Invalid mode: {mode}")
return sample_dict


async def sample_generate(id, result_path: str = "samples.jsonl", mode: ModeType = "ags"):
sample_dict = await route_generate(mode, id)
add_jsonl_file(result_path, [sample_dict])
sort_json_by_task_id(result_path, result_path)


async def samples_generate(mode: ModeType, result_path: str = "samples.jsonl"):
ids = list(get_human_eval_plus().keys())
file_lock = asyncio.Lock()
Owner: Suggestion: carry a timestamp (or similar info) with each result instead of using file_lock. (A hedged sketch of this idea follows after this file's diff.)


@handle_exception(
exception_type=Exception,
exception_msg="Error in solve_and_write function",
default_return=lambda id, *args, **kwargs: id,
)
async def solve_and_write(id: str, mode: ModeType) -> Optional[str]:
sample_dict = await route_generate(mode, id)
async with file_lock:
async with aiofiles.open(result_path, mode="a") as f:
await f.write(json.dumps(sample_dict) + "\n")
return None

tasks = [solve_and_write(id, mode) for id in ids]
results = await asyncio.gather(*tasks)
failed_tasks = [task_id for task_id in results if task_id is not None]

if failed_tasks:
logger.info(failed_tasks)
for task_id in failed_tasks[:]:  # iterate over a copy, since items are removed inside the loop
try:
await sample_generate(task_id, result_path, mode)
failed_tasks.remove(task_id)
except Exception:
logger.error(f"{task_id} fail")

sort_json_by_task_id(result_path, result_path)

if not failed_tasks:
if automatic_evalplus(result_path):
eval_path = result_path[:-6] + "_eval_results.json"
unpassed_examples = extract_failure_tests(eval_path)
logger.info(unpassed_examples)
else:
logger.info(failed_tasks)


@handle_exception(exception_type=subprocess.CalledProcessError, exception_msg="sanitize error", default_return=None)
def automatic_sanitize(result_path: str = "samples.jsonl") -> Optional[str]:
"""
在命令行中自动执行 evalplus.sanitize --samples result_path
返回result_path前缀加上"-sanitized.jsonl"
"""
command = ["evalplus.sanitize", "--samples", result_path]

subprocess.run(command, check=True)

base_name = os.path.splitext(result_path)[0]
sanitized_path = f"{base_name}-sanitized.jsonl"

return sanitized_path


@handle_exception(
exception_type=subprocess.CalledProcessError,
exception_msg="Error in automatic_evalplus function",
default_return=False,
)
def automatic_evalplus(result_path: str = "samples.jsonl") -> bool:
"""
在命令行中自动执行 evalplus.evaluate --dataset humaneval --samples samples.jsonl --parallel 2 --base-only
"""
command = [
sys.executable,  # use the current Python interpreter
"-m",
"evalplus.evaluate",
"--dataset",
"humaneval",
"--samples",
result_path,
"--parallel",
"2",
"--base-only",
]

result = subprocess.run(command, check=True, capture_output=True, text=True)
logger.info(f"ouptput: \n {result.stdout}")
return True


def extract_failure_tests(file_path: str = "samples_eval_results.json"):
task_results = read_json_file(file_path)

failed_tests = []
for task in task_results["eval"].values():
if task[0]["base_status"] == "fail":
failed_test = {
"task_id": task[0]["task_id"],
}
failed_tests.append(failed_test)
logger.info(f"length of failed tests: {len(failed_tests)}")

return failed_tests
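
Regarding the Owner's file_lock comment above: a minimal sketch of the suggested alternative, assuming each result simply carries a timestamp and is written to its own per-task file and merged afterwards. The names write_sample, merge_samples, and the samples_parts directory are hypothetical and not part of this PR.

```python
# Hedged sketch: tag each result with a timestamp and write it to its own file,
# so concurrent tasks never share a file handle and no asyncio.Lock is needed.
import glob
import json
import time
from pathlib import Path


def write_sample(sample_dict: dict, out_dir: str = "samples_parts") -> str:
    """Write one result to its own timestamped file; safe without any lock."""
    Path(out_dir).mkdir(exist_ok=True)
    record = {**sample_dict, "timestamp": time.time()}
    # one file per task_id, so concurrent writers never touch the same path
    part_path = Path(out_dir) / f"{record['task_id'].replace('/', '_')}.json"
    part_path.write_text(json.dumps(record))
    return str(part_path)


def merge_samples(out_dir: str = "samples_parts", result_path: str = "samples.jsonl") -> None:
    """Merge the per-task files into a single JSONL once all tasks have finished."""
    with open(result_path, "w") as f:
        for part in sorted(glob.glob(f"{out_dir}/*.json")):
            f.write(Path(part).read_text().strip() + "\n")
```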
128 changes: 128 additions & 0 deletions examples/ags/w_action_node/graph.py
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# @Date : 6/27/2024 22:07 PM
# @Author : didi
# @Desc : graph & an instance - humanevalgraph

from typing import List

from evalplus.data import get_human_eval_plus

from examples.ags.w_action_node.operator import (
FuEnsemble,
Generate,
GenerateCode,
GenerateCodeBlock,
MdEnsemble,
Rephrase,
Review,
Revise,
Test,
)
from examples.ags.w_action_node.utils import extract_test_cases_from_jsonl
from metagpt.llm import LLM


class Graph:
def __init__(self, name: str, llm: LLM) -> None:
self.name = name
self.model = llm

def __call__(self):
raise NotImplementedError("Subclasses must implement __call__ method")

def optimize(self, dataset: List):
pass


class HumanEvalGraph(Graph):
Collaborator: It seems Graph provides no real abstraction; should each {Dataset}Graph define the particular operators it needs? (A hedged sketch of one possible abstraction follows after this file's diff.)

def __init__(self, name: str, llm: LLM, criteria: str, vote_count: int = 5) -> None:
super().__init__(name, llm)
self.criteria = criteria  # TODO: when graphs are built automatically, the graph's init parameters must match the external parameters required by the operators it uses
self.generate_code = GenerateCode(llm=llm)
self.generate_code_block = GenerateCodeBlock(llm=llm)
self.review = Review(llm=llm, criteria=criteria)
self.revise = Revise(llm=llm)
self.rephrase = Rephrase(llm=llm)
self.tester = Test(llm=llm)
self.fuensemble = FuEnsemble(llm=llm)
self.mdensemble = MdEnsemble(llm=llm, vote_count=vote_count)

async def __call__(self, problem: str, ensemble_count: int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code_block(problem)
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.mdensemble("code", solution_list, problem)
return solution

async def alpha_codium(self, problem_id: str, problem: str, ensemble_count: int = 3):
"""
Paper: Code Generation with AlphaCodium: From Prompt Engineering to Flow Engineering
Link: https://arxiv.org/abs/2404.14963
Flow: An incomplete version of alpha codium, implementing the basic process of rephrase -> code ensemble -> tes
"""
test_cases = extract_test_cases_from_jsonl(problem_id)
entry_point = get_human_eval_plus()[problem_id]["entry_point"]
rephrase_problem = await self.rephrase(problem)  # the original problem description is concatenated inside rephrase
solution_list = []
for _ in range(ensemble_count):
Collaborator: This loop seems like it could be abstracted into a shared ensemble routine parameterized by the generation function, since it largely duplicates __call__. (See the hedged sketch after this class.)

solution = await self.generate_code_block.rephrase_generate(
problem, rephrase_problem, function_name=entry_point
)
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.mdensemble("code", solution_list, problem)
solution = await self.tester(problem_id, problem, rephrase_problem, solution, test_cases)
return solution

async def review_revise_ensemble(self, problem: str, ensemble_count: int = 2, revise_round: int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.single_solve(problem, revise_round)
solution_list.append(solution)
solution = await self.fuensemble(solution_list, problem)  # assumed fix: no self.ensemble is defined; fuensemble matches this call signature
return solution

async def simple_ensemble(self, problem: str, ensemble_count: int = 3):
solution_list = []
for _ in range(ensemble_count):
solution = await self.generate_code(problem)
# solution = await self.generate_code_block(problem)
solution = solution.get("code_solution")
solution_list.append(solution)
solution = await self.fuensemble(solution_list, problem)
return solution

async def single_solve(self, problem: str, max_loop: int):
solution = await self.generate_code(problem)
solution = solution.get("code_solution")
for _ in range(max_loop):
review_feedback = await self.review(problem, solution)
didiforgithub marked this conversation as resolved.
Show resolved Hide resolved
if review_feedback["review_result"]:
break
solution = await self.revise(problem, solution, review_feedback["feedback"])
solution = solution.get("revised_solution")
return solution
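
Following the Collaborator's comment above, a minimal sketch of how the repeated "generate N candidates, then vote" loop could be factored out, assuming a mixin that concrete graphs such as HumanEvalGraph inherit. EnsembleMixin and _ensemble_generate are hypothetical names, not part of this PR.

```python
from typing import Awaitable, Callable


class EnsembleMixin:
    """Hypothetical helper: the shared 'generate N candidates, then vote' loop."""

    async def _ensemble_generate(
        self,
        generate_fn: Callable[[], Awaitable[dict]],
        problem: str,
        ensemble_count: int,
    ) -> dict:
        solution_list = []
        for _ in range(ensemble_count):
            result = await generate_fn()
            solution_list.append(result.get("code_solution"))
        # mdensemble is assumed to be provided by the concrete graph (as in HumanEvalGraph)
        return await self.mdensemble("code", solution_list, problem)


# __call__ could then reduce to something like (sketch):
#     return await self._ensemble_generate(
#         lambda: self.generate_code_block(problem), problem, ensemble_count
#     )
# and alpha_codium's loop to a second call passing rephrase_generate as generate_fn.
```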


class Gsm8kGraph(Graph):
def __init__(self, name: str, llm: LLM) -> None:
super().__init__(name, llm)
self.generate = Generate(llm=llm)
self.rephrase = Rephrase(llm=llm)

async def __call__(self, problem: str):
solution = await self.generate(problem)
return solution


class HotpotQAGraph(Graph):
def __init__(self, name: str, llm: LLM) -> None:
super().__init__(name, llm)
self.generate = Generate(llm=llm)
self.rephrase = Rephrase(llm=llm)

async def __call__(self, problem: str):
solution = await self.generate(problem)
return solution
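
On the Collaborator's point that Graph carries little abstraction: one hedged reading is an abstract base that only fixes the interface, leaving each {Dataset}Graph to declare the operators it needs. AbstractGraph below is a hypothetical name and a sketch under that assumption, not the PR's design.

```python
from abc import ABC, abstractmethod
from typing import List


class AbstractGraph(ABC):
    """Hypothetical base class: fixes the solving interface, defers operators to subclasses."""

    def __init__(self, name: str, llm) -> None:
        self.name = name
        self.model = llm

    @abstractmethod
    async def __call__(self, problem: str, **kwargs):
        """Each {Dataset}Graph wires its own operators (generate, ensemble, ...) here."""

    def optimize(self, dataset: List):
        # placeholder, mirroring the PR's Graph.optimize stub
        pass
```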