chore: get prediction for eval dataset (#414)
Add a function that gets a prediction for each query in the
golden_dataset. Predictions are compared against the golden answers to
compute metrics.

Usage example:

```
from evaluation import run_llm_for_eval, goldens

# set up orchestration, session, set uuid
eval_list = await run_llm_for_eval(goldens, orchestration, session, session_id)
```
Yuan325 committed Jul 26, 2024
1 parent 5a24a79 commit 5a112d8
Showing 3 changed files with 74 additions and 1 deletion.
18 changes: 18 additions & 0 deletions llm_demo/evaluation/__init__.py
@@ -0,0 +1,18 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .eval_golden import goldens
from .evaluation import run_llm_for_eval

__all__ = ["run_llm_for_eval", "goldens"]
2 changes: 1 addition & 1 deletion llm_demo/evaluation/eval_golden.py
@@ -32,7 +32,7 @@ class EvalData(BaseModel):
     )
     content: Optional[str] = Field(default=None)
     tool_calls: Optional[List[ToolCall]] = Field(default=None)
-    context: Optional[str] = Field(
+    context: Optional[List[Dict[str, Any] | List[Dict[str, Any]]]] = Field(
         default=None, description="context given to llm in order to answer user query"
     )
     output: Optional[str] = Field(default=None)
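For reference, a minimal sketch (not part of this commit) of how the widened `context` field might be populated: each entry is either a single dict or a list of dicts, mirroring per-step observations. The import path is assumed from the tree layout, and the query, tool name, and field values are invented for illustration.

```
# Hypothetical illustration of the new `context` shape; values are invented.
from evaluation.eval_golden import EvalData, ToolCall  # path assumed

sample = EvalData(
    query="List flights arriving at DEN",  # invented query
    tool_calls=[
        ToolCall(name="search_flights", arguments={"arrival_airport": "DEN"})
    ],
    context=[
        [{"flight_number": "CY 888"}, {"flight_number": "CY 123"}],  # a list of rows
        {"airport": "DEN"},  # or a single dict
    ],
    output="There are two flights arriving at DEN.",
)
```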
55 changes: 55 additions & 0 deletions llm_demo/evaluation/evaluation.py
@@ -0,0 +1,55 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List

from orchestrator import BaseOrchestrator

from .eval_golden import EvalData, ToolCall


async def run_llm_for_eval(
    eval_list: List[EvalData], orc: BaseOrchestrator, session: Dict, session_id: str
) -> List[EvalData]:
    """
    Generate prediction_tool_calls and prediction_output for each golden dataset query.
    """
    agent = orc.get_user_session(session_id)
    for eval_data in eval_list:
        try:
            query_response = await agent.invoke(eval_data.query)
        except Exception as e:
            print(f"error invoking agent: {e}")
        else:
            eval_data.prediction_output = query_response.get("output")

            # Retrieve prediction_tool_calls and contexts from query response
            prediction_tool_calls = []
            contexts = []
            for step in query_response.get("intermediate_steps"):
                called_tool = step[0]
                tool_call = ToolCall(
                    name=called_tool.tool,
                    arguments=called_tool.tool_input,
                )
                prediction_tool_calls.append(tool_call)
                context = step[-1]
                contexts.append(context)

            eval_data.prediction_tool_calls = prediction_tool_calls
            eval_data.context = contexts

        if eval_data.reset:
            orc.user_session_reset(session, session_id)
    return eval_list
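Since the loop above indexes into `intermediate_steps`, here is a hedged sketch of the response shape `run_llm_for_eval` assumes from `agent.invoke()`. The stub below is illustrative only, not the orchestrator's actual return type; the tool name and row values are invented.

```
# Illustrative stub of the awaited agent response; names are invented.
from types import SimpleNamespace

called_tool = SimpleNamespace(
    tool="search_flights",  # read as called_tool.tool
    tool_input={"arrival_airport": "DEN"},  # read as called_tool.tool_input
)
query_response = {
    "output": "There are two flights arriving at DEN.",
    # Each step pairs the tool invocation (step[0]) with its
    # observation (step[-1]), which becomes one context entry.
    "intermediate_steps": [
        (called_tool, [{"flight_number": "CY 888"}, {"flight_number": "CY 123"}]),
    ],
}
```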
