chore: get prediction for eval dataset (#414)
Add a function that gets a prediction for each query in the
golden_dataset. Predictions are compared against the golden answers to
compute metrics.

Usage example:

```
from evaluation import run_llm_for_eval, goldens

# set up orchestration, session, set uuid
eval_list = await run_llm_for_eval(goldens, orchestration, session, session_id)
```
Yuan325 committed Jul 26, 2024
1 parent 5a24a79 commit 5a112d8
Showing 3 changed files with 74 additions and 1 deletion.
18 changes: 18 additions & 0 deletions llm_demo/evaluation/__init__.py
@@ -0,0 +1,18 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .eval_golden import goldens
from .evaluation import run_llm_for_eval

__all__ = ["run_llm_for_eval", "goldens"]
2 changes: 1 addition & 1 deletion llm_demo/evaluation/eval_golden.py
@@ -32,7 +32,7 @@ class EvalData(BaseModel):
     )
     content: Optional[str] = Field(default=None)
     tool_calls: Optional[List[ToolCall]] = Field(default=None)
-    context: Optional[str] = Field(
+    context: Optional[List[Dict[str, Any] | List[Dict[str, Any]]]] = Field(
         default=None, description="context given to llm in order to answer user query"
     )
     output: Optional[str] = Field(default=None)
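For reference, a minimal sketch (not part of this commit) of how the widened `context` field might be populated: each entry is either a single dict or a list of dicts, mirroring per-step observations. The import path is assumed from the tree layout, and the query, tool name, and field values are invented for illustration.

```
# Hypothetical illustration of the new `context` shape; values are invented.
from evaluation.eval_golden import EvalData, ToolCall  # path assumed

sample = EvalData(
    query="List flights arriving at DEN",  # invented query
    tool_calls=[
        ToolCall(name="search_flights", arguments={"arrival_airport": "DEN"})
    ],
    context=[
        [{"flight_number": "CY 888"}, {"flight_number": "CY 123"}],  # a list of rows
        {"airport": "DEN"},  # or a single dict
    ],
    output="There are two flights arriving at DEN.",
)
```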
55 changes: 55 additions & 0 deletions llm_demo/evaluation/evaluation.py
@@ -0,0 +1,55 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List

from orchestrator import BaseOrchestrator

from .eval_golden import EvalData, ToolCall


async def run_llm_for_eval(
    eval_list: List[EvalData], orc: BaseOrchestrator, session: Dict, session_id: str
) -> List[EvalData]:
    """
    Generate prediction_tool_calls and prediction_output for each golden dataset query.
    """
    agent = orc.get_user_session(session_id)
    for eval_data in eval_list:
        try:
            query_response = await agent.invoke(eval_data.query)
        except Exception as e:
            print(f"error invoking agent: {e}")
        else:
            eval_data.prediction_output = query_response.get("output")

            # Retrieve prediction_tool_calls and contexts from query response
            prediction_tool_calls = []
            contexts = []
            for step in query_response.get("intermediate_steps"):
                called_tool = step[0]
                tool_call = ToolCall(
                    name=called_tool.tool,
                    arguments=called_tool.tool_input,
                )
                prediction_tool_calls.append(tool_call)
                context = step[-1]
                contexts.append(context)

            eval_data.prediction_tool_calls = prediction_tool_calls
            eval_data.context = contexts

        if eval_data.reset:
            orc.user_session_reset(session, session_id)
    return eval_list
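Since the loop above indexes into `intermediate_steps`, here is a hedged sketch of the response shape `run_llm_for_eval` assumes from `agent.invoke()`. The stub below is illustrative only, not the orchestrator's actual return type; the tool name and row values are invented.

```
# Illustrative stub of the awaited agent response; names are invented.
from types import SimpleNamespace

called_tool = SimpleNamespace(
    tool="search_flights",  # read as called_tool.tool
    tool_input={"arrival_airport": "DEN"},  # read as called_tool.tool_input
)
query_response = {
    "output": "There are two flights arriving at DEN.",
    # Each step pairs the tool invocation (step[0]) with its
    # observation (step[-1]), which becomes one context entry.
    "intermediate_steps": [
        (called_tool, [{"flight_number": "CY 888"}, {"flight_number": "CY 123"}]),
    ],
}
```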
