From 39bd7801152d5a64cef4c45aa11abcf2f5645778 Mon Sep 17 00:00:00 2001
From: sydney-runkle
Date: Tue, 17 Dec 2024 10:04:36 -0500
Subject: [PATCH 1/2] API

---
 pydantic_ai_slim/pydantic_ai/agent.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/pydantic_ai_slim/pydantic_ai/agent.py b/pydantic_ai_slim/pydantic_ai/agent.py
index 6e437c7..0a4c718 100644
--- a/pydantic_ai_slim/pydantic_ai/agent.py
+++ b/pydantic_ai_slim/pydantic_ai/agent.py
@@ -191,6 +191,8 @@ async def run(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
+        message_limit: int | None = None,
+        token_limit: int | None = None,
         infer_name: bool = True,
     ) -> result.RunResult[ResultData]:
         """Run the agent with a user prompt in async mode.
@@ -211,8 +213,10 @@ async def main():
             message_history: History of the conversation so far.
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
-            infer_name: Whether to try to infer the agent name from the call frame if it's not set.
             model_settings: Optional settings to use for this model's request.
+            message_limit: Optional limit on the number of messages in the conversation.
+            token_limit: Optional limit on the number of tokens used in the conversation.
+            infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
             The result of the run.
@@ -284,6 +288,8 @@ def run_sync(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
+        message_limit: int | None = None,
+        token_limit: int | None = None,
         infer_name: bool = True,
     ) -> result.RunResult[ResultData]:
         """Run the agent with a user prompt synchronously.
@@ -308,8 +314,10 @@ async def main():
             message_history: History of the conversation so far.
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
-            infer_name: Whether to try to infer the agent name from the call frame if it's not set.
             model_settings: Optional settings to use for this model's request.
+            message_limit: Optional limit on the number of messages in the conversation.
+            token_limit: Optional limit on the number of tokens used in the conversation.
+            infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
             The result of the run.
@@ -336,6 +344,8 @@ async def run_stream(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
+        message_limit: int | None = None,
+        token_limit: int | None = None,
         infer_name: bool = True,
     ) -> AsyncIterator[result.StreamedRunResult[AgentDeps, ResultData]]:
         """Run the agent with a user prompt in async mode, returning a streamed response.
@@ -357,8 +367,10 @@ async def main():
             message_history: History of the conversation so far.
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
-            infer_name: Whether to try to infer the agent name from the call frame if it's not set.
             model_settings: Optional settings to use for this model's request.
+            message_limit: Optional limit on the number of messages in the conversation.
+            token_limit: Optional limit on the number of tokens used in the conversation.
+            infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
             The result of the run.

From ccc9fd910b4d9e352b1e22a8bb3242f9925ae99c Mon Sep 17 00:00:00 2001
From: sydney-runkle
Date: Tue, 17 Dec 2024 14:20:52 -0500
Subject: [PATCH 2/2] initial API idea

---
 pydantic_ai_slim/pydantic_ai/agent.py    | 32 +++++++++----------
 pydantic_ai_slim/pydantic_ai/settings.py | 40 ++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/pydantic_ai_slim/pydantic_ai/agent.py b/pydantic_ai_slim/pydantic_ai/agent.py
index 0a4c718..5c9a683 100644
--- a/pydantic_ai_slim/pydantic_ai/agent.py
+++ b/pydantic_ai_slim/pydantic_ai/agent.py
@@ -22,7 +22,7 @@
     result,
 )
 from .result import ResultData
-from .settings import ModelSettings, merge_model_settings
+from .settings import ExecutionLimitSettings, ModelSettings, merge_model_settings
 from .tools import (
     AgentDeps,
     RunContext,
@@ -191,8 +191,7 @@ async def run(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
-        message_limit: int | None = None,
-        token_limit: int | None = None,
+        execution_limit_settings: ExecutionLimitSettings | None = None,
         infer_name: bool = True,
     ) -> result.RunResult[ResultData]:
         """Run the agent with a user prompt in async mode.
@@ -214,8 +213,7 @@ async def main():
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
             model_settings: Optional settings to use for this model's request.
-            message_limit: Optional limit on the number of messages in the conversation.
-            token_limit: Optional limit on the number of tokens used in the conversation.
+            execution_limit_settings: Optional settings used to limit model requests or cost (token usage).
             infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
@@ -242,8 +240,8 @@ async def main():
            tool.current_retry = 0

         cost = result.Cost()
-
         model_settings = merge_model_settings(self.model_settings, model_settings)
+        execution_limit_settings = execution_limit_settings or ExecutionLimitSettings(request_limit=50)

         run_step = 0
         while True:
@@ -258,6 +256,8 @@ async def main():

             messages.append(model_response)
             cost += request_cost
+            # TODO: is this the right location? Should we move this earlier in the logic?
+            execution_limit_settings.increment(request_cost)

             with _logfire.span('handle model response', run_step=run_step) as handle_span:
                 final_result, tool_responses = await self._handle_model_response(model_response, deps, messages)
@@ -288,8 +288,7 @@ def run_sync(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
-        message_limit: int | None = None,
-        token_limit: int | None = None,
+        execution_limit_settings: ExecutionLimitSettings | None = None,
         infer_name: bool = True,
     ) -> result.RunResult[ResultData]:
         """Run the agent with a user prompt synchronously.
@@ -315,8 +314,7 @@ async def main():
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
             model_settings: Optional settings to use for this model's request.
-            message_limit: Optional limit on the number of messages in the conversation.
-            token_limit: Optional limit on the number of tokens used in the conversation.
+            execution_limit_settings: Optional settings used to limit model requests or cost (token usage).
             infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
             The result of the run.
@@ -330,8 +328,9 @@ async def main():
                 message_history=message_history,
                 model=model,
                 deps=deps,
-                infer_name=False,
+                execution_limit_settings=execution_limit_settings,
                 model_settings=model_settings,
+                infer_name=False,
             )
         )

@@ -344,8 +343,7 @@ async def run_stream(
         model: models.Model | models.KnownModelName | None = None,
         deps: AgentDeps = None,
         model_settings: ModelSettings | None = None,
-        message_limit: int | None = None,
-        token_limit: int | None = None,
+        execution_limit_settings: ExecutionLimitSettings | None = None,
         infer_name: bool = True,
     ) -> AsyncIterator[result.StreamedRunResult[AgentDeps, ResultData]]:
         """Run the agent with a user prompt in async mode, returning a streamed response.
@@ -368,8 +366,7 @@ async def main():
             model: Optional model to use for this run, required if `model` was not set when creating the agent.
             deps: Optional dependencies to use for this run.
             model_settings: Optional settings to use for this model's request.
-            message_limit: Optional limit on the number of messages in the conversation.
-            token_limit: Optional limit on the number of tokens used in the conversation.
+            execution_limit_settings: Optional settings used to limit model requests or cost (token usage).
             infer_name: Whether to try to infer the agent name from the call frame if it's not set.

         Returns:
@@ -399,6 +396,7 @@ async def main():
         cost = result.Cost()

         model_settings = merge_model_settings(self.model_settings, model_settings)
+        execution_limit_settings = execution_limit_settings or ExecutionLimitSettings(request_limit=50)

         run_step = 0
         while True:
@@ -468,7 +466,9 @@ async def on_complete():
                     tool_responses_str = ' '.join(r.part_kind for r in tool_responses)
                     handle_span.message = f'handle model response -> {tool_responses_str}'
                     # the model_response should have been fully streamed by now, we can add it's cost
-                    cost += model_response.cost()
+                    model_response_cost = model_response.cost()
+                    execution_limit_settings.increment(model_response_cost)
+                    cost += model_response_cost

     @contextmanager
     def override(
diff --git a/pydantic_ai_slim/pydantic_ai/settings.py b/pydantic_ai_slim/pydantic_ai/settings.py
index d3e2d42..94ea6e5 100644
--- a/pydantic_ai_slim/pydantic_ai/settings.py
+++ b/pydantic_ai_slim/pydantic_ai/settings.py
@@ -1,8 +1,16 @@
 from __future__ import annotations

+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+
 from httpx import Timeout
 from typing_extensions import TypedDict

+from .exceptions import UnexpectedModelBehavior
+
+if TYPE_CHECKING:
+    from .result import Cost
+

 class ModelSettings(TypedDict, total=False):
     """Settings to configure an LLM.
@@ -70,3 +78,35 @@ def merge_model_settings(base: ModelSettings | None, overrides: ModelSettings |
         return base | overrides
     else:
         return base or overrides
+
+
+@dataclass
+class ExecutionLimitSettings:
+    """Limits on the number of model requests and on token usage for an agent run."""
+
+    request_limit: int | None = None
+    request_tokens_limit: int | None = None
+    response_tokens_limit: int | None = None
+    total_tokens_limit: int | None = None
+
+    _request_count: int = field(default=0, init=False, repr=False)
+    _request_tokens_count: int = field(default=0, init=False, repr=False)
+    _response_tokens_count: int = field(default=0, init=False, repr=False)
+    _total_tokens_count: int = field(default=0, init=False, repr=False)
+
+    def increment(self, cost: Cost) -> None:
+        self._request_count += 1
+        self._check_limit(self.request_limit, self._request_count, 'request count')
+
+        self._request_tokens_count += cost.request_tokens or 0
+        self._check_limit(self.request_tokens_limit, self._request_tokens_count, 'request tokens count')
+
+        self._response_tokens_count += cost.response_tokens or 0
+        self._check_limit(self.response_tokens_limit, self._response_tokens_count, 'response tokens count')
+
+        self._total_tokens_count += cost.total_tokens or 0
+        self._check_limit(self.total_tokens_limit, self._total_tokens_count, 'total tokens count')
+
+    def _check_limit(self, limit: int | None, count: int, limit_name: str) -> None:
+        if limit is not None and limit < count:
+            raise UnexpectedModelBehavior(f'Exceeded {limit_name} limit of {limit} by {count - limit}')
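
A minimal sketch of how `ExecutionLimitSettings` would behave on its own, assuming the patches above are applied; the `Cost` fields (`request_tokens`, `response_tokens`, `total_tokens`) are taken from the `increment()` implementation in the patch, and the import paths mirror the patched modules:

```python
from pydantic_ai.exceptions import UnexpectedModelBehavior
from pydantic_ai.result import Cost
from pydantic_ai.settings import ExecutionLimitSettings

# Allow at most two model requests and 100 total tokens for a single run.
limits = ExecutionLimitSettings(request_limit=2, total_tokens_limit=100)

# Each call to increment() records one model request plus its token cost,
# then checks every configured limit.
limits.increment(Cost(request_tokens=30, response_tokens=20, total_tokens=50))
limits.increment(Cost(request_tokens=30, response_tokens=20, total_tokens=50))

# A third request exceeds both limits; the request count is checked first,
# so increment() raises before the token counters are compared.
try:
    limits.increment(Cost(request_tokens=30, response_tokens=20, total_tokens=50))
except UnexpectedModelBehavior as exc:
    print(exc)  # Exceeded request count limit of 2 by 1
```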
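At the agent level, the settings thread through the new keyword argument on `run`, `run_sync` and `run_stream`. A hypothetical `run_sync` call (the model name and prompt are placeholders, not part of the patch):

```python
from pydantic_ai import Agent
from pydantic_ai.settings import ExecutionLimitSettings

agent = Agent('openai:gpt-4o')

# Cap this run at 3 model requests and 2,000 total tokens; exceeding either
# limit raises UnexpectedModelBehavior rather than looping indefinitely.
result = agent.run_sync(
    'What is the capital of France?',
    execution_limit_settings=ExecutionLimitSettings(request_limit=3, total_tokens_limit=2_000),
)
print(result.data)
```

If no settings are passed, `run` and `run_stream` fall back to `ExecutionLimitSettings(request_limit=50)`, so runaway tool-call loops are bounded by default.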