Skip to content

Commit

Permalink
anthropic[major]: support python 3.13 (#27916)
Browse files Browse the repository at this point in the history
Last week Anthropic released version 0.39.0 of its Python SDK, which
enabled support for Python 3.13. This release deleted a legacy
`client.count_tokens` method, which we currently access during init of
the `Anthropic` LLM. Anthropic has replaced this functionality with the
[client.beta.messages.count_tokens()
API](anthropics/anthropic-sdk-python#726).

To enable support for `anthropic >= 0.39.0` and Python 3.13, here we
drop support for the legacy token counting method, and add support for
the new method via `ChatAnthropic.get_num_tokens_from_messages`.

To fully support the token counting API, we update the signature of
`get_num_tokens_from_messages` to accept tools everywhere.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
  • Loading branch information
ccurme and baskaryan authored Nov 12, 2024
1 parent 759b6ed commit 1538ee1
Show file tree
Hide file tree
Showing 14 changed files with 534 additions and 542 deletions.
1 change: 0 additions & 1 deletion .github/scripts/check_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
PY_312_MAX_PACKAGES = [
f"libs/partners/{integration}"
for integration in [
"anthropic",
"chroma",
"couchbase",
"huggingface",
Expand Down
26 changes: 24 additions & 2 deletions libs/community/langchain_community/chat_models/anyscale.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,22 @@
import logging
import os
import sys
from typing import TYPE_CHECKING, Any, Dict, Optional, Set
import warnings
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Optional,
Sequence,
Set,
Type,
Union,
)

import requests
from langchain_core.messages import BaseMessage
from langchain_core.tools import BaseTool
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from pydantic import Field, SecretStr, model_validator

Expand Down Expand Up @@ -197,10 +209,20 @@ def _get_encoding_model(self) -> tuple[str, tiktoken.Encoding]:
encoding = tiktoken_.get_encoding(model)
return model, encoding

def get_num_tokens_from_messages(self, messages: list[BaseMessage]) -> int:
def get_num_tokens_from_messages(
self,
messages: list[BaseMessage],
tools: Optional[
Sequence[Union[Dict[str, Any], Type, Callable, BaseTool]]
] = None,
) -> int:
"""Calculate num tokens with tiktoken package.
Official documentation: https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
"""
if tools is not None:
warnings.warn(
"Counting tokens in tool schemas is not yet supported. Ignoring tools."
)
if sys.version_info[1] <= 7:
return super().get_num_tokens_from_messages(messages)
model, encoding = self._get_encoding_model()
Expand Down
26 changes: 24 additions & 2 deletions libs/community/langchain_community/chat_models/everlyai.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,21 @@

import logging
import sys
from typing import TYPE_CHECKING, Any, Dict, Optional, Set
import warnings
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Optional,
Sequence,
Set,
Type,
Union,
)

from langchain_core.messages import BaseMessage
from langchain_core.tools import BaseTool
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from pydantic import Field, model_validator

Expand Down Expand Up @@ -138,11 +150,21 @@ def _get_encoding_model(self) -> tuple[str, tiktoken.Encoding]:
encoding = tiktoken_.get_encoding(model)
return model, encoding

def get_num_tokens_from_messages(self, messages: list[BaseMessage]) -> int:
def get_num_tokens_from_messages(
self,
messages: list[BaseMessage],
tools: Optional[
Sequence[Union[Dict[str, Any], Type, Callable, BaseTool]]
] = None,
) -> int:
"""Calculate num tokens with tiktoken package.
Official documentation: https://github.com/openai/openai-cookbook/blob/
main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb"""
if tools is not None:
warnings.warn(
"Counting tokens in tool schemas is not yet supported. Ignoring tools."
)
if sys.version_info[1] <= 7:
return super().get_num_tokens_from_messages(messages)
model, encoding = self._get_encoding_model()
Expand Down
13 changes: 12 additions & 1 deletion libs/community/langchain_community/chat_models/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
)
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
from langchain_core.runnables import Runnable
from langchain_core.tools import BaseTool
from langchain_core.utils import (
get_from_dict_or_env,
get_pydantic_field_names,
Expand Down Expand Up @@ -644,11 +645,21 @@ def get_token_ids(self, text: str) -> List[int]:
_, encoding_model = self._get_encoding_model()
return encoding_model.encode(text)

def get_num_tokens_from_messages(self, messages: List[BaseMessage]) -> int:
def get_num_tokens_from_messages(
self,
messages: List[BaseMessage],
tools: Optional[
Sequence[Union[Dict[str, Any], Type, Callable, BaseTool]]
] = None,
) -> int:
"""Calculate num tokens for gpt-3.5-turbo and gpt-4 with tiktoken package.
Official documentation: https://github.com/openai/openai-cookbook/blob/
main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb"""
if tools is not None:
warnings.warn(
"Counting tokens in tool schemas is not yet supported. Ignoring tools."
)
if sys.version_info[1] <= 7:
return super().get_num_tokens_from_messages(messages)
model, encoding = self._get_encoding_model()
Expand Down
17 changes: 16 additions & 1 deletion libs/core/langchain_core/language_models/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from collections.abc import Mapping, Sequence
from functools import cache
Expand Down Expand Up @@ -364,17 +365,31 @@ def get_num_tokens(self, text: str) -> int:
"""
return len(self.get_token_ids(text))

def get_num_tokens_from_messages(self, messages: list[BaseMessage]) -> int:
def get_num_tokens_from_messages(
self,
messages: list[BaseMessage],
tools: Optional[Sequence] = None,
) -> int:
"""Get the number of tokens in the messages.
Useful for checking if an input fits in a model's context window.
**Note**: the base implementation of get_num_tokens_from_messages ignores
tool schemas.
Args:
messages: The message inputs to tokenize.
tools: If provided, sequence of dict, BaseModel, function, or BaseTools
to be converted to tool schemas.
Returns:
The sum of the number of tokens across the messages.
"""
if tools is not None:
warnings.warn(
"Counting tokens in tool schemas is not yet supported. Ignoring tools.",
stacklevel=2,
)
return sum([self.get_num_tokens(get_buffer_string([m])) for m in messages])

@classmethod
Expand Down
14 changes: 13 additions & 1 deletion libs/core/tests/unit_tests/messages/test_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import base64
import json
import typing
from collections.abc import Sequence
from typing import Any, Callable, Optional, Union

import pytest

Expand All @@ -19,6 +22,7 @@
merge_message_runs,
trim_messages,
)
from langchain_core.tools import BaseTool


@pytest.mark.parametrize("msg_cls", [HumanMessage, AIMessage, SystemMessage])
Expand Down Expand Up @@ -431,7 +435,15 @@ def dummy_token_counter(messages: list[BaseMessage]) -> int:


class FakeTokenCountingModel(FakeChatModel):
def get_num_tokens_from_messages(self, messages: list[BaseMessage]) -> int:
def get_num_tokens_from_messages(
self,
messages: list[BaseMessage],
tools: Optional[
Sequence[
Union[typing.Dict[str, Any], type, Callable, BaseTool] # noqa: UP006
]
] = None,
) -> int:
return dummy_token_counter(messages)


Expand Down
4 changes: 0 additions & 4 deletions libs/langchain/tests/unit_tests/chat_models/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import pytest
from langchain_core.language_models import BaseChatModel
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableConfig, RunnableSequence
from pydantic import SecretStr
Expand Down Expand Up @@ -180,9 +179,6 @@ def test_configurable_with_default() -> None:
)

assert model_with_config.model == "claude-3-sonnet-20240229" # type: ignore[attr-defined]
# Anthropic defaults to using `transformers` for token counting.
with pytest.raises(ImportError):
model_with_config.get_num_tokens_from_messages([(HumanMessage("foo"))]) # type: ignore[attr-defined]

assert model_with_config.model_dump() == { # type: ignore[attr-defined]
"name": None,
Expand Down
37 changes: 36 additions & 1 deletion libs/partners/anthropic/langchain_anthropic/chat_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
)

import anthropic
from langchain_core._api import deprecated
from langchain_core._api import beta, deprecated
from langchain_core.callbacks import (
AsyncCallbackManagerForLLMRun,
CallbackManagerForLLMRun,
Expand Down Expand Up @@ -1113,6 +1113,41 @@ class AnswerWithJustification(BaseModel):
else:
return llm | output_parser

@beta()
def get_num_tokens_from_messages(
self,
messages: List[BaseMessage],
tools: Optional[
Sequence[Union[Dict[str, Any], Type, Callable, BaseTool]]
] = None,
) -> int:
"""Count tokens in a sequence of input messages.
Args:
messages: The message inputs to tokenize.
tools: If provided, sequence of dict, BaseModel, function, or BaseTools
to be converted to tool schemas.
.. versionchanged:: 0.3.0
Uses Anthropic's token counting API to count tokens in messages. See:
https://docs.anthropic.com/en/docs/build-with-claude/token-counting
"""
formatted_system, formatted_messages = _format_messages(messages)
kwargs: Dict[str, Any] = {}
if isinstance(formatted_system, str):
kwargs["system"] = formatted_system
if tools:
kwargs["tools"] = [convert_to_anthropic_tool(tool) for tool in tools]

response = self._client.beta.messages.count_tokens(
betas=["token-counting-2024-11-01"],
model=self.model,
messages=formatted_messages, # type: ignore[arg-type]
**kwargs,
)
return response.input_tokens


class AnthropicTool(TypedDict):
"""Anthropic tool definition."""
Expand Down
9 changes: 5 additions & 4 deletions libs/partners/anthropic/langchain_anthropic/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ def validate_environment(self) -> Self:
)
self.HUMAN_PROMPT = anthropic.HUMAN_PROMPT
self.AI_PROMPT = anthropic.AI_PROMPT
self.count_tokens = self.client.count_tokens
return self

@property
Expand Down Expand Up @@ -375,9 +374,11 @@ async def _astream(

def get_num_tokens(self, text: str) -> int:
"""Calculate number of tokens."""
if not self.count_tokens:
raise NameError("Please ensure the anthropic package is loaded")
return self.count_tokens(text)
raise NotImplementedError(
"Anthropic's legacy count_tokens method was removed in anthropic 0.39.0 "
"and langchain-anthropic 0.3.0. Please use "
"ChatAnthropic.get_num_tokens_from_messages instead."
)


@deprecated(since="0.1.0", removal="0.3.0", alternative="AnthropicLLM")
Expand Down
Loading

0 comments on commit 1538ee1

Please sign in to comment.