From 1c8ff61727c7302b17f301dde35a540ad09a6b52 Mon Sep 17 00:00:00 2001
From: Avi Lumelsky
Date: Wed, 21 Dec 2022 10:29:28 +0200
Subject: [PATCH] Added support for python before 3.9

---
 pyproject.toml       |  2 +-
 tiktoken/core.py     | 35 ++++++++++++++++++-----------------
 tiktoken/load.py     |  7 ++++---
 tiktoken/registry.py |  7 ++++---
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0d4327b9..2698af91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "tiktoken"
 dependencies = ["blobfile>=2", "regex>=2022.1.18"]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.7"
 
 [build-system]
 build-backend = "setuptools.build_meta"
diff --git a/tiktoken/core.py b/tiktoken/core.py
index c566a520..dd65b13c 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -6,6 +6,7 @@
 
 from tiktoken import _tiktoken
+from typing import Dict, List, Tuple, Set, FrozenSet
 
 
 class Encoding:
     def __init__(
@@ -13,8 +14,8 @@ def __init__(
         name: str,
         *,
         pat_str: str,
-        mergeable_ranks: dict[bytes, int],
-        special_tokens: dict[str, int],
+        mergeable_ranks: Dict[bytes, int],
+        special_tokens: Dict[str, int],
         explicit_n_vocab: Optional[int] = None,
     ):
         self.name = name
@@ -39,7 +40,7 @@ def __repr__(self) -> str:
     # Encoding
     # ====================
 
-    def encode_ordinary(self, text: str) -> list[int]:
+    def encode_ordinary(self, text: str) -> List[int]:
         """Encodes a string into tokens, ignoring special tokens.
 
         This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
@@ -56,7 +57,7 @@ def encode(
         *,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> list[int]:
+    ) -> List[int]:
         """Encodes a string into tokens.
 
         Special tokens are artificial tokens used to unlock capabilities from a model,
@@ -96,7 +97,7 @@ def encode(
 
         return self._core_bpe.encode(text, allowed_special)
 
-    def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
+    def encode_ordinary_batch(self, text: List[str], *, num_threads: int = 8) -> List[List[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
 
         This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
@@ -112,12 +113,12 @@ def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> lis
 
     def encode_batch(
         self,
-        text: list[str],
+        text: List[str],
         *,
         num_threads: int = 8,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> list[list[int]]:
+    ) -> List[List[int]]:
         """Encodes a list of strings into tokens, in parallel.
 
         See `encode` for more details on `allowed_special` and `disallowed_special`.
@@ -146,7 +147,7 @@ def encode_with_unstable(
         *,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> tuple[list[int], list[list[int]]]:
+    ) -> Tuple[List[int], List[List[int]]]:
         """Encodes a string into stable tokens and possible completion sequences.
 
         Note that the stable tokens will only represent a substring of `text`.
@@ -197,7 +198,7 @@ def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
     # Decoding
     # ====================
 
-    def decode_bytes(self, tokens: list[int]) -> bytes:
+    def decode_bytes(self, tokens: List[int]) -> bytes:
         """Decodes a list of tokens into bytes.
 
         ```
@@ -207,7 +208,7 @@ def decode_bytes(self, tokens: list[int]) -> bytes:
         """
         return self._core_bpe.decode_bytes(tokens)
 
-    def decode(self, tokens: list[int], errors: str = "replace") -> str:
+    def decode(self, tokens: List[int], errors: str = "replace") -> str:
         """Decodes a list of tokens into a string.
 
         WARNING: the default behaviour of this function is lossy, since decoded bytes are not
@@ -235,7 +236,7 @@ def decode_single_token_bytes(self, token: int) -> bytes:
         """
         return self._core_bpe.decode_single_token_bytes(token)
 
-    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
+    def decode_tokens_bytes(self, tokens: List[int]) -> List[bytes]:
         """Decodes a list of tokens into a list of bytes.
 
         Useful for visualising tokenisation.
@@ -248,7 +249,7 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
     # Miscellaneous
     # ====================
 
-    def token_byte_values(self) -> list[bytes]:
+    def token_byte_values(self) -> List[bytes]:
         """Returns the list of all token byte values."""
         return self._core_bpe.token_byte_values()
 
@@ -257,7 +258,7 @@ def eot_token(self) -> int:
         return self._special_tokens["<|endoftext|>"]
 
     @functools.cached_property
-    def special_tokens_set(self) -> set[str]:
+    def special_tokens_set(self) -> Set[str]:
         return set(self._special_tokens.keys())
 
     @property
@@ -269,7 +270,7 @@ def n_vocab(self) -> int:
     # Private
     # ====================
 
-    def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
+    def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> List[int]:
         """Encodes text corresponding to bytes without a regex split.
 
         NOTE: this will not encode any special tokens.
@@ -283,7 +284,7 @@ def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
             text_or_bytes = text_or_bytes.encode("utf-8")
         return self._core_bpe.encode_single_piece(text_or_bytes)
 
-    def _encode_only_native_bpe(self, text: str) -> list[str]:
+    def _encode_only_native_bpe(self, text: str) -> List[str]:
         """Encodes a string into tokens, but do regex splitting in Python."""
         _unused_pat = regex.compile(self._pat_str)
         ret = []
@@ -291,12 +292,12 @@ def _encode_only_native_bpe(self, text: str) -> list[str]:
             ret.extend(self._core_bpe.encode_single_piece(piece))
         return ret
 
-    def _encode_bytes(self, text: bytes) -> list[int]:
+    def _encode_bytes(self, text: bytes) -> List[int]:
         return self._core_bpe._encode_bytes(text)
 
 
 @functools.lru_cache(maxsize=128)
-def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
+def _special_token_regex(tokens: FrozenSet[str]) -> "regex.Pattern[str]":
     inner = "|".join(regex.escape(token) for token in tokens)
     return regex.compile(f"({inner})")
 
diff --git a/tiktoken/load.py b/tiktoken/load.py
index 06e51cc3..e84db314 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -3,6 +3,7 @@
 import json
 import os
 import uuid
+from typing import Dict
 
 import blobfile
 
@@ -41,7 +42,7 @@ def read_file_cached(blobpath: str) -> bytes:
 
 def data_gym_to_mergeable_bpe_ranks(
     vocab_bpe_file: str, encoder_json_file: str
-) -> dict[bytes, int]:
+) -> Dict[bytes, int]:
     # NB: do not add caching to this function
     rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
 
@@ -82,13 +83,13 @@ def decode_data_gym(value: str) -> bytes:
     return bpe_ranks
 
 
-def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
+def dump_tiktoken_bpe(bpe_ranks: Dict[bytes, int], tiktoken_bpe_file: str) -> None:
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
             f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
 
 
-def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
+def load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
     # NB: do not add caching to this function
     contents = read_file_cached(tiktoken_bpe_file)
     return {
diff --git a/tiktoken/registry.py b/tiktoken/registry.py
index 24bb1737..ee390a1b 100644
--- a/tiktoken/registry.py
+++ b/tiktoken/registry.py
@@ -6,10 +6,11 @@
 import tiktoken_ext
 
 from tiktoken.core import Encoding
+from typing import Dict, List
 
 _lock = threading.RLock()
-ENCODINGS: dict[str, Encoding] = {}
-ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
+ENCODINGS: Dict[str, Encoding] = {}
+ENCODING_CONSTRUCTORS: Optional[Dict[str, Callable[[], Dict[str, Any]]]] = None
 
 
 def _find_constructors() -> None:
@@ -63,7 +64,7 @@ def get_encoding(encoding_name: str) -> Encoding:
     return enc
 
 
-def list_encoding_names() -> list[str]:
+def list_encoding_names() -> List[str]:
     with _lock:
         if ENCODING_CONSTRUCTORS is None:
             _find_constructors()
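
The substantive change in this patch is mechanical: PEP 585 made the built-in containers usable as generics (`dict[bytes, int]`, `list[int]`) only in Python 3.9, so on 3.7/3.8 those expressions raise `TypeError` whenever they are evaluated at runtime, and the diff swaps in the equivalent `typing` aliases (`Dict`, `List`, `Tuple`, `Set`, `FrozenSet`) instead. Below is a minimal sketch of that pattern, not taken from tiktoken; the `Ranks` alias and `count_tokens` helper are illustrative only:

```python
from typing import Dict, List

# On Python 3.9+ you could write `dict[bytes, int]` directly (PEP 585).
# On 3.7/3.8, evaluating that expression raises
# "TypeError: 'type' object is not subscriptable",
# so the portable spelling uses the typing aliases instead.
Ranks = Dict[bytes, int]


def count_tokens(tokens: List[bytes]) -> Ranks:
    """Count occurrences of each byte token (toy example, not a tiktoken API)."""
    counts: Ranks = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts


print(count_tokens([b"ab", b"ab", b"cd"]))  # {b'ab': 2, b'cd': 1}
```

One caveat on the new `requires-python = ">=3.7"` bound: the unchanged context lines still use `Literal` and `functools.cached_property`, both of which first appeared in Python 3.8, so running on 3.7 would likely also need `typing_extensions` (for `Literal`) and a `cached_property` backport.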