Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for Python versions before 3.9 #14

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "tiktoken"
dependencies = ["blobfile>=2", "regex>=2022.1.18"]
dynamic = ["version"]
requires-python = ">=3.9"
requires-python = ">=3.7"

[build-system]
build-backend = "setuptools.build_meta"
Expand Down
35 changes: 18 additions & 17 deletions tiktoken/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@

from tiktoken import _tiktoken

from typing import Dict, List, Tuple, Set, FrozenSet
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please let me know which formatter you are using (Black?).
I'll sort the imports and reformat the code to match.


class Encoding:
def __init__(
self,
name: str,
*,
pat_str: str,
mergeable_ranks: dict[bytes, int],
special_tokens: dict[str, int],
mergeable_ranks: Dict[bytes, int],
special_tokens: Dict[str, int],
explicit_n_vocab: Optional[int] = None,
):
self.name = name
Expand All @@ -39,7 +40,7 @@ def __repr__(self) -> str:
# Encoding
# ====================

def encode_ordinary(self, text: str) -> list[int]:
def encode_ordinary(self, text: str) -> List[int]:
"""Encodes a string into tokens, ignoring special tokens.

This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
Expand All @@ -56,7 +57,7 @@ def encode(
*,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> list[int]:
) -> List[int]:
"""Encodes a string into tokens.

Special tokens are artificial tokens used to unlock capabilities from a model,
Expand Down Expand Up @@ -96,7 +97,7 @@ def encode(

return self._core_bpe.encode(text, allowed_special)

def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
def encode_ordinary_batch(self, text: List[str], *, num_threads: int = 8) -> List[List[int]]:
"""Encodes a list of strings into tokens, in parallel, ignoring special tokens.

This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
Expand All @@ -112,12 +113,12 @@ def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> lis

def encode_batch(
self,
text: list[str],
text: List[str],
*,
num_threads: int = 8,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> list[list[int]]:
) -> List[List[int]]:
"""Encodes a list of strings into tokens, in parallel.

See `encode` for more details on `allowed_special` and `disallowed_special`.
Expand Down Expand Up @@ -146,7 +147,7 @@ def encode_with_unstable(
*,
allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), # noqa: B006
disallowed_special: Union[Literal["all"], Collection[str]] = "all",
) -> tuple[list[int], list[list[int]]]:
) -> Tuple[List[int], List[List[int]]]:
"""Encodes a string into stable tokens and possible completion sequences.

Note that the stable tokens will only represent a substring of `text`.
Expand Down Expand Up @@ -197,7 +198,7 @@ def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
# Decoding
# ====================

def decode_bytes(self, tokens: list[int]) -> bytes:
def decode_bytes(self, tokens: List[int]) -> bytes:
"""Decodes a list of tokens into bytes.

```
Expand All @@ -207,7 +208,7 @@ def decode_bytes(self, tokens: list[int]) -> bytes:
"""
return self._core_bpe.decode_bytes(tokens)

def decode(self, tokens: list[int], errors: str = "replace") -> str:
def decode(self, tokens: List[int], errors: str = "replace") -> str:
"""Decodes a list of tokens into a string.

WARNING: the default behaviour of this function is lossy, since decoded bytes are not
Expand Down Expand Up @@ -235,7 +236,7 @@ def decode_single_token_bytes(self, token: int) -> bytes:
"""
return self._core_bpe.decode_single_token_bytes(token)

def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
def decode_tokens_bytes(self, tokens: List[int]) -> List[bytes]:
"""Decodes a list of tokens into a list of bytes.

Useful for visualising tokenisation.
Expand All @@ -248,7 +249,7 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
# Miscellaneous
# ====================

def token_byte_values(self) -> list[bytes]:
def token_byte_values(self) -> List[bytes]:
"""Returns the list of all token byte values."""
return self._core_bpe.token_byte_values()

Expand All @@ -257,7 +258,7 @@ def eot_token(self) -> int:
return self._special_tokens["<|endoftext|>"]

@functools.cached_property
def special_tokens_set(self) -> set[str]:
def special_tokens_set(self) -> Set[str]:
return set(self._special_tokens.keys())

@property
Expand All @@ -269,7 +270,7 @@ def n_vocab(self) -> int:
# Private
# ====================

def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> List[int]:
"""Encodes text corresponding to bytes without a regex split.

NOTE: this will not encode any special tokens.
Expand All @@ -283,20 +284,20 @@ def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
text_or_bytes = text_or_bytes.encode("utf-8")
return self._core_bpe.encode_single_piece(text_or_bytes)

def _encode_only_native_bpe(self, text: str) -> list[str]:
def _encode_only_native_bpe(self, text: str) -> List[str]:
"""Encodes a string into tokens, but do regex splitting in Python."""
_unused_pat = regex.compile(self._pat_str)
ret = []
for piece in regex.findall(_unused_pat, text):
ret.extend(self._core_bpe.encode_single_piece(piece))
return ret

def _encode_bytes(self, text: bytes) -> list[int]:
def _encode_bytes(self, text: bytes) -> List[int]:
return self._core_bpe._encode_bytes(text)


@functools.lru_cache(maxsize=128)
def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
def _special_token_regex(tokens: FrozenSet[str]) -> "regex.Pattern[str]":
inner = "|".join(regex.escape(token) for token in tokens)
return regex.compile(f"({inner})")

Expand Down
7 changes: 4 additions & 3 deletions tiktoken/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import os
import uuid
from typing import Dict

import blobfile

Expand Down Expand Up @@ -41,7 +42,7 @@ def read_file_cached(blobpath: str) -> bytes:

def data_gym_to_mergeable_bpe_ranks(
vocab_bpe_file: str, encoder_json_file: str
) -> dict[bytes, int]:
) -> Dict[bytes, int]:
# NB: do not add caching to this function
rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]

Expand Down Expand Up @@ -82,13 +83,13 @@ def decode_data_gym(value: str) -> bytes:
return bpe_ranks


def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
def dump_tiktoken_bpe(bpe_ranks: Dict[bytes, int], tiktoken_bpe_file: str) -> None:
with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")


def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
def load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
# NB: do not add caching to this function
contents = read_file_cached(tiktoken_bpe_file)
return {
Expand Down
7 changes: 4 additions & 3 deletions tiktoken/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@
import tiktoken_ext

from tiktoken.core import Encoding
from typing import Dict, List

_lock = threading.RLock()
ENCODINGS: dict[str, Encoding] = {}
ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
ENCODINGS: Dict[str, Encoding] = {}
ENCODING_CONSTRUCTORS: Optional[Dict[str, Callable[[], Dict[str, Any]]]] = None


def _find_constructors() -> None:
Expand Down Expand Up @@ -63,7 +64,7 @@ def get_encoding(encoding_name: str) -> Encoding:
return enc


def list_encoding_names() -> list[str]:
def list_encoding_names() -> List[str]:
with _lock:
if ENCODING_CONSTRUCTORS is None:
_find_constructors()
Expand Down