From 1c8ff61727c7302b17f301dde35a540ad09a6b52 Mon Sep 17 00:00:00 2001
From: Avi Lumelsky
Date: Wed, 21 Dec 2022 10:29:28 +0200
Subject: [PATCH] Added support for python before 3.9

---
 pyproject.toml       |  2 +-
 tiktoken/core.py     | 35 ++++++++++++++++++-----------------
 tiktoken/load.py     |  7 ++++---
 tiktoken/registry.py |  7 ++++---
 4 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0d4327b9..2698af91 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 name = "tiktoken"
 dependencies = ["blobfile>=2", "regex>=2022.1.18"]
 dynamic = ["version"]
-requires-python = ">=3.9"
+requires-python = ">=3.7"
 
 [build-system]
 build-backend = "setuptools.build_meta"
diff --git a/tiktoken/core.py b/tiktoken/core.py
index c566a520..dd65b13c 100644
--- a/tiktoken/core.py
+++ b/tiktoken/core.py
@@ -6,6 +6,7 @@
 
 from tiktoken import _tiktoken
+from typing import Dict, List, Tuple, Set, FrozenSet
 
 
 class Encoding:
     def __init__(
@@ -13,8 +14,8 @@ def __init__(
         name: str,
         *,
         pat_str: str,
-        mergeable_ranks: dict[bytes, int],
-        special_tokens: dict[str, int],
+        mergeable_ranks: Dict[bytes, int],
+        special_tokens: Dict[str, int],
         explicit_n_vocab: Optional[int] = None,
     ):
         self.name = name
@@ -39,7 +40,7 @@ def __repr__(self) -> str:
     # Encoding
     # ====================
 
-    def encode_ordinary(self, text: str) -> list[int]:
+    def encode_ordinary(self, text: str) -> List[int]:
         """Encodes a string into tokens, ignoring special tokens.
 
         This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).
@@ -56,7 +57,7 @@ def encode(
         *,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> list[int]:
+    ) -> List[int]:
         """Encodes a string into tokens.
 
         Special tokens are artificial tokens used to unlock capabilities from a model,
@@ -96,7 +97,7 @@ def encode(
 
         return self._core_bpe.encode(text, allowed_special)
 
-    def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:
+    def encode_ordinary_batch(self, text: List[str], *, num_threads: int = 8) -> List[List[int]]:
         """Encodes a list of strings into tokens, in parallel, ignoring special tokens.
 
         This is equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).
@@ -112,12 +113,12 @@ def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> lis
 
     def encode_batch(
         self,
-        text: list[str],
+        text: List[str],
         *,
         num_threads: int = 8,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> list[list[int]]:
+    ) -> List[List[int]]:
         """Encodes a list of strings into tokens, in parallel.
 
         See `encode` for more details on `allowed_special` and `disallowed_special`.
@@ -146,7 +147,7 @@ def encode_with_unstable(
         *,
         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> tuple[list[int], list[list[int]]]:
+    ) -> Tuple[List[int], List[List[int]]]:
         """Encodes a string into stable tokens and possible completion sequences.
 
         Note that the stable tokens will only represent a substring of `text`.
@@ -197,7 +198,7 @@ def encode_single_token(self, text_or_bytes: Union[str, bytes]) -> int:
     # Decoding
     # ====================
 
-    def decode_bytes(self, tokens: list[int]) -> bytes:
+    def decode_bytes(self, tokens: List[int]) -> bytes:
         """Decodes a list of tokens into bytes.
 
         ```
@@ -207,7 +208,7 @@ def decode_bytes(self, tokens: list[int]) -> bytes:
         """
         return self._core_bpe.decode_bytes(tokens)
 
-    def decode(self, tokens: list[int], errors: str = "replace") -> str:
+    def decode(self, tokens: List[int], errors: str = "replace") -> str:
         """Decodes a list of tokens into a string.
 
         WARNING: the default behaviour of this function is lossy, since decoded bytes are not
@@ -235,7 +236,7 @@ def decode_single_token_bytes(self, token: int) -> bytes:
         """
         return self._core_bpe.decode_single_token_bytes(token)
 
-    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
+    def decode_tokens_bytes(self, tokens: List[int]) -> List[bytes]:
         """Decodes a list of tokens into a list of bytes.
 
         Useful for visualising tokenisation.
@@ -248,7 +249,7 @@ def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:
     # Miscellaneous
     # ====================
 
-    def token_byte_values(self) -> list[bytes]:
+    def token_byte_values(self) -> List[bytes]:
         """Returns the list of all token byte values."""
         return self._core_bpe.token_byte_values()
 
@@ -257,7 +258,7 @@ def eot_token(self) -> int:
         return self._special_tokens["<|endoftext|>"]
 
     @functools.cached_property
-    def special_tokens_set(self) -> set[str]:
+    def special_tokens_set(self) -> Set[str]:
         return set(self._special_tokens.keys())
 
     @property
@@ -269,7 +270,7 @@ def n_vocab(self) -> int:
     # Private
     # ====================
 
-    def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
+    def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> List[int]:
         """Encodes text corresponding to bytes without a regex split.
 
         NOTE: this will not encode any special tokens.
@@ -283,7 +284,7 @@ def _encode_single_piece(self, text_or_bytes: Union[str, bytes]) -> list[int]:
             text_or_bytes = text_or_bytes.encode("utf-8")
         return self._core_bpe.encode_single_piece(text_or_bytes)
 
-    def _encode_only_native_bpe(self, text: str) -> list[str]:
+    def _encode_only_native_bpe(self, text: str) -> List[str]:
         """Encodes a string into tokens, but do regex splitting in Python."""
         _unused_pat = regex.compile(self._pat_str)
         ret = []
@@ -291,12 +292,12 @@ def _encode_only_native_bpe(self, text: str) -> list[str]:
             ret.extend(self._core_bpe.encode_single_piece(piece))
         return ret
 
-    def _encode_bytes(self, text: bytes) -> list[int]:
+    def _encode_bytes(self, text: bytes) -> List[int]:
         return self._core_bpe._encode_bytes(text)
 
 
 @functools.lru_cache(maxsize=128)
-def _special_token_regex(tokens: frozenset[str]) -> "regex.Pattern[str]":
+def _special_token_regex(tokens: FrozenSet[str]) -> "regex.Pattern[str]":
     inner = "|".join(regex.escape(token) for token in tokens)
     return regex.compile(f"({inner})")
 
diff --git a/tiktoken/load.py b/tiktoken/load.py
index 06e51cc3..e84db314 100644
--- a/tiktoken/load.py
+++ b/tiktoken/load.py
@@ -3,6 +3,7 @@
 import json
 import os
 import uuid
+from typing import Dict
 
 import blobfile
 
@@ -41,7 +42,7 @@ def read_file_cached(blobpath: str) -> bytes:
 
 def data_gym_to_mergeable_bpe_ranks(
     vocab_bpe_file: str, encoder_json_file: str
-) -> dict[bytes, int]:
+) -> Dict[bytes, int]:
     # NB: do not add caching to this function
     rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != " "]
 
@@ -82,13 +83,13 @@ def decode_data_gym(value: str) -> bytes:
     return bpe_ranks
 
 
-def dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:
+def dump_tiktoken_bpe(bpe_ranks: Dict[bytes, int], tiktoken_bpe_file: str) -> None:
     with blobfile.BlobFile(tiktoken_bpe_file, "wb") as f:
         for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):
             f.write(base64.b64encode(token) + b" " + str(rank).encode() + b"\n")
 
 
-def load_tiktoken_bpe(tiktoken_bpe_file: str) -> dict[bytes, int]:
+def load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
     # NB: do not add caching to this function
     contents = read_file_cached(tiktoken_bpe_file)
     return {
diff --git a/tiktoken/registry.py b/tiktoken/registry.py
index 24bb1737..ee390a1b 100644
--- a/tiktoken/registry.py
+++ b/tiktoken/registry.py
@@ -6,10 +6,11 @@
 import tiktoken_ext
 
 from tiktoken.core import Encoding
+from typing import Dict, List
 
 _lock = threading.RLock()
-ENCODINGS: dict[str, Encoding] = {}
-ENCODING_CONSTRUCTORS: Optional[dict[str, Callable[[], dict[str, Any]]]] = None
+ENCODINGS: Dict[str, Encoding] = {}
+ENCODING_CONSTRUCTORS: Optional[Dict[str, Callable[[], Dict[str, Any]]]] = None
 
 
 def _find_constructors() -> None:
@@ -63,7 +64,7 @@ def get_encoding(encoding_name: str) -> Encoding:
     return enc
 
 
-def list_encoding_names() -> list[str]:
+def list_encoding_names() -> List[str]:
     with _lock:
         if ENCODING_CONSTRUCTORS is None:
             _find_constructors()
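
The substantive change in this patch is mechanical: PEP 585 made the built-in containers usable as generics (`dict[bytes, int]`, `list[int]`) only in Python 3.9, so on 3.7/3.8 those expressions raise `TypeError` whenever they are evaluated at runtime, and the diff swaps in the equivalent `typing` aliases (`Dict`, `List`, `Tuple`, `Set`, `FrozenSet`) instead. Below is a minimal sketch of that pattern, not taken from tiktoken; the `Ranks` alias and `count_tokens` helper are illustrative only:

```python
from typing import Dict, List

# On Python 3.9+ you could write `dict[bytes, int]` directly (PEP 585).
# On 3.7/3.8, evaluating that expression raises
# "TypeError: 'type' object is not subscriptable",
# so the portable spelling uses the typing aliases instead.
Ranks = Dict[bytes, int]


def count_tokens(tokens: List[bytes]) -> Ranks:
    """Count occurrences of each byte token (toy example, not a tiktoken API)."""
    counts: Ranks = {}
    for tok in tokens:
        counts[tok] = counts.get(tok, 0) + 1
    return counts


print(count_tokens([b"ab", b"ab", b"cd"]))  # {b'ab': 2, b'cd': 1}
```

One caveat on the new `requires-python = ">=3.7"` bound: the unchanged context lines still use `Literal` and `functools.cached_property`, both of which first appeared in Python 3.8, so running on 3.7 would likely also need `typing_extensions` (for `Literal`) and a `cached_property` backport.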