
refractor #8

Merged · 17 commits · Feb 10, 2025
79 changes: 77 additions & 2 deletions README.md
<h1 align="center">Toke(n)icer</h1>
<p align="center">A (nicer) tokenizer you want to use for model `inference` and `training`: with all known peventable `gotchas` normalized or auto-fixed.</p>
<p align="center">
<a href="https://github.com/ModelCloud/Tokenicer/releases" style="text-decoration:none;"><img alt="GitHub release" src="https://img.shields.io/github/release/ModelCloud/Tokenicer.svg"></a>
<a href="https://pypi.org/project/tokenicer/" style="text-decoration:none;"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/tokenicer"></a>
<a href="https://pepy.tech/projects/tokenicer" style="text-decoration:none;"><img src="https://static.pepy.tech/badge/tokenicer" alt="PyPI Downloads"></a>
<a href="https://github.com/ModelCloud/tokenicer/blob/main/LICENSE"><img src="https://img.shields.io/pypi/l/tokenicer"></a>
<a href="https://huggingface.co/modelcloud/"><img src="https://img.shields.io/badge/🤗%20Hugging%20Face-ModelCloud-%23ff8811.svg"></a>
</p>

## News
* 02/10/2025 [0.0.1](https://github.com/ModelCloud/Tokenicer/releases/tag/v0.0.1): 🤗 Initial release!

## Features:

* Compatible with all `HF Transformers`-compatible tokenizers
* Auto-fixes `models` whose makers did not set, or forgot to set, `padding_token`
* Auto-fixes `models` that set the wrong `padding_token`: many `models` incorrectly use `eos_token` as `pad_token`, which leads to subtle, hidden errors in post-training and inference when `batching` is used, which is almost always (see the sketch below this list).
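
To see why reusing `eos_token` as `pad_token` is harmful, consider standard label masking in batched training. A minimal, illustrative sketch (not Tokenicer code; the `<|im_end|>` id is the assumed Qwen2.5 value, the other ids are arbitrary):

```py
eos_id = 151645  # assumed id of <|im_end|> in Qwen2.5-style vocabularies
pad_id = eos_id  # the common mistake: pad_token silently reuses eos_token

# One right-padded training sequence (non-special ids are arbitrary).
batch = [[101, 2023, 2003, eos_id, pad_id, pad_id]]

# Standard loss masking replaces pad positions with -100 (ignored by the loss)...
labels = [[-100 if t == pad_id else t for t in seq] for seq in batch]

print(labels)  # [[101, 2023, 2003, -100, -100, -100]]
# ...but the real EOS was masked together with the padding, so the model is
# never trained to emit <|im_end|> and may fail to stop at inference time.
```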

## Upcoming Features:

* Add `automatic` tokenizer validation to `model` `training` and subsequent `inference`, so that not only the tokenizer config but the actual `decode`/`encode` results are 100% re-validated on model load. `Inference` and `training` engines often modify the stock tokenizers, causing subtle and inaccurate output when `inference` is performed on a platform disjointed from the `trainer`. A sketch of one possible check follows.
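
One way such a check could work, as a hedged sketch only: the probe strings, the `fingerprint` helper, and the file name below are invented for illustration and are not Tokenicer's actual planned API.

```py
import json

# Probe strings chosen to exercise special tokens, whitespace, and non-ASCII.
PROBES = ["Hello world", "<|im_end|>", "  leading spaces", "emoji 🤗"]

def fingerprint(tokenizer) -> list:
    """Encode fixed probes; any drift in the ids means the tokenizer changed."""
    return [tokenizer.encode(p, add_special_tokens=False) for p in PROBES]

# At training time: persist the fingerprint next to the checkpoint.
#   json.dump(fingerprint(tokenizer), open("tokenizer_fingerprint.json", "w"))
# At inference load: re-validate before serving.
#   assert fingerprint(tokenizer) == json.load(open("tokenizer_fingerprint.json"))
```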

## Install

### PIP/UV

```bash
pip install -v tokenicer
uv pip install -v tokenicer
```

### Install from source

```bash
# clone repo
git clone https://github.com/ModelCloud/Tokenicer.git && cd Tokenicer

# compile
pip install -v . --no-build-isolation
```

## Usage

* Replace all calls to `AutoTokenizer.from_pretrained()` with `Tokenicer.load()`: args are 100% compatible with `AutoTokenizer`

```py
# Replace `AutoTokenizer.from_pretrained()`
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')

# With `Tokenicer.load()`
from tokenicer import Tokenicer
tokenizer = Tokenicer.load('Qwen/Qwen2.5-0.5B-Instruct')

# `pad_token` is auto-fixed during `load()`; call `auto_fix_pad_token()`
# only to override the result with custom candidates, e.g.:
# tokenizer.auto_fix_pad_token(pad_tokens=['<|fim_pad|>'])

eos_token = tokenizer.eos_token # <|im_end|>
pad_token = tokenizer.pad_token # <|fim_pad|>

text = "test string"
input_ids = tokenizer.encode(text, add_special_tokens=False) # [1944, 914]

```
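
With a distinct `pad_token` in place, batched encoding pads safely. A usage sketch, assuming the wrapped HF tokenizer (exposed as `.tokenizer`, as the tests in this PR use it) supports the usual padding arguments:

```py
batch = tokenizer.tokenizer(
    ["short", "a much longer test string"],
    padding=True,
    return_tensors="pt",
)
# Pad positions now carry the distinct pad token id instead of silently
# reusing eos, so attention masks and label masking stay correct.
print(batch["input_ids"])
print(batch["attention_mask"])
```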

## Citation

```
@misc{tokenicer,
  author = {ModelCloud.ai and qubitium@modelcloud.ai},
  title = {Toke(n)icer},
  year = {2025},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/modelcloud/tokenicer}},
  note = {Contact: qubitium@modelcloud.ai}
}
```
4 changes: 2 additions & 2 deletions setup.py
@@ -27,7 +27,7 @@
     version=__version__,
     author="ModelCloud",
     author_email="qubitium@modelcloud.ai",
-    description="A nicer tokenizer",
+    description="A (nicer) tokenizer you want to use for model `inference` and `training`: with all known preventable `gotchas` normalized or auto-fixed.",
     long_description=(Path(__file__).parent / "README.md").read_text(encoding="UTF-8"),
     long_description_content_type="text/markdown",
     url="https://github.com/ModelCloud/Tokenicer",
@@ -40,4 +40,4 @@
         "Operating System :: OS Independent",
     ],
     python_requires=">=3",
-)
+)
6 changes: 4 additions & 2 deletions tests/test_pad_token.py
@@ -43,8 +43,10 @@ def test_pad_token(self,
                        pad_tokens: Optional[List[Union[str, int]]] = None,
                        trust_remote: bool = False
                        ):
-        tokenicer = Tokenicer.load(tokenizer_or_path=tokenizer_or_path, trust_remote=trust_remote)
-        tokenicer.auto_assign_pad_token(pad_tokens=pad_tokens)
+        tokenicer = Tokenicer.load(tokenizer_or_path, trust_remote_code=trust_remote)
+
+        if pad_tokens is not None:
+            tokenicer.auto_fix_pad_token(pad_tokens=pad_tokens)

         self.assertEqual(
             tokenicer.tokenizer.pad_token,
19 changes: 17 additions & 2 deletions tests/test_tokenicer_forward.py
@@ -1,3 +1,19 @@
+# Copyright 2025 ModelCloud.ai
+# Copyright 2025 qubitium@modelcloud.ai
+# Contact: qubitium@modelcloud.ai, x.com/qubitium
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from tokenicer import Tokenicer
 from parameterized import parameterized
 import unittest
@@ -7,8 +23,7 @@ class TestTokenicer(unittest.TestCase):
     @classmethod
     def setUpClass(self):
         self.pretrained_model_id = "/monster/data/model/Qwen2.5-0.5B-Instruct/"
-        self.tokenizer = Tokenicer.load(tokenizer_or_path=self.pretrained_model_id)
-        self.tokenizer.auto_assign_pad_token()
+        self.tokenizer = Tokenicer.load(self.pretrained_model_id)
         self.example = 'Test Case String'
         self.expect_input_ids = [2271, 11538, 923]
43 changes: 23 additions & 20 deletions tokenicer/tokenicer.py
@@ -26,52 +26,55 @@

 class Tokenicer:
     tokenizer: Union[str, PreTrainedTokenizerBase] = None
-    trust_remote: bool = False
     model_config = None

     @classmethod
-    def load(cls, tokenizer_or_path: Union[str, PreTrainedTokenizerBase], trust_remote: bool = False):
-        if tokenizer_or_path is None:
-            raise ValueError("`tokenizer_or_path` cannot be `None`.")
+    def load(cls, pretrained_model_name_or_path: Union[str, PreTrainedTokenizerBase], **kwargs):
+        if pretrained_model_name_or_path is None:
+            raise ValueError("`pretrained_model_name_or_path` cannot be `None`.")
+
+        trust_remote_code = kwargs.get('trust_remote_code', False)
+
         tokenicer = cls()
-        tokenicer.trust_remote = trust_remote

         path = None
-        if isinstance(tokenizer_or_path, PreTrainedTokenizerBase):
-            tokenizer = tokenizer_or_path
+        if isinstance(pretrained_model_name_or_path, PreTrainedTokenizerBase):
+            tokenizer = pretrained_model_name_or_path
             tokenicer.tokenizer = tokenizer
             path = config_path(tokenizer)
-        elif isinstance(tokenizer_or_path, str):
-            tokenizer = AutoTokenizer.from_pretrained(tokenizer_or_path, trust_remote_code=trust_remote)
+        elif isinstance(pretrained_model_name_or_path, str):
+            tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
             if isinstance(tokenizer, PreTrainedTokenizerBase):
                 tokenicer.tokenizer = tokenizer
-                path = tokenizer_or_path
+                path = pretrained_model_name_or_path
             else:
                 ValueError(
-                    f"Failed to initialize `tokenizer`: please ensure that the `tokenizer_or_path` parameter is set correctly.")
+                    f"Failed to initialize `tokenizer`: please ensure that the `pretrained_model_name_or_path` parameter is set correctly.")
         else:
             raise ValueError(
-                f"Unsupported `tokenizer_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(tokenizer_or_path)}`.")
+                f"Unsupported `pretrained_model_name_or_path` type: Expected `str` or `PreTrainedTokenizerBase`, actual = `{type(pretrained_model_name_or_path)}`.")

-        tokenicer.model_config = auto_config(path, trust_remote)
+        tokenicer.model_config = auto_config(path, trust_remote_code)

         if tokenicer.model_config is None:
             logger.warning(
-                f"Auto model config retrieval from `tokenizer_or_path` failed. "
+                f"Auto model config retrieval from `pretrained_model_name_or_path` failed. "
                 f"Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.",
             )

+        tokenicer.auto_fix_pad_token()
+
         return tokenicer

-    def auto_assign_pad_token(
+    def auto_fix_pad_token(
         self,
         model_or_path: Optional[Union[str, PreTrainedModel]] = None,
         pad_tokens: Optional[List[Union[str, int]]] = None,
     ):
         model_config = None
         if model_or_path is not None:
             if isinstance(model_or_path, str):
-                model_config = auto_config(model_or_path, self.trust_remote)
+                model_config = auto_config(model_or_path, self.tokenizer.trust_remote_code)
             elif isinstance(model_or_path, PreTrainedModel):
                 model_config = getattr(model_or_path, "config", None)
             else:
@@ -85,7 +88,7 @@ def auto_assign_pad_token(
                 model_config = self.model_config
             else:
                 raise ValueError(
-                    f"Auto model config retrieval from `tokenizer_or_path` failed. "
+                    f"Auto model config retrieval from `pretrained_model_name_or_path` failed. "
                     f"Please pass a valid `model_or_path` argument to `auto_assign_pad_token()`.",
                 )

@@ -114,10 +117,10 @@ def _auto_map_pad_token(self, model_config, pad_tokens) -> Optional[int]:

         # Match MODEL_PAD_TOKEN_MAP to get pad token
         if pad_token_id is None and MODEL_PAD_TOKEN_MAP.get(model_config.model_type, None) is not None:
-            tuple = MODEL_PAD_TOKEN_MAP.get(model_config.model_type)
-            pad_token = tuple.token
+            token_tuple = MODEL_PAD_TOKEN_MAP.get(model_config.model_type)
+            pad_token = token_tuple.token
             token_id = vocab.get(pad_token, None)
-            if token_id is not None and token_id == tuple.token_id:
+            if token_id is not None and token_id == token_tuple.token_id:
                 pad_token_id = token_id

         # Match DEFAULT_PAD_TOKENS to get pad token
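
As a reading aid for the last hunk above (which also stops shadowing Python's built-in `tuple`): a hedged sketch of the id-verification guard it implements. The `MODEL_PAD_TOKEN_MAP` entry shape is inferred from the `.token`/`.token_id` access in the diff, and the Qwen2.5 entry is an assumption, not copied from the repo.

```py
from collections import namedtuple

TokenTuple = namedtuple("TokenTuple", ["token", "token_id"])

# Assumed shape of MODEL_PAD_TOKEN_MAP; the qwen2 entry is illustrative.
MODEL_PAD_TOKEN_MAP = {"qwen2": TokenTuple(token="<|fim_pad|>", token_id=151662)}

def match_mapped_pad_token(model_type: str, vocab: dict):
    """Return the mapped pad token id only if the live vocab agrees on the id."""
    token_tuple = MODEL_PAD_TOKEN_MAP.get(model_type)
    if token_tuple is None:
        return None
    token_id = vocab.get(token_tuple.token)
    # Both the token string and its expected id must match; this guards
    # against fine-tuned vocabs that reuse the string at a different id.
    return token_id if token_id == token_tuple.token_id else None
```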