Support references to tokenizers (#37)
benbrandt authored Aug 10, 2023
1 parent 1b36cac commit 2af8bd8
Showing 5 changed files with 51 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.4.3
+
+### What's New
+
+- Support `impl ChunkSizer` for `&Tokenizer` and `&CoreBPE`, allowing for generating chunks based off of a reference to a tokenizer as well, instead of requiring ownership.
+
 ## v0.4.2
 
 ### What's New
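
The changelog entry above, sketched as usage: a `TextSplitter` can now borrow a Hugging Face `Tokenizer` rather than taking ownership of it. This is a minimal sketch, not part of the commit; the tokenizer file name, sample text, and chunk size are assumptions for illustration.

use text_splitter::TextSplitter;
use tokenizers::Tokenizer;

fn main() {
    // Hypothetical tokenizer file; any byte-level Hugging Face tokenizer works.
    let tokenizer = Tokenizer::from_file("tokenizer.json").unwrap();

    // New in v0.4.3: pass a reference instead of moving the tokenizer.
    let splitter = TextSplitter::new(&tokenizer);
    let text = "Some document text to split.";
    let chunks = splitter.chunks(text, 100).collect::<Vec<_>>();
    assert_eq!(chunks.join(""), text);

    // The tokenizer is still usable afterwards because it was only borrowed.
    let _encoding = tokenizer.encode("still owned here", false).unwrap();
}
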
8 changes: 4 additions & 4 deletions Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "text-splitter"
-version = "0.4.2"
+version = "0.4.3"
 authors = ["Ben Brandt <benjamin.j.brandt@gmail.com>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens (when used with large language models)."
@@ -18,11 +18,11 @@ rustdoc-args = ["--cfg", "docsrs"]
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-auto_enums = "0.8.1"
-either = "1.8.1"
+auto_enums = "0.8.2"
+either = "1.9.0"
 itertools = "0.11.0"
 once_cell = "1.18.0"
-regex = "1.9.1"
+regex = "1.9.3"
 tiktoken-rs = { version = ">=0.2.0, <0.6.0", optional = true }
 tokenizers = { version = ">=0.13.3, <0.14.0", default_features = false, features = [
     "onig",
23 changes: 20 additions & 3 deletions src/huggingface.rs
Expand Up @@ -10,8 +10,25 @@ impl ChunkSizer for Tokenizer {
/// Will panic if you don't have a byte-level tokenizer and the splitter
/// encounters text it can't tokenize.
fn chunk_size(&self, chunk: &str) -> usize {
self.encode(chunk, false)
.map(|enc| enc.len())
.expect("Unable to tokenize the following string {str}")
chunk_size(self, chunk)
}
}

impl ChunkSizer for &Tokenizer {
/// Returns the number of tokens in a given text after tokenization.
///
/// # Panics
///
/// Will panic if you don't have a byte-level tokenizer and the splitter
/// encounters text it can't tokenize.
fn chunk_size(&self, chunk: &str) -> usize {
chunk_size(self, chunk)
}
}

fn chunk_size(tokenizer: &Tokenizer, chunk: &str) -> usize {
tokenizer
.encode(chunk, false)
.map(|enc| enc.len())
.expect("Unable to tokenize the following string {str}")
}
18 changes: 17 additions & 1 deletion src/tiktoken.rs
Expand Up @@ -10,6 +10,22 @@ impl ChunkSizer for CoreBPE {
/// Will panic if you don't have a byte-level tokenizer and the splitter
/// encounters text it can't tokenize.
fn chunk_size(&self, text: &str) -> usize {
self.encode_ordinary(text).len()
chunk_size(self, text)
}
}

impl ChunkSizer for &CoreBPE {
/// Returns the number of tokens in a given text after tokenization.
///
/// # Panics
///
/// Will panic if you don't have a byte-level tokenizer and the splitter
/// encounters text it can't tokenize.
fn chunk_size(&self, text: &str) -> usize {
chunk_size(self, text)
}
}

fn chunk_size(bpe: &CoreBPE, text: &str) -> usize {
bpe.encode_ordinary(text).len()
}
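
The tiktoken implementation mirrors the Hugging Face one. A minimal sketch of the equivalent usage with a borrowed `CoreBPE`; the `cl100k_base` encoding, sample text, and chunk size are assumptions for illustration, not part of the commit.

use text_splitter::TextSplitter;
use tiktoken_rs::cl100k_base;

fn main() {
    // cl100k_base is just one example of a CoreBPE; any encoding works the same way.
    let bpe = cl100k_base().unwrap();

    // Borrow the CoreBPE rather than moving it into the splitter.
    let splitter = TextSplitter::new(&bpe).with_trim_chunks(true);
    let text = "Some document text to split.";
    let chunks: Vec<_> = splitter.chunks(text, 50).collect();

    // `bpe` remains available for other token counting after splitting.
    let total_tokens = bpe.encode_ordinary(text).len();
    println!("{} chunks, {} tokens total", chunks.len(), total_tokens);
}
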
8 changes: 4 additions & 4 deletions tests/text_splitter_snapshots.rs
Expand Up @@ -85,7 +85,7 @@ fn huggingface_default() {
let text = fs::read_to_string(path).unwrap();

for chunk_size in [10, 100, 1000] {
let splitter = TextSplitter::new(HUGGINGFACE_TOKENIZER.clone());
let splitter = TextSplitter::new(&*HUGGINGFACE_TOKENIZER);
let chunks = splitter.chunks(&text, chunk_size).collect::<Vec<_>>();

assert_eq!(chunks.join(""), text);
Expand All @@ -103,7 +103,7 @@ fn huggingface_trim() {
let text = fs::read_to_string(path).unwrap();

for chunk_size in [10, 100, 1000] {
let splitter = TextSplitter::new(HUGGINGFACE_TOKENIZER.clone()).with_trim_chunks(true);
let splitter = TextSplitter::new(&*HUGGINGFACE_TOKENIZER).with_trim_chunks(true);
let chunks = splitter.chunks(&text, chunk_size).collect::<Vec<_>>();

for chunk in chunks.iter() {
Expand All @@ -122,7 +122,7 @@ fn tiktoken_default() {
let text = fs::read_to_string(path).unwrap();

for chunk_size in [10, 100, 1000] {
let splitter = TextSplitter::new(TIKTOKEN_TOKENIZER.clone());
let splitter = TextSplitter::new(&*TIKTOKEN_TOKENIZER);
let chunks = splitter.chunks(&text, chunk_size).collect::<Vec<_>>();

assert_eq!(chunks.join(""), text);
Expand All @@ -140,7 +140,7 @@ fn tiktoken_trim() {
let text = fs::read_to_string(path).unwrap();

for chunk_size in [10, 100, 1000] {
let splitter = TextSplitter::new(TIKTOKEN_TOKENIZER.clone()).with_trim_chunks(true);
let splitter = TextSplitter::new(&*TIKTOKEN_TOKENIZER).with_trim_chunks(true);
let chunks = splitter.chunks(&text, chunk_size).collect::<Vec<_>>();

for chunk in chunks.iter() {
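
The test changes above replace `.clone()` with `&*HUGGINGFACE_TOKENIZER` and `&*TIKTOKEN_TOKENIZER`: dereferencing the `once_cell::sync::Lazy` statics yields plain `&Tokenizer` and `&CoreBPE` references, which now implement `ChunkSizer`, so the shared tokenizers no longer need to be cloned for each chunk size. A minimal sketch of that pattern, assuming a hypothetical `tokenizer.json` file:

use once_cell::sync::Lazy;
use text_splitter::TextSplitter;
use tokenizers::Tokenizer;

// Shared tokenizer, initialized once and reused across tests.
static TOKENIZER: Lazy<Tokenizer> =
    Lazy::new(|| Tokenizer::from_file("tokenizer.json").unwrap());

#[test]
fn reuses_shared_tokenizer() {
    let text = "Some document text to split.";
    for chunk_size in [10, 100] {
        // `&*TOKENIZER` derefs the Lazy to a plain `&Tokenizer`; no clone needed.
        let splitter = TextSplitter::new(&*TOKENIZER);
        let chunks = splitter.chunks(text, chunk_size).collect::<Vec<_>>();
        assert_eq!(chunks.join(""), text);
    }
}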
