From 6a52235efcaf33ff8d7190131797fb85343e5066 Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Mon, 14 Oct 2024 14:43:51 +0900 Subject: [PATCH] Update `tiktoken-rs` to 0.6 and refactor token encoding - Updated the `tiktoken-rs` dependency version from 0.5 to 0.6 in `Cargo.toml`. - Introduced a `Rank` typedef for `uint32_t` to improve code readability and consistency. - Refactored encoding-related functions to use `Rank` instead of `size_t` for tokens. - Adjusted relevant parts of the codebase, including `README.md`, `lib.rs`, `test.c`, and `tiktoken.h` to reflect these changes. --- Cargo.toml | 2 +- README.md | 31 ++++++++++++++++++------------- src/lib.rs | 23 ++++++++++++++--------- test/test.c | 4 ++-- tiktoken.h | 25 ++++++++++++++----------- 5 files changed, 49 insertions(+), 36 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ff6e5d0..2c69af7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ publish = false crate-type = ["cdylib", "staticlib"] [dependencies] -tiktoken-rs = "0.5" +tiktoken-rs = "0.6" log = "0.4" simple_logger = "4.3" diff --git a/README.md b/README.md index 68cb6d3..2685c81 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,9 @@ cargo build --release Please refer to the [tiktoken-rs documentation](https://docs.rs/tiktoken-rs/). ```c +typedef void CoreBPE; +typedef uint32_t Rank; + typedef struct CFunctionCall { const char *name; const char *arguments; @@ -37,6 +40,10 @@ typedef struct CChatCompletionRequestMessage { const struct CFunctionCall *function_call; } CChatCompletionRequestMessage; +const char *tiktoken_c_version(void); + +void tiktoken_init_logger(void); + CoreBPE *tiktoken_r50k_base(void); CoreBPE *tiktoken_p50k_base(void); @@ -61,21 +68,19 @@ size_t tiktoken_get_chat_completion_max_tokens(const char *model, uint32_t num_messages, const struct CChatCompletionRequestMessage *messages); -size_t *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens); - -size_t *tiktoken_corebpe_encode(CoreBPE *ptr, - const char *text, - const char *const *allowed_special, - size_t allowed_special_len, - size_t *num_tokens); +Rank *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens); -size_t *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr, - const char *text, - size_t *num_tokens); +Rank *tiktoken_corebpe_encode(CoreBPE *ptr, + const char *text, + const char *const *allowed_special, + size_t allowed_special_len, + size_t *num_tokens); -char *tiktoken_corebpe_decode(CoreBPE *ptr, const size_t *tokens, size_t num_tokens); +Rank *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr, + const char *text, + size_t *num_tokens); -const char *tiktoken_c_version(void); +char *tiktoken_corebpe_decode(CoreBPE *ptr, const Rank *tokens, size_t num_tokens); ``` ## Language Bindings @@ -107,7 +112,7 @@ cbindgen --config cbindgen.toml --crate tiktoken-c --output tiktoken.h cbindgen does not support opaque pointers and must be added. ``` -perl -i -pe '$i ||= /#include/; $_ = "\ntypedef void CoreBPE;\n" if $i && /^$/ && !$f++; $i = 0 if /^$/ && $f' tiktoken.h +perl -i -pe '$i ||= /#include/; $_ = "\ntypedef void CoreBPE;\ntypedef uint32_t Rank;\n" if $i && /^$/ && !$f++; $i = 0 if /^$/ && $f' tiktoken.h ``` ## License diff --git a/src/lib.rs b/src/lib.rs index 030f287..9251ee4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,9 +2,14 @@ use log::warn; use simple_logger::SimpleLogger; use std::ffi::{c_char, CStr}; use tiktoken_rs; -use tiktoken_rs::CoreBPE; +use tiktoken_rs::{CoreBPE, Rank}; mod corebpe; +// use corebpe::{ +// tiktoken_cl100k_base, tiktoken_destroy_corebpe, tiktoken_get_bpe_from_model, +// tiktoken_o200k_base, tiktoken_p50k_base, tiktoken_p50k_edit, tiktoken_r50k_base, +// }; + mod utils; use utils::c_str_to_string; @@ -198,7 +203,7 @@ pub extern "C" fn tiktoken_corebpe_encode_ordinary( ptr: *mut CoreBPE, text: *const c_char, num_tokens: *mut usize, -) -> *mut usize { +) -> *mut Rank { if ptr.is_null() { warn!("Null pointer provided for CoreBPE!"); return std::ptr::null_mut(); @@ -225,7 +230,7 @@ pub extern "C" fn tiktoken_corebpe_encode_ordinary( } }; let boxed = encoded.into_boxed_slice(); - Box::into_raw(boxed) as *mut usize + Box::into_raw(boxed) as *mut Rank } // pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> Vec @@ -236,7 +241,7 @@ pub extern "C" fn tiktoken_corebpe_encode( allowed_special: *const *const c_char, allowed_special_len: usize, num_tokens: *mut usize, -) -> *mut usize { +) -> *mut Rank { if ptr.is_null() { warn!("Null pointer provided for CoreBPE!"); return std::ptr::null_mut(); @@ -283,7 +288,7 @@ pub extern "C" fn tiktoken_corebpe_encode( } }; let boxed = encoded.into_boxed_slice(); - Box::into_raw(boxed) as *mut usize + Box::into_raw(boxed) as *mut Rank } #[no_mangle] @@ -291,7 +296,7 @@ pub extern "C" fn tiktoken_corebpe_encode_with_special_tokens( ptr: *mut CoreBPE, text: *const c_char, num_tokens: *mut usize, -) -> *mut usize { +) -> *mut Rank { if ptr.is_null() { warn!("Null pointer provided for CoreBPE!"); return std::ptr::null_mut(); @@ -318,13 +323,13 @@ pub extern "C" fn tiktoken_corebpe_encode_with_special_tokens( } }; let boxed = encoded.into_boxed_slice(); - Box::into_raw(boxed) as *mut usize + Box::into_raw(boxed) as *mut Rank } #[no_mangle] pub extern "C" fn tiktoken_corebpe_decode( ptr: *mut CoreBPE, - tokens: *const usize, + tokens: *const Rank, num_tokens: usize, ) -> *mut c_char { if ptr.is_null() { @@ -336,7 +341,7 @@ pub extern "C" fn tiktoken_corebpe_decode( return std::ptr::null_mut(); } let tokens = unsafe { std::slice::from_raw_parts(tokens, num_tokens) }; - let tokens: Vec = tokens.iter().map(|&x| x as usize).collect(); + let tokens = tokens.to_vec(); let corebpe = unsafe { &mut *ptr }; let decoded = corebpe.decode(tokens); diff --git a/test/test.c b/test/test.c index 3efe450..bbaa31e 100644 --- a/test/test.c +++ b/test/test.c @@ -42,11 +42,11 @@ int main(int argc, char *argv[]) CoreBPE *bpe = tiktoken_get_bpe_from_model(model); size_t n; - size_t *tokens = tiktoken_corebpe_encode_with_special_tokens(bpe, text, &n); + Rank *tokens = tiktoken_corebpe_encode_with_special_tokens(bpe, text, &n); for (size_t i = 0; i < n; i++) { - printf("%zu", tokens[i]); + printf("%u", tokens[i]); if (i < n - 1) { printf(" "); diff --git a/tiktoken.h b/tiktoken.h index 969528f..38d0e6f 100644 --- a/tiktoken.h +++ b/tiktoken.h @@ -9,6 +9,7 @@ #include typedef void CoreBPE; +typedef uint32_t Rank; typedef struct CFunctionCall { const char *name; @@ -22,6 +23,8 @@ typedef struct CChatCompletionRequestMessage { const struct CFunctionCall *function_call; } CChatCompletionRequestMessage; +const char *tiktoken_c_version(void); + void tiktoken_init_logger(void); CoreBPE *tiktoken_r50k_base(void); @@ -48,18 +51,18 @@ size_t tiktoken_get_chat_completion_max_tokens(const char *model, uint32_t num_messages, const struct CChatCompletionRequestMessage *messages); -size_t *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens); +Rank *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens); -size_t *tiktoken_corebpe_encode(CoreBPE *ptr, - const char *text, - const char *const *allowed_special, - size_t allowed_special_len, - size_t *num_tokens); +Rank *tiktoken_corebpe_encode(CoreBPE *ptr, + const char *text, + const char *const *allowed_special, + size_t allowed_special_len, + size_t *num_tokens); -size_t *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr, - const char *text, - size_t *num_tokens); +Rank *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr, + const char *text, + size_t *num_tokens); + +char *tiktoken_corebpe_decode(CoreBPE *ptr, const Rank *tokens, size_t num_tokens); -char *tiktoken_corebpe_decode(CoreBPE *ptr, const size_t *tokens, size_t num_tokens); -const char *tiktoken_c_version(void);