Skip to content

Commit

Permalink
Update tiktoken-rs to 0.6 and refactor token encoding
Browse files Browse the repository at this point in the history
- Updated the `tiktoken-rs` dependency version from 0.5 to 0.6 in `Cargo.toml`.
- Introduced a `Rank` typedef for `uint32_t` to improve code readability and consistency.
- Refactored encoding-related functions to use `Rank` instead of `size_t` for tokens.
- Adjusted relevant parts of the codebase, including `README.md`, `lib.rs`, `test.c`, and `tiktoken.h`, to reflect these changes.
  • Loading branch information
kojix2 committed Oct 14, 2024
1 parent 1f4f47b commit 6a52235
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 36 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ publish = false
crate-type = ["cdylib", "staticlib"]

[dependencies]
tiktoken-rs = "0.5"
tiktoken-rs = "0.6"
log = "0.4"
simple_logger = "4.3"

Expand Down
31 changes: 18 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ cargo build --release
Please refer to the [tiktoken-rs documentation](https://docs.rs/tiktoken-rs/).

```c
typedef void CoreBPE;
typedef uint32_t Rank;

typedef struct CFunctionCall {
const char *name;
const char *arguments;
Expand All @@ -37,6 +40,10 @@ typedef struct CChatCompletionRequestMessage {
const struct CFunctionCall *function_call;
} CChatCompletionRequestMessage;

const char *tiktoken_c_version(void);

void tiktoken_init_logger(void);

CoreBPE *tiktoken_r50k_base(void);

CoreBPE *tiktoken_p50k_base(void);
Expand All @@ -61,21 +68,19 @@ size_t tiktoken_get_chat_completion_max_tokens(const char *model,
uint32_t num_messages,
const struct CChatCompletionRequestMessage *messages);

size_t *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens);

size_t *tiktoken_corebpe_encode(CoreBPE *ptr,
const char *text,
const char *const *allowed_special,
size_t allowed_special_len,
size_t *num_tokens);
Rank *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens);

size_t *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr,
const char *text,
size_t *num_tokens);
Rank *tiktoken_corebpe_encode(CoreBPE *ptr,
const char *text,
const char *const *allowed_special,
size_t allowed_special_len,
size_t *num_tokens);

char *tiktoken_corebpe_decode(CoreBPE *ptr, const size_t *tokens, size_t num_tokens);
Rank *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr,
const char *text,
size_t *num_tokens);

const char *tiktoken_c_version(void);
char *tiktoken_corebpe_decode(CoreBPE *ptr, const Rank *tokens, size_t num_tokens);
```
## Language Bindings
Expand Down Expand Up @@ -107,7 +112,7 @@ cbindgen --config cbindgen.toml --crate tiktoken-c --output tiktoken.h
cbindgen does not support opaque pointers, so the missing typedefs must be added to the generated `tiktoken.h` manually.
```
perl -i -pe '$i ||= /#include/; $_ = "\ntypedef void CoreBPE;\n" if $i && /^$/ && !$f++; $i = 0 if /^$/ && $f' tiktoken.h
perl -i -pe '$i ||= /#include/; $_ = "\ntypedef void CoreBPE;\ntypedef uint32_t Rank;\n" if $i && /^$/ && !$f++; $i = 0 if /^$/ && $f' tiktoken.h
```
## License
Expand Down
23 changes: 14 additions & 9 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,14 @@ use log::warn;
use simple_logger::SimpleLogger;
use std::ffi::{c_char, CStr};
use tiktoken_rs;
use tiktoken_rs::CoreBPE;
use tiktoken_rs::{CoreBPE, Rank};

mod corebpe;
// use corebpe::{
// tiktoken_cl100k_base, tiktoken_destroy_corebpe, tiktoken_get_bpe_from_model,
// tiktoken_o200k_base, tiktoken_p50k_base, tiktoken_p50k_edit, tiktoken_r50k_base,
// };

mod utils;
use utils::c_str_to_string;

Expand Down Expand Up @@ -198,7 +203,7 @@ pub extern "C" fn tiktoken_corebpe_encode_ordinary(
ptr: *mut CoreBPE,
text: *const c_char,
num_tokens: *mut usize,
) -> *mut usize {
) -> *mut Rank {
if ptr.is_null() {
warn!("Null pointer provided for CoreBPE!");
return std::ptr::null_mut();
Expand All @@ -225,7 +230,7 @@ pub extern "C" fn tiktoken_corebpe_encode_ordinary(
}
};
let boxed = encoded.into_boxed_slice();
Box::into_raw(boxed) as *mut usize
Box::into_raw(boxed) as *mut Rank
}

// pub fn encode(&self, text: &str, allowed_special: HashSet<&str>) -> Vec<Rank>
Expand All @@ -236,7 +241,7 @@ pub extern "C" fn tiktoken_corebpe_encode(
allowed_special: *const *const c_char,
allowed_special_len: usize,
num_tokens: *mut usize,
) -> *mut usize {
) -> *mut Rank {
if ptr.is_null() {
warn!("Null pointer provided for CoreBPE!");
return std::ptr::null_mut();
Expand Down Expand Up @@ -283,15 +288,15 @@ pub extern "C" fn tiktoken_corebpe_encode(
}
};
let boxed = encoded.into_boxed_slice();
Box::into_raw(boxed) as *mut usize
Box::into_raw(boxed) as *mut Rank
}

#[no_mangle]
pub extern "C" fn tiktoken_corebpe_encode_with_special_tokens(
ptr: *mut CoreBPE,
text: *const c_char,
num_tokens: *mut usize,
) -> *mut usize {
) -> *mut Rank {
if ptr.is_null() {
warn!("Null pointer provided for CoreBPE!");
return std::ptr::null_mut();
Expand All @@ -318,13 +323,13 @@ pub extern "C" fn tiktoken_corebpe_encode_with_special_tokens(
}
};
let boxed = encoded.into_boxed_slice();
Box::into_raw(boxed) as *mut usize
Box::into_raw(boxed) as *mut Rank
}

#[no_mangle]
pub extern "C" fn tiktoken_corebpe_decode(
ptr: *mut CoreBPE,
tokens: *const usize,
tokens: *const Rank,
num_tokens: usize,
) -> *mut c_char {
if ptr.is_null() {
Expand All @@ -336,7 +341,7 @@ pub extern "C" fn tiktoken_corebpe_decode(
return std::ptr::null_mut();
}
let tokens = unsafe { std::slice::from_raw_parts(tokens, num_tokens) };
let tokens: Vec<usize> = tokens.iter().map(|&x| x as usize).collect();
let tokens = tokens.to_vec();

let corebpe = unsafe { &mut *ptr };
let decoded = corebpe.decode(tokens);
Expand Down
4 changes: 2 additions & 2 deletions test/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,11 @@ int main(int argc, char *argv[])

CoreBPE *bpe = tiktoken_get_bpe_from_model(model);
size_t n;
size_t *tokens = tiktoken_corebpe_encode_with_special_tokens(bpe, text, &n);
Rank *tokens = tiktoken_corebpe_encode_with_special_tokens(bpe, text, &n);

for (size_t i = 0; i < n; i++)
{
printf("%zu", tokens[i]);
printf("%u", tokens[i]);
if (i < n - 1)
{
printf(" ");
Expand Down
25 changes: 14 additions & 11 deletions tiktoken.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <stdlib.h>

typedef void CoreBPE;
typedef uint32_t Rank;

typedef struct CFunctionCall {
const char *name;
Expand All @@ -22,6 +23,8 @@ typedef struct CChatCompletionRequestMessage {
const struct CFunctionCall *function_call;
} CChatCompletionRequestMessage;

const char *tiktoken_c_version(void);

void tiktoken_init_logger(void);

CoreBPE *tiktoken_r50k_base(void);
Expand All @@ -48,18 +51,18 @@ size_t tiktoken_get_chat_completion_max_tokens(const char *model,
uint32_t num_messages,
const struct CChatCompletionRequestMessage *messages);

size_t *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens);
Rank *tiktoken_corebpe_encode_ordinary(CoreBPE *ptr, const char *text, size_t *num_tokens);

size_t *tiktoken_corebpe_encode(CoreBPE *ptr,
const char *text,
const char *const *allowed_special,
size_t allowed_special_len,
size_t *num_tokens);
Rank *tiktoken_corebpe_encode(CoreBPE *ptr,
const char *text,
const char *const *allowed_special,
size_t allowed_special_len,
size_t *num_tokens);

size_t *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr,
const char *text,
size_t *num_tokens);
Rank *tiktoken_corebpe_encode_with_special_tokens(CoreBPE *ptr,
const char *text,
size_t *num_tokens);

char *tiktoken_corebpe_decode(CoreBPE *ptr, const Rank *tokens, size_t num_tokens);

char *tiktoken_corebpe_decode(CoreBPE *ptr, const size_t *tokens, size_t num_tokens);

const char *tiktoken_c_version(void);

0 comments on commit 6a52235

Please sign in to comment.