Skip to content

Commit

Permalink
Fixing the last deviations from sentencepiece indicated by test-tokenizer-1
Browse files (browse the repository at this point in the history)
  • Loading branch information
goerch committed Sep 14, 2023
1 parent 16bf5f2 commit 64b0b74
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 9 deletions.
4 changes: 2 additions & 2 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -791,10 +791,10 @@ std::vector<llama_token> llama_tokenize(
// upper limit for the number of tokens
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
n_tokens = llama_tokenize(ctx, text.c_str(), text.length(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
int check = llama_tokenize(ctx, text.c_str(), text.length(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
Expand Down
6 changes: 4 additions & 2 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6202,19 +6202,21 @@ llama_token llama_token_nl(const struct llama_context * ctx) {
// Context-level tokenizer entry point: forwards all arguments to
// llama_tokenize_with_model() using the model stored in `ctx` and returns its result.
int llama_tokenize(
struct llama_context * ctx,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
// NOTE(review): this listing is a diff rendered without +/- markers. The first return
// below appears to be the removed pre-change call (it does not pass text_len); the
// second appears to be its replacement, which forwards text_len as well.
return llama_tokenize_with_model(&ctx->model, text, tokens, n_max_tokens, add_bos);
return llama_tokenize_with_model(&ctx->model, text, text_len, tokens, n_max_tokens, add_bos);
}

int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos) {
auto res = llama_tokenize_internal(model->vocab, text, add_bos);
auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos);

if (n_max_tokens < (int) res.size()) {
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
Expand Down
2 changes: 2 additions & 0 deletions llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -374,13 +374,15 @@ extern "C" {
// Tokenize `text` with the model belonging to `ctx`.
//   text         - input characters; together with text_len it is turned into
//                  std::string(text, text_len) by the implementation, so exactly
//                  text_len bytes are consumed
//   text_len     - number of bytes of `text` to tokenize
//   tokens       - caller-provided output buffer
//   n_max_tokens - capacity of `tokens`
//   add_bos      - passed through unchanged to the internal tokenizer
// NOTE(review): the common.cpp caller treats a negative return value as the negated
// number of tokens required and retries with a buffer of that size - confirm against
// the implementation.
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos);

// Same as llama_tokenize(), but takes the model directly instead of a context.
//   text / text_len - the implementation tokenizes std::string(text, text_len),
//                     i.e. exactly text_len bytes starting at `text`
//   tokens          - caller-provided output buffer
//   n_max_tokens    - capacity of `tokens`; the implementation compares it against
//                     the number of tokens produced
//   add_bos         - passed through unchanged to the internal tokenizer
// NOTE(review): the return convention for a too-small buffer is not visible in this
// declaration; callers appear to expect a negative count - confirm against the implementation.
LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model,
const char * text,
int text_len,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
Expand Down
10 changes: 5 additions & 5 deletions tests/test-tokenizer-1-llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,7 @@ int main(int argc, char **argv) {
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
if(i != 3)
return 2;
return 2;
}
}

Expand All @@ -100,10 +99,11 @@ int main(int argc, char **argv) {
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
if(cp != 0 && cp != 9601)
if(cp != 9601) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 3;
}
}
}
}
Expand Down

0 comments on commit 64b0b74

Please sign in to comment.