From 6186e591ad662ee813f1a53507f103bc8e82e3bb Mon Sep 17 00:00:00 2001 From: goerch Date: Tue, 10 Oct 2023 07:18:39 +0200 Subject: [PATCH] Fixing two minor bugs in `bpe_gpt2_preprocess` --- llama.cpp | 9 ++++----- tests/test-tokenizer-0-falcon.cpp | 6 +++--- tests/test-tokenizer-0-falcon.py | 10 ++++++---- tests/test-tokenizer-0-llama.cpp | 4 +--- tests/test-tokenizer-0-llama.py | 7 +++---- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/llama.cpp b/llama.cpp index 08d6c162a5d7ce..0814ca4d230263 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5007,7 +5007,6 @@ struct llm_tokenizer_bpe { for (int i = 0; i < (int)text_utf.size(); i++) { const std::string & utf_char = text_utf[i]; bool split_condition = false; - // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes; int bytes_remain = text_utf.size() - i; // forward backward lookups const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; @@ -5033,9 +5032,9 @@ struct llm_tokenizer_bpe { if (!split_condition && bytes_remain >= 3) { // 're|'ve|'ll if (utf_char == "\'" && ( - (utf_char_next == "r" || utf_char_next_next == "e") || - (utf_char_next == "v" || utf_char_next_next == "e") || - (utf_char_next == "l" || utf_char_next_next == "l")) + (utf_char_next == "r" && utf_char_next_next == "e") || + (utf_char_next == "v" && utf_char_next_next == "e") || + (utf_char_next == "l" && utf_char_next_next == "l")) ) { split_condition = true; } @@ -5086,7 +5085,7 @@ struct llm_tokenizer_bpe { else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { split_condition = true; } - else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { + else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { split_condition = true; } } diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 0f3c50bce8ae9d..25038401cd4689 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -36,6 +36,8 @@ static const std::map> & k_tests() { { " Hello" , { 258, 23090, }, }, { " Hello" , { 466, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + { "\n =" , { 1212, 40, }, }, + { "' era" , { 18, 4932, }, }, }; return _k_tests; @@ -169,10 +171,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 9c8c1c7d1d3ca4..0dfd4ae0957051 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -16,6 +16,7 @@ tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) tests = [ + "1500", "", " ", " ", @@ -41,6 +42,8 @@ " Hello", " Hello", " Hello\n Hello", + "\n =", + "' era", ] for text in tests: @@ -69,15 +72,14 @@ if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 91c841f7bba8f6..39c8d188c90861 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -174,10 +174,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index bc164ee296cb1d..078f680b165ca1 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -81,15 +81,14 @@ if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s, add_bos=True) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out)