Commit

Fixing two minor bugs in bpe_gpt2_preprocess
goerch committed Oct 10, 2023
1 parent d62adfd commit 6186e59
Showing 5 changed files with 17 additions and 19 deletions.
9 changes: 4 additions & 5 deletions llama.cpp
@@ -5007,7 +5007,6 @@ struct llm_tokenizer_bpe {
             for (int i = 0; i < (int)text_utf.size(); i++) {
                 const std::string & utf_char = text_utf[i];
                 bool split_condition = false;
-                // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes;
                 int bytes_remain = text_utf.size() - i;
                 // forward backward lookups
                 const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
@@ -5033,9 +5032,9 @@ struct llm_tokenizer_bpe {
                 if (!split_condition && bytes_remain >= 3) {
                     // 're|'ve|'ll
                     if (utf_char == "\'" && (
-                        (utf_char_next == "r" || utf_char_next_next == "e") ||
-                        (utf_char_next == "v" || utf_char_next_next == "e") ||
-                        (utf_char_next == "l" || utf_char_next_next == "l"))
+                        (utf_char_next == "r" && utf_char_next_next == "e") ||
+                        (utf_char_next == "v" && utf_char_next_next == "e") ||
+                        (utf_char_next == "l" && utf_char_next_next == "l"))
                         ) {
                         split_condition = true;
                     }
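The first bug is the `||` inside each pair above: the old condition fired whenever either character matched, so any apostrophe followed by "r", "v", "l", or by anything whose second lookahead was "e"/"l", was treated as a contraction. A minimal Python transliteration of the predicate (illustration only, not part of the commit):

def old_cond(nxt, nxt2):
    # buggy: '||' over-matches inside each pair
    return ((nxt == "r" or nxt2 == "e") or
            (nxt == "v" or nxt2 == "e") or
            (nxt == "l" or nxt2 == "l"))

def new_cond(nxt, nxt2):
    # fixed: the lookahead must spell 're, 've or 'll exactly
    return ((nxt == "r" and nxt2 == "e") or
            (nxt == "v" and nxt2 == "e") or
            (nxt == "l" and nxt2 == "l"))

assert old_cond(" ", "e") and not new_cond(" ", "e")  # "' era" no longer mis-split
assert old_cond("r", "o") and not new_cond("r", "o")  # "'ro..." is not "'re"
assert new_cond("l", "l")                             # "'ll" is still detected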
@@ -5086,7 +5085,7 @@ struct llm_tokenizer_bpe {
                     else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                         split_condition = true;
                     }
-                    else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) {
+                    else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                         split_condition = true;
                     }
                 }
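The second bug is in the whitespace lookahead: splitting whenever the next codepoint was merely non-whitespace also fired before punctuation, diverging from the `\s+(?!\S)` branch of the GPT-2 pre-tokenizer regex that this code emulates. The reference splits can be checked with Python's `regex` module (a sketch, not part of the commit; the pattern is the one from OpenAI's GPT-2 encoder.py):

import regex  # pip install regex; supports \p{...} classes

gpt2_pat = regex.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)

print(gpt2_pat.findall("\n ="))   # ['\n', ' =']  -- '=' keeps its leading space
print(gpt2_pat.findall("' era"))  # ["'", ' era'] -- no bogus contraction split

These two inputs are exactly the regression cases added to the tests below.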
6 changes: 3 additions & 3 deletions tests/test-tokenizer-0-falcon.cpp
@@ -36,6 +36,8 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests() {
         { " Hello" , { 258, 23090, }, },
         { " Hello" , { 466, 23090, }, },
         { " Hello\n Hello" , { 466, 23090, 742, 23090, }, },
+        { "\n =" , { 1212, 40, }, },
+        { "' era" , { 18, 4932, }, },
     };
 
     return _k_tests;
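The expected sequences for the two new cases can be cross-checked against the Hugging Face tokenizer; a sketch (assumes the Falcon tokenizer, e.g. the tiiuae/falcon-7b checkpoint, as used by the companion Python script):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")  # assumption: the test's tokenizer
assert tokenizer.encode("\n =") == [1212, 40]   # ids from the table above
assert tokenizer.encode("' era") == [18, 4932]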
@@ -169,10 +171,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
10 changes: 6 additions & 4 deletions tests/test-tokenizer-0-falcon.py
@@ -16,6 +16,7 @@
 tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
 
 tests = [
+        "1500",
         "",
         " ",
         " ",
@@ -41,6 +42,8 @@
         " Hello",
         " Hello",
         " Hello\n Hello",
+        "\n =",
+        "' era",
     ]
 
 for text in tests:
@@ -69,15 +72,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
         print('results written to: ', fname_out)
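Both dumps now hold one `id 'piece'` pair per line instead of a single space-separated row, so a divergence between the C++ and reference tokenizers can be pinpointed with a line-wise comparison; a sketch with hypothetical file names:

# compare the Python (.tok) and C++ (.tokcpp) dumps line by line
with open("input.txt.tok", encoding="utf-8") as f_py, \
     open("input.txt.tokcpp", encoding="utf-8") as f_cpp:
    for n, (a, b) in enumerate(zip(f_py, f_cpp), start=1):
        if a != b:
            print(f"first mismatch at token {n}: {a.strip()!r} vs {b.strip()!r}")
            break
    else:
        print("token dumps match")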
4 changes: 1 addition & 3 deletions tests/test-tokenizer-0-llama.cpp
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
         }
 
         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }
-
-        ofs << "\n";
     }
 
     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
7 changes: 3 additions & 4 deletions tests/test-tokenizer-0-llama.py
@@ -81,15 +81,14 @@
 if fname_tok:
     print('tokenizing file: ', fname_tok)
     fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
         lines = f.readlines()
         s = ''.join(lines)
         res = tokenizer.encode(s, add_bos=True)
         # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
             for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
         print('len(res): ', len(res))
         print('len(lines): ', len(lines))
         print('results written to: ', fname_out)
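The explicit `encoding='utf-8'` matters because `open()` otherwise falls back to the locale's preferred encoding, which is often cp1252 on Windows and fails on UTF-8 test data:

import locale
print(locale.getpreferredencoding(False))  # what open() uses when no encoding is given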
