Skip to content

Commit

Permalink
Update tokenization_code_llama_fast.py (huggingface#26576)
Browse files Browse the repository at this point in the history
* Update tokenization_code_llama_fast.py

* Update test_tokenization_code_llama.py

* Update test_tokenization_code_llama.py
Loading branch information
andyl98 authored and blbadger committed Nov 8, 2023
1 parent 0f8dbc0 commit 6333195
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens=
special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else []
if suffix_first:
# format as " <PRE> <SUF>{suf} <MID> {pre}"
pair += [self.prefix_token, self.suffix_token, "$A", self.middle_token, "$B"]
pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
special_tokens += [
(self.prefix_token, self.prefix_id),
(self.suffix_token, self.suffix_id),
Expand Down
12 changes: 12 additions & 0 deletions tests/models/code_llama/test_tokenization_code_llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -643,3 +643,15 @@ def main():
input_ids = tokenizer.encode(PROMPTS[0])
self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix))
self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix))

# Adding suffix_first check for infilling tasks
suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True)
self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True))
prefix, suffix = PROMPTS[0].split("<FILL_ME>")
self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))

prefix, suffix = PROMPTS[0].split("<FILL_ME>")
suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True)
self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True))
self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True))

0 comments on commit 6333195

Please sign in to comment.