From 963200bed31f1de2338fc5ec0316bfb3df565705 Mon Sep 17 00:00:00 2001 From: andyliu Date: Tue, 3 Oct 2023 11:47:49 -0700 Subject: [PATCH 1/3] Update tokenization_code_llama_fast.py --- .../models/code_llama/tokenization_code_llama_fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py index 66a312eb3dfa8f..7d1e237022377e 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py +++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py @@ -278,7 +278,7 @@ def set_infilling_processor(self, reset, suffix_first=False, add_special_tokens= special_tokens = [(self.bos_token, self.bos_token_id)] if self.add_bos_token and add_special_tokens else [] if suffix_first: # format as "
 <PRE> <SUF>{suf} <MID> {pre}"
-            pair += [self.prefix_token, self.suffix_token, "$A", self.middle_token, "$B"]
+            pair += [self.prefix_token, self.suffix_token, "$B", self.middle_token, "$A"]
             special_tokens += [
                 (self.prefix_token, self.prefix_id),
                 (self.suffix_token, self.suffix_id),

From 1596d9aa7ee3d512273c61350ed7732d7c38d9d9 Mon Sep 17 00:00:00 2001
From: andyliu 
Date: Wed, 4 Oct 2023 11:21:32 -0700
Subject: [PATCH 2/3] Update test_tokenization_code_llama.py

---
 tests/models/code_llama/test_tokenization_code_llama.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 3df0c552c0daa4..3a8bcb2d4f2cf0 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -643,3 +643,10 @@ def main():
         input_ids = tokenizer.encode(PROMPTS[0])
         self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix))
         self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix))
+
+        # Adding suffix_first check for infilling tasks
+        suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True)
+        self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True))
+        prefix, suffix = PROMPTS[0].split("<FILL_ME>")
+        self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
+        self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))

From 3048f3ee5c4be6cb71bac89bbe32930bc7615a6f Mon Sep 17 00:00:00 2001
From: andyliu 
Date: Wed, 4 Oct 2023 11:23:33 -0700
Subject: [PATCH 3/3] Update test_tokenization_code_llama.py

---
 tests/models/code_llama/test_tokenization_code_llama.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 3a8bcb2d4f2cf0..2673981527048d 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -650,3 +650,8 @@ def main():
         prefix, suffix = PROMPTS[0].split("<FILL_ME>")
         self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
         self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))
+
+        prefix, suffix = PROMPTS[0].split("<FILL_ME>")
+        suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True)
+        self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True))
+        self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True))