Update incorrect data processing in DataCollatorForChatML #2172

Status: Merged (31 commits, Oct 10, 2024)
Diff shown: changes from 14 commits

Commits
5c538fb  Update incorrect data processing in DataCollatorForChatML (ruijunfeng, Oct 4, 2024)
20b50e9  Merge pull request #1 from ruijunfeng/ruijunfeng-patch-for-DataCollat… (ruijunfeng, Oct 4, 2024)
aa50956  Merge branch 'main' into main (kashif, Oct 7, 2024)
d9db42a  Update trl/trainer/utils.py (qgallouedec, Oct 7, 2024)
996face  Merge branch 'main' into main (qgallouedec, Oct 7, 2024)
55e0155  style (qgallouedec, Oct 7, 2024)
78a1850  move comment (qgallouedec, Oct 7, 2024)
ef2453f  add test for DataCollatorForChatML (kashif, Oct 8, 2024)
d10acb1  update comment with more details (ruijunfeng, Oct 8, 2024)
afa84d3  update assert reports and comments, and adds verification that the la… (ruijunfeng, Oct 8, 2024)
2e20011  Merge branch 'main' into main (qgallouedec, Oct 8, 2024)
017220b  new line at the end of file for code quality (ruijunfeng, Oct 8, 2024)
57ebf5f  Update tests/test_utils.py (kashif, Oct 8, 2024)
2d2471b  Update tests/test_utils.py (kashif, Oct 8, 2024)
21fda01  Update tests/test_utils.py (kashif, Oct 8, 2024)
86119b1  update tests (qgallouedec, Oct 8, 2024)
5dd175f  fix test (kashif, Oct 8, 2024)
c09880e  Merge branch 'main' into main (kashif, Oct 8, 2024)
4c56e5a  Update tests/test_utils.py (kashif, Oct 8, 2024)
43aae62  Update tests/test_utils.py (kashif, Oct 8, 2024)
d6686e3  formatting (kashif, Oct 8, 2024)
ed88690  fix typo (kashif, Oct 8, 2024)
7e4006c  simplify (kashif, Oct 8, 2024)
49bc69d  Merge branch 'main' into main (kashif, Oct 8, 2024)
cb7f9be  Revert "simplify" (kashif, Oct 8, 2024)
4166ca0  Merge branch 'main' into main (kashif, Oct 9, 2024)
9c98fae  tokenize full messages (kashif, Oct 9, 2024)
7924cc2  dont add eos (kashif, Oct 9, 2024)
385bdb0  eos is in the last token (kashif, Oct 9, 2024)
de4ea96  simplify DataCollatorForChatML (kashif, Oct 9, 2024)
b4a2e97  Update tests/test_utils.py (kashif, Oct 10, 2024)
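
For context, here is a minimal usage sketch of the collator this PR fixes. It is not part of the PR; the constructor arguments mirror those exercised in the new test below, and the example checkpoint and messages are placeholders.

# Minimal usage sketch (assumed API, mirroring the new test below).
from transformers import AutoTokenizer

from trl.trainer.utils import DataCollatorForChatML

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.bos_token  # same fallback the test uses

collator = DataCollatorForChatML(
    tokenizer=tokenizer,
    max_length=1024,        # truncation length for prompts and completions
    ignore_index=-100,      # label value for positions excluded from the loss
    messages_key="messages",
)

examples = [
    {
        "messages": [
            {"role": "user", "content": "Is strcpy safe? Return true or false."},  # placeholder prompt
            {"role": "assistant", "content": "true"},
        ]
    }
]

batch = collator(examples)
# batch["input_ids"][0]: the full formatted conversation, tokenized once
# batch["labels"][0]:    prompt positions set to -100, completion tokens kept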
92 changes: 91 additions & 1 deletion tests/test_utils.py
@@ -20,7 +20,13 @@
from transformers.utils import is_peft_available

from trl.trainer.model_config import ModelConfig
-from trl.trainer.utils import decode_and_strip_padding, generate_model_card, get_peft_config, pad
+from trl.trainer.utils import (
+    DataCollatorForChatML,
+    decode_and_strip_padding,
+    generate_model_card,
+    get_peft_config,
+    pad,
+)


if is_peft_available():
@@ -169,3 +175,87 @@ def test_val_none(self):
assert "my_model" in card_text
assert 'pipeline("text-generation", model="username/my_hub_model", device="cuda")' in card_text
assert "My Trainer" in card_text


class TestDataCollatorForChatML(unittest.TestCase):
    def setUp(self):
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
        self.tokenizer.pad_token = (
            self.tokenizer.bos_token if self.tokenizer.pad_token is None else self.tokenizer.pad_token
        )

        # Define token IDs
        self.bos_token_id = self.tokenizer.bos_token_id if self.tokenizer.bos_token_id is not None else 1
        self.eos_token_id = self.tokenizer.eos_token_id if self.tokenizer.eos_token_id is not None else 2
        self.assistant_output_token_id = 1565  # Token ID for "true", which is the last assistant's response in the example
        self.ignore_index = -100
        self.max_length = 1024
        self.messages_key = "messages"

        # Example input
        self.examples = [
            {
                self.messages_key: [
                    {
                        "role": "user",
                        "content": (
                            "Does the following code contain any security vulnerabilities? Return true or false.\n"
                            "char buffer[10];\nchar input[50];\nstrcpy(buffer, input);\n"
                        ),
                    },
                    {"role": "assistant", "content": "true"},
                ]
            }
        ]

        # Initialize the data collator
        self.collator = DataCollatorForChatML(
            tokenizer=self.tokenizer,
            max_length=self.max_length,
            ignore_index=self.ignore_index,
            messages_key=self.messages_key,
        )

    def test_data_collator_for_chatml(self):
        # Process the data
        data = self.collator(self.examples)

        # Decode input_ids and labels for verification
        input_ids = data["input_ids"][0].tolist()
        labels = data["labels"][0].tolist()

        # Expected tokens
        expected_bos = self.bos_token_id
        expected_eos = self.eos_token_id
        expected_assistant_token = self.assistant_output_token_id

        # Verify that input_ids start with a BOS token and there are no extra ones
        self.assertEqual(input_ids[0], expected_bos, "The first token of input_ids should be BOS token.")
        self.assertNotEqual(
            input_ids[1], expected_bos, "The second token of input_ids should not be BOS token (extra BOS)."
        )

        # Verify that the assistant's response token is present in input_ids
        self.assertIn(expected_assistant_token, input_ids, "Assistant's response token should be in input_ids.")

        # Verify that EOS token is at the end of input_ids
        self.assertEqual(input_ids[-1], expected_eos, "The last token of input_ids should be EOS token.")

        # Verify that the labels preserved the target string (last_assistant_response)
        last_assistant_response = self.examples[0][self.messages_key][-1]["content"]
        last_assistant_response_tokens = self.tokenizer.encode(last_assistant_response, add_special_tokens=False)

        # Find the start and end of the last assistant's response in the labels
        response_start = next(i for i, label in enumerate(labels) if label != self.ignore_index)
        response_end = next(i for i in range(len(labels) - 1, -1, -1) if labels[i] != self.ignore_index)

        actual_response = labels[response_start : response_end - 1]
        self.assertEqual(
            actual_response,
            last_assistant_response_tokens,
            "The labels should preserve the last assistant's response tokens.",
        )

        # Verify that EOS token is at the end of labels
        self.assertEqual(labels[-1], expected_eos, "The last token of labels should be EOS token.")
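
To make the masking contract this test checks explicit, here is a small, self-contained illustration in plain Python. The token IDs are hypothetical, and this is not TRL code:

ignore_index = -100
prompt_ids = [1, 306, 4091]        # hypothetical prompt tokens, starting with BOS (1)
completion_ids = [1565, 29871, 2]  # hypothetical "true", space, and EOS (2) tokens

input_ids = prompt_ids + completion_ids
labels = [ignore_index] * len(prompt_ids) + completion_ids

# Loss is computed only on the completion: prompt positions are masked,
# and the EOS token survives at the end of both input_ids and labels.
assert input_ids[0] == 1 and input_ids[-1] == 2
assert labels[-1] == 2
assert all(v == ignore_index for v in labels[: len(prompt_ids)])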
18 changes: 15 additions & 3 deletions trl/trainer/utils.py
@@ -265,17 +265,29 @@ def __call__(self, examples: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
            assistant_messages = [msg for msg in messages if msg["role"] == "assistant"]
            last_assistant_message = assistant_messages[-1]["content"]
            prompt = formatted_chat.rsplit(last_assistant_message, 1)[0]
-            completion = last_assistant_message
+            completion = last_assistant_message + formatted_chat.rsplit(last_assistant_message, 1)[1]

            prompts.append(prompt)
            completions.append(completion)

        # Tokenize prompts and completions
        tokenized_prompts = self.tokenizer(
-            prompts, truncation=True, max_length=self.max_length, padding=False, return_tensors=None
+            prompts,
+            truncation=True,
+            max_length=self.max_length,
+            padding=False,
+            return_tensors=None,
+            # We assume the inputs are already wrapped with BOS and EOS tokens by tokenizer.apply_chat_template, so extra BOS/EOS tokens should not be added
+            add_special_tokens=False,
        )
        tokenized_completions = self.tokenizer(
-            completions, truncation=True, max_length=self.max_length, padding=False, return_tensors=None
+            completions,
+            truncation=True,
+            max_length=self.max_length,
+            padding=False,
+            return_tensors=None,
+            # We assume the inputs are already wrapped with BOS and EOS tokens by tokenizer.apply_chat_template, so extra BOS/EOS tokens should not be added
+            add_special_tokens=False,
        )

        # Combine prompts and completions
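
The core fix above is the completion construction: keeping everything after the last assistant message preserves the chat template's closing tokens (such as a trailing </s>) instead of dropping them. A sketch of the string manipulation, using a made-up template string rather than real tokenizer output:

formatted_chat = "<s>[INST] Is strcpy unsafe? [/INST] true </s>"  # hypothetical template output
last_assistant_message = "true"

# Split once, from the right, on the last assistant message.
prompt = formatted_chat.rsplit(last_assistant_message, 1)[0]

# Old behavior: completion = last_assistant_message  (drops the trailing " </s>")
# New behavior: also keep the template suffix that follows the message.
completion = last_assistant_message + formatted_chat.rsplit(last_assistant_message, 1)[1]

assert prompt == "<s>[INST] Is strcpy unsafe? [/INST] "
assert completion == "true </s>"
assert prompt + completion == formatted_chat  # nothing from the template is lost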