
Commit

* Address issue huggingface#1122

    This fixes the inconsistency reported in
    [huggingface#1122](huggingface#1122) between
    `_prepare_packed_dataloader` and `_prepare_non_packed_dataloader`:
    only the packed path returned a `labels` field.

* Made the `attention_mask` field in `ConstantLengthDataset` a tensor
maneandrea authored and jondurbin committed Jan 8, 2024
1 parent dcd8f13 commit 654e067
Showing 2 changed files with 6 additions and 1 deletion.
trl/trainer/sft_trainer.py — 5 additions, 1 deletion

@@ -402,7 +402,11 @@ def tokenize(element):
                 else:
                     self._dataset_sanity_checked = True
 
-            return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]}
+            return {
+                "input_ids": outputs["input_ids"],
+                "labels": outputs["input_ids"],
+                "attention_mask": outputs["attention_mask"],
+            }

         tokenized_dataset = dataset.map(
             tokenize,
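For context, the non-packed path now returns `labels` identical to `input_ids`, which is what a causal-LM loss expects (the model shifts the labels internally for next-token prediction). Below is a minimal, hypothetical sketch of the new return shape — the toy `tokenize` function is an illustration, not the trainer's actual implementation:

```python
# Sketch of the patched return value: labels mirror input_ids so the
# downstream causal-LM loss can be computed from the same token ids.
def tokenize(input_ids, attention_mask):
    return {
        "input_ids": input_ids,
        "labels": input_ids,  # same ids, as in the diff; the model shifts them for the loss
        "attention_mask": attention_mask,
    }

example = tokenize([101, 2023, 2003, 102], [1, 1, 1, 1])
```

Before the patch, `labels` was missing from this dict, while the packed path already yielded it — hence the inconsistency.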
trl/trainer/utils.py — 1 addition

@@ -452,6 +452,7 @@ def __iter__(self):
                 yield {
                     "input_ids": torch.LongTensor(example),
                     "labels": torch.LongTensor(example),
+                    "attention_mask": torch.ones(len(example)),
                 }
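On the packed side, `ConstantLengthDataset` now also yields an `attention_mask` of ones: every position in a packed sequence is real content, so nothing is masked out. A torch-free stand-in to illustrate the shape invariant (the real code yields `torch.LongTensor(example)` and `torch.ones(len(example))`; plain lists are used here only so the sketch has no dependencies):

```python
# Illustration only: lists stand in for the torch tensors in the diff.
def packed_item(example):
    return {
        "input_ids": list(example),
        "labels": list(example),
        "attention_mask": [1] * len(example),  # torch.ones(len(example)) in the patch
    }

item = packed_item([5, 6, 7, 8])
```

The invariant being restored is that the mask has the same length as `input_ids` and is all ones, matching what the non-packed path returns.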


