diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py
index 7e6ca16b9f..3375f8fc4b 100644
--- a/recipes/full_finetune_distributed.py
+++ b/recipes/full_finetune_distributed.py
@@ -546,7 +546,7 @@ def _setup_data(
             batch_size=batch_size,
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 collate_fn,
                 padding_idx=self._tokenizer.pad_id,
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index fa1b7b14ff..2addd92944 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -516,7 +516,7 @@ def _setup_data(
             batch_size=batch_size,
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 collate_fn,
                 padding_idx=self._tokenizer.pad_id,
diff --git a/recipes/knowledge_distillation_single_device.py b/recipes/knowledge_distillation_single_device.py
index 833c9aec56..c2ee8c7cc4 100644
--- a/recipes/knowledge_distillation_single_device.py
+++ b/recipes/knowledge_distillation_single_device.py
@@ -526,7 +526,7 @@ def _setup_data(
             sampler=sampler,
             batch_size=batch_size,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=(
                 partial(
                     padded_collate_sft,
diff --git a/recipes/lora_dpo_distributed.py b/recipes/lora_dpo_distributed.py
index d655889305..e903ab274a 100644
--- a/recipes/lora_dpo_distributed.py
+++ b/recipes/lora_dpo_distributed.py
@@ -475,7 +475,7 @@ def _setup_data(
             batch_size=batch_size,
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 padded_collate_dpo,
                 padding_idx=self._tokenizer.pad_id,
diff --git a/recipes/lora_dpo_single_device.py b/recipes/lora_dpo_single_device.py
index b7d931accc..c158d17875 100644
--- a/recipes/lora_dpo_single_device.py
+++ b/recipes/lora_dpo_single_device.py
@@ -364,7 +364,7 @@ def _setup_data(
             sampler=sampler,
             batch_size=batch_size,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 padded_collate_dpo,
                 padding_idx=self._tokenizer.pad_id,
diff --git a/recipes/lora_finetune_distributed.py b/recipes/lora_finetune_distributed.py
index 2be9aa94a2..1569dfee63 100644
--- a/recipes/lora_finetune_distributed.py
+++ b/recipes/lora_finetune_distributed.py
@@ -623,7 +623,7 @@ def _setup_data(
             batch_size=batch_size,
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 collate_fn,
                 padding_idx=self._tokenizer.pad_id,
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 6641863e4d..5d39b72086 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -535,7 +535,7 @@ def _setup_data(
             sampler=sampler,
             batch_size=batch_size,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=(
                 partial(
                     collate_fn,
diff --git a/recipes/ppo_full_finetune_single_device.py b/recipes/ppo_full_finetune_single_device.py
index 9f645b5fdd..7679af3fd3 100644
--- a/recipes/ppo_full_finetune_single_device.py
+++ b/recipes/ppo_full_finetune_single_device.py
@@ -580,7 +580,7 @@ def _setup_data(
             sampler=sampler,
             batch_size=batch_size,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 padded_collate,
                 pad_direction="left",
diff --git a/recipes/qat_distributed.py b/recipes/qat_distributed.py
index c6a7ec0ed1..eb2e44fae2 100644
--- a/recipes/qat_distributed.py
+++ b/recipes/qat_distributed.py
@@ -524,7 +524,7 @@ def _setup_data(
             batch_size=batch_size,
             sampler=sampler,
             # dropping last avoids shape issues with compile + flex attention
-            drop_last=cfg_dataset.get("drop_last", True),
+            drop_last=True,
             collate_fn=partial(
                 padded_collate_sft,
                 padding_idx=self._tokenizer.pad_id,
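
For context, here is a minimal standalone sketch of the DataLoader setup these hunks converge on. It is not the recipe code: the dataset, sampler arguments, and `pad_collate` helper below are placeholders standing in for the recipes' tokenized dataset, `DistributedSampler` configuration, and `padded_collate_*` functions, and the comment about compile + flex attention simply restates the rationale from the diff.

```python
# Sketch only (placeholder names, not torchtune recipe code):
# drop_last=True keeps every batch the same size, which avoids the
# shape/recompilation issues noted in the diff for compile + flex attention.
from functools import partial

import torch
from torch.utils.data import DataLoader, DistributedSampler, TensorDataset

# Placeholder dataset standing in for the recipes' tokenized dataset.
ds = TensorDataset(torch.arange(10))
# Explicit num_replicas/rank so no process group is needed for this sketch.
sampler = DistributedSampler(ds, num_replicas=1, rank=0, shuffle=True, seed=0)


def pad_collate(batch, padding_idx=0):
    # Stand-in for padded_collate_sft / padded_collate_dpo / padded_collate.
    return batch


dataloader = DataLoader(
    ds,
    batch_size=4,
    sampler=sampler,
    # dropping last avoids shape issues with compile + flex attention
    drop_last=True,
    collate_fn=partial(pad_collate, padding_idx=0),
)

# With 10 samples and batch_size=4, drop_last=True yields 2 full batches;
# the trailing partial batch of 2 samples is dropped.
assert len(dataloader) == 2
```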