Split alpaca_dataset to alpaca + alpaca_cleaned #639

Merged: 2 commits, Apr 2, 2024
1 change: 1 addition & 0 deletions docs/source/api_ref_datasets.rst
@@ -11,6 +11,7 @@ torchtune.datasets
:nosignatures:

alpaca_dataset
alpaca_cleaned_dataset
grammar_dataset
samsum_dataset
slimorca_dataset
4 changes: 2 additions & 2 deletions docs/source/examples/configs.rst
@@ -119,7 +119,7 @@ keyword arguments not specified in the config if we'd like:
def alpaca_dataset(
tokenizer: Tokenizer,
train_on_input: bool = True,
use_clean: bool = False,
max_seq_len: int = 512,
) -> InstructDataset:

from torchtune import config
Expand All @@ -132,7 +132,7 @@ keyword arguments not specified in the config if we'd like:
dataset = config.instantiate(
cfg.dataset,
tokenizer,
use_clean=True,
train_on_input=False,
)

Note that additional keyword arguments will overwrite any duplicated keys in the
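For reference, the override pattern above in a self-contained form. This is a minimal sketch: the config contents, the tokenizer path, and the `Tokenizer.from_file` constructor are illustrative assumptions, not part of this diff.

from omegaconf import OmegaConf

from torchtune import config
from torchtune.modules.tokenizer import Tokenizer

# Assumed: a SentencePiece tokenizer model at this path (the path used by the test suite).
tokenizer = Tokenizer.from_file("/tmp/test-artifacts/tokenizer.model")

# Assumed config entry; in a recipe this would come from the YAML file.
cfg = OmegaConf.create(
    {
        "dataset": {
            "_component_": "torchtune.datasets.alpaca_cleaned_dataset",
            "train_on_input": True,
        }
    }
)

# Positional args (the tokenizer) pass straight through; keyword args such as
# max_seq_len are forwarded and overwrite duplicated keys from the config.
dataset = config.instantiate(
    cfg.dataset,
    tokenizer,
    max_seq_len=4096,
)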
12 changes: 2 additions & 10 deletions docs/source/examples/finetune_llm.rst
@@ -88,17 +88,9 @@ from Stanford. The following parameters are related to the data:
# This is the default value
train_on_input: True

# Train on the raw data, not the cleaned version
# This is the default value
use_clean: False
# Truncate after a maximum sequence length to limit memory usage
max_seq_len: 512

# Shuffle the data between epochs
# This is set in the config
shuffle: True

.. note::
Shuffling the data after every epoch is a good practice. This helps make sure the model does not learn
spurious patterns related to how the data is sequenced.

.. note::
Set ``train_on_input`` to False if you want to learn on the label only, i.e. mask out the prompt. The resulting loss
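For orientation, a rough sketch of how these config knobs map onto the Python API. The tokenizer path and the `Tokenizer.from_file` constructor are assumptions for illustration only.

from torchtune.datasets import alpaca_cleaned_dataset
from torchtune.modules.tokenizer import Tokenizer

# Assumed constructor/path; point this at a real SentencePiece tokenizer model.
tokenizer = Tokenizer.from_file("/tmp/test-artifacts/tokenizer.model")

# train_on_input=True keeps prompt tokens in the loss; max_seq_len truncates
# long samples to bound memory usage.
ds = alpaca_cleaned_dataset(
    tokenizer=tokenizer,
    train_on_input=True,
    max_seq_len=512,
)

tokens, labels = ds[0]
assert len(tokens) == len(labels)

# `shuffle: True` in the config reshuffles the data each epoch; this is applied
# by the sampler/DataLoader in the recipe, not by the dataset itself.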
3 changes: 1 addition & 2 deletions recipes/configs/llama2/13B_lora.yaml
@@ -54,9 +54,8 @@ tokenizer:

# Dataset and Sampler
dataset:
_component_: torchtune.datasets.alpaca_dataset
_component_: torchtune.datasets.alpaca_cleaned_dataset
train_on_input: True
use_clean: True
seed: null
shuffle: True
batch_size: 32
3 changes: 1 addition & 2 deletions recipes/configs/llama2/7B_lora.yaml
@@ -49,9 +49,8 @@ tokenizer:

# Dataset and Sampler
dataset:
_component_: torchtune.datasets.alpaca_dataset
_component_: torchtune.datasets.alpaca_cleaned_dataset
train_on_input: True
use_clean: True
seed: null
shuffle: True
batch_size: 2
3 changes: 1 addition & 2 deletions recipes/configs/llama2/7B_lora_single_device.yaml
@@ -47,9 +47,8 @@ tokenizer:

# Dataset and Sampler
dataset:
_component_: torchtune.datasets.alpaca_dataset
_component_: torchtune.datasets.alpaca_cleaned_dataset
train_on_input: True
use_clean: True
seed: null
shuffle: True
batch_size: 2
3 changes: 1 addition & 2 deletions recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -47,9 +47,8 @@ tokenizer:

# Dataset and Sampler
dataset:
_component_: torchtune.datasets.alpaca_dataset
_component_: torchtune.datasets.alpaca_cleaned_dataset
train_on_input: True
use_clean: True
seed: null
shuffle: True
batch_size: 2
2 changes: 1 addition & 1 deletion tests/recipes/test_lora_finetune_distributed.py
@@ -33,8 +33,8 @@ def _get_test_config_overrides(self):
"batch_size=4",
"enable_activation_checkpointing=False",
"tokenizer.path=/tmp/test-artifacts/tokenizer.model",
"dataset=torchtune.datasets.alpaca_dataset",
"dataset.train_on_input=False",
"dataset.use_clean=False",
"seed=9",
"epochs=2",
"dtype=fp32",
2 changes: 1 addition & 1 deletion tests/recipes/test_lora_finetune_single_device.py
@@ -34,8 +34,8 @@ def _get_test_config_overrides(self, dtype_str: str = "fp32"):
f"dtype={dtype_str}",
"enable_activation_checkpointing=False",
"tokenizer.path=/tmp/test-artifacts/tokenizer.model",
"dataset=torchtune.datasets.alpaca_dataset",
"dataset.train_on_input=False",
"dataset.use_clean=False",
"seed=9",
"epochs=2",
"max_steps_per_epoch=2",
4 changes: 2 additions & 2 deletions tests/torchtune/datasets/test_alpaca_dataset.py
@@ -11,7 +11,7 @@
from tests.test_utils import get_assets_path
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX

from torchtune.datasets import alpaca_dataset
from torchtune.datasets import alpaca_cleaned_dataset, alpaca_dataset
from torchtune.modules.tokenizer import Tokenizer


@@ -103,7 +103,7 @@ def test_alpaca_clean(self, load_dataset, tokenizer):
}
]

alpaca_ds = alpaca_dataset(tokenizer=tokenizer, use_clean=True)
alpaca_ds = alpaca_cleaned_dataset(tokenizer=tokenizer)
input, labels = alpaca_ds[0]

assert len(input) == len(labels)
3 changes: 2 additions & 1 deletion torchtune/datasets/__init__.py
@@ -4,7 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from torchtune.datasets._alpaca import alpaca_dataset
from torchtune.datasets._alpaca import alpaca_cleaned_dataset, alpaca_dataset
from torchtune.datasets._chat import ChatDataset
from torchtune.datasets._grammar import grammar_dataset
from torchtune.datasets._instruct import instruct_dataset, InstructDataset
@@ -13,6 +13,7 @@

__all__ = [
"alpaca_dataset",
"alpaca_cleaned_dataset",
"grammar_dataset",
"samsum_dataset",
"InstructDataset",
63 changes: 56 additions & 7 deletions torchtune/datasets/_alpaca.py
@@ -12,11 +12,10 @@
def alpaca_dataset(
tokenizer: Tokenizer,
train_on_input: bool = True,
use_clean: bool = False,
max_seq_len: int = 512,
) -> InstructDataset:
"""
Support for the Alpaca dataset and its variants from Hugging Face Datasets.
Support for the Alpaca dataset from Hugging Face Datasets.
https://huggingface.co/datasets/tatsu-lab/alpaca

Data input format: https://huggingface.co/datasets/tatsu-lab/alpaca#data-instances
@@ -32,14 +31,64 @@ def alpaca_dataset(
contributes to the loss.
- If `train_on_input` is False, the prompt is masked out (tokens replaced with -100)

The version of the dataset used is controlled by the `use_clean` flag, which is set to False by default.
- If `use_clean` is True, then https://huggingface.co/datasets/yahma/alpaca-cleaned is used
- If `use_clean` is False, then https://huggingface.co/datasets/tatsu-lab/alpaca is used
Args:
tokenizer (Tokenizer): Tokenizer used to encode data. The tokenizer must implement an `encode` and `decode` method.
train_on_input (bool): Whether the model is trained on the prompt or not. Default is True.
max_seq_len (int): Maximum number of tokens in the returned input and label token id lists.
Default is 512, as set by Stanford Alpaca (https://github.com/tatsu-lab/stanford_alpaca?tab=readme-ov-file#fine-tuning),
but we recommend setting this to the highest value that fits in memory and is supported by the model.
For example, llama2-7B supports a sequence length of up to 4096.

Returns:
InstructDataset: dataset configured with Alpaca source data and template


Example:
>>> alpaca_ds = alpaca_dataset(tokenizer=tokenizer)
>>> for batch in DataLoader(alpaca_ds, batch_size=8):
...     print(f"Batch size: {len(batch)}")
Batch size: 8
"""

return InstructDataset(
tokenizer=tokenizer,
source="tatsu-lab/alpaca",
template=AlpacaInstructTemplate,
train_on_input=train_on_input,
max_seq_len=max_seq_len,
split="train",
)


def alpaca_cleaned_dataset(
tokenizer: Tokenizer,
train_on_input: bool = True,
max_seq_len: int = 512,
) -> InstructDataset:
"""
Support for the Alpaca cleaned dataset from Hugging Face Datasets.
https://huggingface.co/datasets/yahma/alpaca-cleaned

Data input format: https://huggingface.co/datasets/tatsu-lab/alpaca#data-instances

The input is created using the prompt template from the original alpaca codebase:
https://github.com/tatsu-lab/stanford_alpaca/blob/761dc5bfbdeeffa89b8bff5d038781a4055f796a/train.py#L31

where `instruction`, `input`, and `output` are fields from the dataset.

Masking of the prompt during training is controlled by the `train_on_input` flag, which is
set to `True` by default (ref: https://github.com/tloen/alpaca-lora/blob/main/finetune.py#L49)
- If `train_on_input` is True, the prompt is used during training and
contributes to the loss.
- If `train_on_input` is False, the prompt is masked out (tokens replaced with -100)

This is the cleaned version of the original Alpaca dataset, which removes hallucinations,
poorly formed instructions/inputs/outputs, wrong answers, and other errors. See more details
on the Hugging Face dataset card.

Args:
tokenizer (Tokenizer): Tokenizer used to encode data. The tokenizer must implement an `encode` and `decode` method.
train_on_input (bool): Whether the model is trained on the prompt or not. Default is True.
use_clean (bool): Whether to use the cleaned version of the dataset or not. Default is False.
max_seq_len (int): Maximum number of tokens in the returned input and label token id lists.
Default is 512, as set by Stanford Alpaca (https://github.com/tatsu-lab/stanford_alpaca?tab=readme-ov-file#fine-tuning),
but we recommend setting this to the highest value that fits in memory and is supported by the model.
@@ -58,7 +107,7 @@ def alpaca_dataset(

return InstructDataset(
tokenizer=tokenizer,
source="yahma/alpaca-cleaned" if use_clean else "tatsu-lab/alpaca",
source="yahma/alpaca-cleaned",
template=AlpacaInstructTemplate,
train_on_input=train_on_input,
max_seq_len=max_seq_len,
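Finally, a small end-to-end sketch of the two builders this PR splits out. The tokenizer path and `Tokenizer.from_file` constructor are the same illustrative assumptions as above; the masking check follows the documented `train_on_input` behavior.

from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.datasets import alpaca_cleaned_dataset, alpaca_dataset
from torchtune.modules.tokenizer import Tokenizer

# Assumed constructor/path for illustration.
tokenizer = Tokenizer.from_file("/tmp/test-artifacts/tokenizer.model")

# Same Alpaca template and defaults; the only difference is the Hugging Face source:
# tatsu-lab/alpaca for the original, yahma/alpaca-cleaned for the cleaned variant.
raw_ds = alpaca_dataset(tokenizer=tokenizer)
clean_ds = alpaca_cleaned_dataset(tokenizer=tokenizer, train_on_input=False)

tokens, labels = clean_ds[0]
# With train_on_input=False, prompt positions in the labels are replaced with the
# ignore index (-100) so they do not contribute to the cross-entropy loss.
assert CROSS_ENTROPY_IGNORE_IDX in labels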