[BE][6/n] replace the large c4_mini dataset with c4_test, containing the first 2K entries

ghstack-source-id: 319f4961b092778703101b98937803073132afa1
Pull Request resolved: #512
tianyu-l committed Aug 8, 2024
1 parent 48485a8 commit 942f479
Showing 5 changed files with 2,011 additions and 45,014 deletions.
2,000 changes: 2,000 additions & 0 deletions test/assets/c4_test/data.json

Large diffs are not rendered by default.
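
For context, the new file holds the first 2,000 entries of the C4 training split, apparently one JSON object per line, judging by the 2,000-line diff. As a rough illustration (not part of this commit), such a subset could be produced by streaming the full dataset with the Hugging Face datasets library:

import itertools
import json

from datasets import load_dataset

# Stream allenai/c4 (English) so the 177M-entry dataset is never
# fully downloaded, then keep only the first 2,000 samples.
ds = load_dataset("allenai/c4", name="en", split="train", streaming=True)

with open("test/assets/c4_test/data.json", "w") as f:
    for sample in itertools.islice(ds, 2000):
        f.write(json.dumps(sample) + "\n")  # one JSON object per line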

4 changes: 2 additions & 2 deletions test/datasets/test_checkpoint.py
@@ -11,8 +11,8 @@

 class TestCheckpoint:
     def test_c4_resumption(self):
-        dataset_name = "c4_mini"
-        dataset_path = "./torchtitan/datasets/c4_mini"
+        dataset_name = "c4_test"
+        dataset_path = "./test/assets/c4_test"
         batch_size = 1
         seq_len = 1024
         world_size = 4
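The resumption test saves the data loader state mid-epoch and verifies that a fresh loader restored from it continues from the same position. A minimal sketch of that pattern (build_loader is a hypothetical factory; the real test in test/datasets/test_checkpoint.py differs in details):

def check_resumption(build_loader, warmup_steps=3, check_steps=2):
    # Consume a few batches, then snapshot the loader state.
    loader = build_loader()
    it = iter(loader)
    for _ in range(warmup_steps):
        next(it)
    state = loader.state_dict()

    # Restore into a fresh loader; it must resume where the first left off.
    resumed = build_loader()
    resumed.load_state_dict(state)
    resumed_it = iter(resumed)
    for _ in range(check_steps):
        expected, actual = next(it), next(resumed_it)
        for e, a in zip(expected, actual):  # (input, label) tensor pairs
            assert (e == a).all()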
45,000 changes: 0 additions & 45,000 deletions torchtitan/datasets/c4_mini/data.json

This file was deleted.

19 changes: 8 additions & 11 deletions torchtitan/datasets/hf_datasets.py
@@ -28,7 +28,7 @@
 # map from dataset name to a local directory, or
 # a dataset repository on the HF hub
 _supported_datasets = {
-    "c4_mini": "torchtitan/datasets/c4_mini",
+    "c4_test": "test/assets/c4_test",
     "c4": "allenai/c4",
 }
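
As the comment above notes, a mapping value is either a local directory or a repository id on the HF hub. A simplified sketch of how that distinction can be handled (the actual loading logic in hf_datasets.py is more involved):

import os

from datasets import load_dataset

def _resolve_dataset(dataset_name: str):
    path = _supported_datasets[dataset_name]
    if os.path.isdir(path):
        # Small local subset such as c4_test: load it from disk.
        return load_dataset(path, split="train")
    # Full c4 on the HF hub: stream it rather than download 177M entries.
    return load_dataset(path, name="en", split="train", streaming=True)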

@@ -48,8 +48,8 @@ class HuggingFaceDataset(IterableDataset, Stateful):
         rank (int): rank of the current data parallel process
         infinite (bool): whether to loop infinitely over the dataset

-    We currently support the c4 dataset and a subset of it:
-    c4_mini (45K training entries)
+    We currently support the c4 dataset, and a subset of it for testing purposes:
+    c4_test (2K training entries)
     c4 (177M training entries - this dataset is streamed due to the size)

     >> c4 (EN) <<:
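
For illustration, constructing the test subset with the arguments documented above might look like this (keyword names are inferred from the docstring and the checkpoint test; the exact signature is an assumption):

dataset = HuggingFaceDataset(
    dataset_name="c4_test",
    dataset_path="test/assets/c4_test",
    tokenizer=tokenizer,  # any tokenizer exposing an encode() method
    seq_len=1024,
    world_size=4,
    rank=0,
    infinite=True,  # re-loop the 2K entries instead of stopping
)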
@@ -83,12 +83,12 @@ def __init__(
         if dataset_path:
             logger.warning(
                 f"Dataset {dataset_name} is not tested or verified. "
-                f"Recommended datasets are: {list(_supported_datasets.keys())}."
+                f"Recommended datasets are: {list(_supported_datasets.keys())}"
             )
         else:
             raise ValueError(
                 f"Dataset {dataset_name} is not supported. "
-                f"Supported datasets are: {list(_supported_datasets.keys())}."
+                f"Supported datasets are: {list(_supported_datasets.keys())}"
             )

         if not dataset_path:
@@ -132,15 +132,12 @@ def __iter__(self):
                     yield input, label

             if not self.infinite:
-                logger.warning(f"Dataset {self.dataset_name} has run out of data.")
+                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                 break
             else:
                 # Reset offset for the next iteration
                 self._sample_idx = 0
-                logger.warning(
-                    f"Dataset {self.dataset_name} is being re-looped. "
-                    "Loss related metrics might be misleading."
-                )
+                logger.warning(f"Dataset {self.dataset_name} is being re-looped")

     def _get_data_iter(self):
         if self._sample_idx == 0:
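
The hunk above only shortens the warnings; the control flow is unchanged: a finite pass that either stops or re-loops from a reset offset. A self-contained sketch of that pattern (illustrative names, not torchtitan's implementation):

class LoopingIterable:
    """Yield samples from a finite source, optionally re-looping forever."""

    def __init__(self, data, infinite: bool):
        self.data = data
        self.infinite = infinite
        self._sample_idx = 0  # offset to skip on resume or re-loop

    def __iter__(self):
        while True:
            for sample in self.data[self._sample_idx:]:
                self._sample_idx += 1
                yield sample
            if not self.infinite:
                break  # single pass: stop once the data runs out
            # Reset the offset for the next pass, as in the diff above.
            self._sample_idx = 0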
@@ -188,7 +185,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:

         if self._rank_id not in state_dict:
             logger.warning(
-                f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}."
+                f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}"
             )
             return
         super().load_state_dict(pickle.loads(state_dict[self._rank_id]))
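The load path above stores each data parallel rank's pickled loader state under a rank-specific key, so a rank restores only its own slice. A sketch of that convention (the key format here is hypothetical):

import pickle

def save_rank_state(loader, dp_rank: int) -> dict:
    # Each rank contributes one entry keyed by its rank id.
    return {f"dp_rank_{dp_rank}": pickle.dumps(loader.state_dict())}

def load_rank_state(loader, state: dict, dp_rank: int) -> None:
    key = f"dp_rank_{dp_rank}"
    if key not in state:
        # Mirrors the warning above: nothing was saved for this rank.
        return
    loader.load_state_dict(pickle.loads(state[key]))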
2 changes: 1 addition & 1 deletion train_configs/debug_model.toml
@@ -38,7 +38,7 @@ steps = 10
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = false
-dataset = "c4_mini"  # supported datasets: c4_mini (45K), c4 (177M)
+dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)

 [experimental]
 pipeline_parallel_degree = 1
