[BE][6/n] replace the large c4_mini dataset with c4_test, containing the first 2K entries

ghstack-source-id: 319f4961b092778703101b98937803073132afa1
Pull Request resolved: #512
tianyu-l committed Aug 8, 2024
1 parent 48485a8 commit 942f479
Showing 5 changed files with 2,011 additions and 45,014 deletions.
2,000 changes: 2,000 additions & 0 deletions test/assets/c4_test/data.json

Large diffs are not rendered by default.
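
For context, the new file holds the first 2,000 entries of the C4 training split, apparently one JSON object per line, judging by the 2,000-line diff. As a rough illustration (not part of this commit), such a subset could be produced by streaming the full dataset with the Hugging Face datasets library:

import itertools
import json

from datasets import load_dataset

# Stream allenai/c4 (English) so the 177M-entry dataset is never
# fully downloaded, then keep only the first 2,000 samples.
ds = load_dataset("allenai/c4", name="en", split="train", streaming=True)

with open("test/assets/c4_test/data.json", "w") as f:
    for sample in itertools.islice(ds, 2000):
        f.write(json.dumps(sample) + "\n")  # one JSON object per line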

4 changes: 2 additions & 2 deletions test/datasets/test_checkpoint.py
@@ -11,8 +11,8 @@

 class TestCheckpoint:
     def test_c4_resumption(self):
-        dataset_name = "c4_mini"
-        dataset_path = "./torchtitan/datasets/c4_mini"
+        dataset_name = "c4_test"
+        dataset_path = "./test/assets/c4_test"
         batch_size = 1
         seq_len = 1024
         world_size = 4
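The resumption test saves the data loader state mid-epoch and verifies that a fresh loader restored from it continues from the same position. A minimal sketch of that pattern (build_loader is a hypothetical factory; the real test in test/datasets/test_checkpoint.py differs in details):

def check_resumption(build_loader, warmup_steps=3, check_steps=2):
    # Consume a few batches, then snapshot the loader state.
    loader = build_loader()
    it = iter(loader)
    for _ in range(warmup_steps):
        next(it)
    state = loader.state_dict()

    # Restore into a fresh loader; it must resume where the first left off.
    resumed = build_loader()
    resumed.load_state_dict(state)
    resumed_it = iter(resumed)
    for _ in range(check_steps):
        expected, actual = next(it), next(resumed_it)
        for e, a in zip(expected, actual):  # (input, label) tensor pairs
            assert (e == a).all()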
45,000 changes: 0 additions & 45,000 deletions torchtitan/datasets/c4_mini/data.json

This file was deleted.

19 changes: 8 additions & 11 deletions torchtitan/datasets/hf_datasets.py
@@ -28,7 +28,7 @@
 # map from dataset name to a local directory, or
 # a dataset repository on the HF hub
 _supported_datasets = {
-    "c4_mini": "torchtitan/datasets/c4_mini",
+    "c4_test": "test/assets/c4_test",
     "c4": "allenai/c4",
 }
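
As the comment above notes, a mapping value is either a local directory or a repository id on the HF hub. A simplified sketch of how that distinction can be handled (the actual loading logic in hf_datasets.py is more involved):

import os

from datasets import load_dataset

def _resolve_dataset(dataset_name: str):
    path = _supported_datasets[dataset_name]
    if os.path.isdir(path):
        # Small local subset such as c4_test: load it from disk.
        return load_dataset(path, split="train")
    # Full c4 on the HF hub: stream it rather than download 177M entries.
    return load_dataset(path, name="en", split="train", streaming=True)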

@@ -48,8 +48,8 @@ class HuggingFaceDataset(IterableDataset, Stateful):
         rank (int): rank of the current data parallel process
         infinite (bool): whether to loop infinitely over the dataset

-    We currently support the c4 dataset and a subset of it:
-    c4_mini (45K training entries)
+    We currently support the c4 dataset, and a subset of it for testing purposes:
+    c4_test (2K training entries)
     c4 (177M training entries - this dataset is streamed due to the size)

     >> c4 (EN) <<:
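
For illustration, constructing the test subset with the arguments documented above might look like this (keyword names are inferred from the docstring and the checkpoint test; the exact signature is an assumption):

dataset = HuggingFaceDataset(
    dataset_name="c4_test",
    dataset_path="test/assets/c4_test",
    tokenizer=tokenizer,  # any tokenizer exposing an encode() method
    seq_len=1024,
    world_size=4,
    rank=0,
    infinite=True,  # re-loop the 2K entries instead of stopping
)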
@@ -83,12 +83,12 @@ def __init__(
         if dataset_path:
             logger.warning(
                 f"Dataset {dataset_name} is not tested or verified. "
-                f"Recommended datasets are: {list(_supported_datasets.keys())}."
+                f"Recommended datasets are: {list(_supported_datasets.keys())}"
             )
         else:
             raise ValueError(
                 f"Dataset {dataset_name} is not supported. "
-                f"Supported datasets are: {list(_supported_datasets.keys())}."
+                f"Supported datasets are: {list(_supported_datasets.keys())}"
             )

         if not dataset_path:
@@ -132,15 +132,12 @@ def __iter__(self):
                     yield input, label

             if not self.infinite:
-                logger.warning(f"Dataset {self.dataset_name} has run out of data.")
+                logger.warning(f"Dataset {self.dataset_name} has run out of data")
                 break
             else:
                 # Reset offset for the next iteration
                 self._sample_idx = 0
-                logger.warning(
-                    f"Dataset {self.dataset_name} is being re-looped. "
-                    "Loss related metrics might be misleading."
-                )
+                logger.warning(f"Dataset {self.dataset_name} is being re-looped")

     def _get_data_iter(self):
         if self._sample_idx == 0:
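
The hunk above only shortens the warnings; the control flow is unchanged: a finite pass that either stops or re-loops from a reset offset. A self-contained sketch of that pattern (illustrative names, not torchtitan's implementation):

class LoopingIterable:
    """Yield samples from a finite source, optionally re-looping forever."""

    def __init__(self, data, infinite: bool):
        self.data = data
        self.infinite = infinite
        self._sample_idx = 0  # offset to skip on resume or re-loop

    def __iter__(self):
        while True:
            for sample in self.data[self._sample_idx:]:
                self._sample_idx += 1
                yield sample
            if not self.infinite:
                break  # single pass: stop once the data runs out
            # Reset the offset for the next pass, as in the diff above.
            self._sample_idx = 0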
@@ -188,7 +185,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:

         if self._rank_id not in state_dict:
             logger.warning(
-                f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}."
+                f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}"
             )
             return
         super().load_state_dict(pickle.loads(state_dict[self._rank_id]))
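The load path above stores each data parallel rank's pickled loader state under a rank-specific key, so a rank restores only its own slice. A sketch of that convention (the key format here is hypothetical):

import pickle

def save_rank_state(loader, dp_rank: int) -> dict:
    # Each rank contributes one entry keyed by its rank id.
    return {f"dp_rank_{dp_rank}": pickle.dumps(loader.state_dict())}

def load_rank_state(loader, state: dict, dp_rank: int) -> None:
    key = f"dp_rank_{dp_rank}"
    if key not in state:
        # Mirrors the warning above: nothing was saved for this rank.
        return
    loader.load_state_dict(pickle.loads(state[key]))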
2 changes: 1 addition & 1 deletion train_configs/debug_model.toml
@@ -38,7 +38,7 @@ steps = 10
 data_parallel_degree = -1
 tensor_parallel_degree = 1
 compile = false
-dataset = "c4_mini"  # supported datasets: c4_mini (45K), c4 (177M)
+dataset = "c4_test"  # supported datasets: c4_test (2K), c4 (177M)

 [experimental]
 pipeline_parallel_degree = 1
