optimize the iteration when tokenizing large datasets (#332)
winglian authored Aug 4, 2023
1 parent 0d2e34f commit fe28543
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion src/axolotl/utils/data.py
@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
         LOG.info("tokenizing, merging, and shuffling master dataset")
 
         samples: List[int] = []
+        chunk_size = 1000
         for d in datasets:
-            samples = samples + list(d)
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
         dataset = Dataset.from_list(samples).shuffle(seed=seed)
         if cfg.local_rank == 0:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
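The change replaces per-dataset list concatenation (samples = samples + list(d)), which rebuilds the accumulated list on every pass and holds a second full copy of each dataset in memory, with an in-place samples.extend() over fixed-size chunks pulled by itertools.islice. Below is a minimal, standalone sketch of the same pattern; the merge_in_chunks helper and the make_dataset generator are illustrative stand-ins, not code from the repository.

import itertools
from typing import Iterable, Iterator, List


def merge_in_chunks(datasets: Iterable[Iterable[dict]], chunk_size: int = 1000) -> List[dict]:
    """Merge several iterables into one list, pulling chunk_size items at a time.

    samples.extend(chunk) appends in place, so the accumulated list is never
    rebuilt, unlike samples = samples + list(d), which copies it for each dataset.
    """
    samples: List[dict] = []
    for d in datasets:
        d_iter = iter(d)
        while True:
            chunk = list(itertools.islice(d_iter, chunk_size))
            if not chunk:  # iterator exhausted
                break
            samples.extend(chunk)
    return samples


def make_dataset(n: int) -> Iterator[dict]:
    """Illustrative stand-in for a tokenized dataset: yields n rows lazily."""
    return ({"input_ids": [i]} for i in range(n))


if __name__ == "__main__":
    merged = merge_in_chunks([make_dataset(2500), make_dataset(750)])
    print(len(merged))  # 3250

Drawing at most chunk_size rows per step keeps peak memory bounded by the accumulated list plus one chunk, rather than the accumulated list plus a full copy of the dataset being merged.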
