Skip to content

Commit

Permalink
fix mmap
Browse files (browse the repository at this point in the history)
  • Loading branch information
greycooker committed Aug 2, 2024
1 parent 4665ccf commit e988cf5
Showing 1 changed file with 5 additions and 5 deletions.
10 changes: 5 additions & 5 deletions paddlenlp/data/indexed_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -69,13 +69,13 @@ def make_dataset(path, impl, skip_warmup=False):
return None


def make_sft_dataset(path, impl, dataclass, skip_warmup=False):
def make_sft_dataset(path, dataclass, skip_warmup=False, impl="mmap"):
if impl != "mmap":
raise ValueError("SFT Indexed Dataset only support mmap memory-mapped method temporarily")

Check warning on line 74 in paddlenlp/data/indexed_dataset.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/data/indexed_dataset.py#L73-L74

Added lines #L73 - L74 were not covered by tests

print_rank_0(" > building dataset index ...")
start_time = time.time()
sft_indexed_dataset = SFT_MMapIndexedDataset(path, dataclass, skip_warmup)
sft_indexed_dataset = SftMMapIndexedDataset(path, dataclass, skip_warmup)
print_rank_0(" > finished creating SFT indexed dataset in {:4f} " "seconds".format(time.time() - start_time))
print_rank_0(" number of samples: {}".format(len(sft_indexed_dataset.doc_idx) - 1))

Check warning on line 80 in paddlenlp/data/indexed_dataset.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/data/indexed_dataset.py#L76-L80

Added lines #L76 - L80 were not covered by tests

Expand Down Expand Up @@ -574,7 +574,7 @@ def exists(path):
return os.path.exists(index_file_path(path)) and os.path.exists(data_file_path(path))


class SFT_MMapIndexedDataset(paddle.io.Dataset):
class SftMMapIndexedDataset(paddle.io.Dataset):
class Index(object):
_HDR_MAGIC = b"MMIDIDX\x00\x00"

Expand Down Expand Up @@ -798,7 +798,7 @@ def make_builder(out_file, impl, save_dtype, loss_mask_file=None):
return IndexedDatasetBuilder(out_file, dtype=save_dtype)


class SFT_MMapIndexedDatasetBuilder(object):
class SftMMapIndexedDatasetBuilder(object):
def __init__(self, output_file_dict, dtype):
self._data_file_dict = {}
for key, filename in output_file_dict.items():
Expand All @@ -823,7 +823,7 @@ def end_document(self):
def finalize(self, index_file):
for key, filename in self._data_file_dict.items():
filename.close()
with SFT_MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
with SftMMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
index.write(self._sizes, self._doc_idx)

Check warning on line 827 in paddlenlp/data/indexed_dataset.py

View check run for this annotation

Codecov / codecov/patch

paddlenlp/data/indexed_dataset.py#L824-L827

Added lines #L824 - L827 were not covered by tests


Expand Down

0 comments on commit e988cf5

Please sign in to comment.