From 5438da7c801bd24d6713de8ef4c15836d9a8095f Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Mon, 29 Jul 2024 17:43:09 +0800
Subject: [PATCH 1/2] update

---
 paddlenlp/data/causal_dataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py
index b6551b5fd1c8..25da86cf1a33 100644
--- a/paddlenlp/data/causal_dataset.py
+++ b/paddlenlp/data/causal_dataset.py
@@ -147,7 +147,9 @@ def build_train_valid_test_datasets(
     # Parse the values.
     output = get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples)
     prefixes, weights, datasets_train_valid_test_num_samples = output
-    train_num_samples, valid_num_samples, test_num_samples = map(sum, zip(*datasets_train_valid_test_num_samples))
+    # NOTE: megatron/gpt_dataset.py has been updated. When creating BlendableDataset, we will use the raw train_val_test_num_samples instead of the expanded ones.
+    # Please refer to https://github.com/NVIDIA/NeMo/blob/72f630d087d45655b1a069dc72debf01dfdbdb2d/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py#L74-L80 for more information
+    train_num_samples, valid_num_samples, test_num_samples = datasets_train_valid_test_num_samples
 
     # Build individual datasets.
     train_datasets = []

From f9457d64f9e4d6fa45229ddd009c3f37f4c31ec8 Mon Sep 17 00:00:00 2001
From: yujun <573009727@qq.com>
Date: Tue, 30 Jul 2024 11:52:22 +0800
Subject: [PATCH 2/2] train_val_test_num_samples

---
 paddlenlp/data/causal_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/data/causal_dataset.py b/paddlenlp/data/causal_dataset.py
index 25da86cf1a33..9d70506205ac 100644
--- a/paddlenlp/data/causal_dataset.py
+++ b/paddlenlp/data/causal_dataset.py
@@ -149,7 +149,7 @@ def build_train_valid_test_datasets(
     prefixes, weights, datasets_train_valid_test_num_samples = output
     # NOTE: megatron/gpt_dataset.py has been updated. When creating BlendableDataset, we will use the raw train_val_test_num_samples instead of the expanded ones.
     # Please refer to https://github.com/NVIDIA/NeMo/blob/72f630d087d45655b1a069dc72debf01dfdbdb2d/nemo/collections/nlp/data/language_modeling/megatron/gpt_dataset.py#L74-L80 for more information
-    train_num_samples, valid_num_samples, test_num_samples = datasets_train_valid_test_num_samples
+    train_num_samples, valid_num_samples, test_num_samples = train_val_test_num_samples

    # Build individual datasets.
     train_datasets = []
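
For context, a minimal sketch of why the pre- and post-patch assignments differ. It assumes (as in Megatron-style data pipelines) that get_datasets_weights_and_num_samples expands the requested totals into per-dataset sample counts, oversampled by a small safety factor so BlendableDataset never runs out of samples; the toy helper, the 1.005 factor, and the numbers below are illustrative, not the actual PaddleNLP implementation.

```python
import math

def get_datasets_weights_and_num_samples(data_prefix, train_val_test_num_samples):
    # Hypothetical stand-in for the real helper. data_prefix alternates
    # weight and path, e.g. [0.3, "corpus_a", 0.7, "corpus_b"].
    weights = [float(w) for w in data_prefix[::2]]
    prefixes = [str(p) for p in data_prefix[1::2]]
    total = sum(weights)
    weights = [w / total for w in weights]
    # Megatron-style expansion: scale each split's total by the dataset
    # weight, oversampling by ~0.5% as a safety margin.
    datasets_train_valid_test_num_samples = [
        [int(math.ceil(n * w * 1.005)) for n in train_val_test_num_samples]
        for w in weights
    ]
    return prefixes, weights, datasets_train_valid_test_num_samples

train_val_test_num_samples = [1000, 100, 10]
prefixes, weights, expanded = get_datasets_weights_and_num_samples(
    [0.3, "corpus_a", 0.7, "corpus_b"], train_val_test_num_samples
)
print(expanded)                        # [[302, 31, 4], [704, 71, 8]]  per-dataset counts
print(list(map(sum, zip(*expanded))))  # [1006, 102, 12]  oversampled totals (pre-patch)
print(train_val_test_num_samples)      # [1000, 100, 10]  raw totals (post-patch)
```

Under these assumptions, the pre-patch `map(sum, zip(*datasets_train_valid_test_num_samples))` produces the oversampled totals, while the patched code passes the raw `train_val_test_num_samples` through to BlendableDataset, matching the referenced NeMo change.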