Describe the bug
Loading a dataset with the data_dir argument raises a NonMatchingSplitsSizesError if there are multiple data directories in the dataset repository.
This appears to happen because the expected split sizes are computed from the data in all of the directories, whereas the recorded split sizes are computed only from the data in the directory specified via the data_dir argument.
This is recent behavior: until a few weeks ago, loading with the data_dir argument worked without any issue.
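As a temporary workaround (a sketch on my end, not a proper fix), the check can be skipped by disabling verification, since verify_splits only runs under BASIC_CHECKS or ALL_CHECKS:

from datasets import load_dataset
# Workaround sketch: skip split-size verification entirely
# (assumes the data in data1 is trusted; no sizes are checked at all)
dataset = load_dataset("srehaag/hf-bug-temp", data_dir="data1", verification_mode="no_checks")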
Steps to reproduce the bug
Simple test dataset available here: https://huggingface.co/datasets/srehaag/hf-bug-temp
The dataset contains two directories, "data1" and "data2", each with a file called "train.parquet" holding a 5-row x 2-column table.
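For reference, an equivalent layout can be recreated locally along these lines (the column names here are made up; only the directory structure and row counts matter):

import pandas as pd
from pathlib import Path

# Hypothetical reconstruction of the repo layout: two directories,
# each containing a train.parquet with 5 rows and 2 columns
for subdir in ("data1", "data2"):
    Path(subdir).mkdir(exist_ok=True)
    pd.DataFrame({"col_a": range(5), "col_b": range(5)}).to_parquet(f"{subdir}/train.parquet")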
from datasets import load_dataset
dataset = load_dataset("srehaag/hf-bug-temp", data_dir = "data1")
Generates:
NonMatchingSplitsSizesError Traceback (most recent call last)
Cell In[3], line 2
1 from datasets import load_dataset
----> 2 dataset = load_dataset("srehaag/hf-bug-temp", data_dir = "data1")
File ~/.python/current/lib/python3.10/site-packages/datasets/load.py:2609, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, ignore_verifications, keep_in_memory, save_infos, revision, token, use_auth_token, task, streaming, num_proc, storage_options, trust_remote_code, **config_kwargs)
2606 return builder_instance.as_streaming_dataset(split=split)
2608 # Download and prepare data
-> 2609 builder_instance.download_and_prepare(
2610 download_config=download_config,
2611 download_mode=download_mode,
2612 verification_mode=verification_mode,
2613 num_proc=num_proc,
2614 storage_options=storage_options,
2615 )
2617 # Build dataset for splits
2618 keep_in_memory = (
2619 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
2620 )
File ~/.python/current/lib/python3.10/site-packages/datasets/builder.py:1027, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
1025 if num_proc is not None:
1026 prepare_split_kwargs["num_proc"] = num_proc
-> 1027 self._download_and_prepare(
1028 dl_manager=dl_manager,
1029 verification_mode=verification_mode,
1030 **prepare_split_kwargs,
1031 **download_and_prepare_kwargs,
1032 )
1033 # Sync info
1034 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File ~/.python/current/lib/python3.10/site-packages/datasets/builder.py:1140, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
1137 dl_manager.manage_extracted_files()
1139 if verification_mode == VerificationMode.BASIC_CHECKS or verification_mode == VerificationMode.ALL_CHECKS:
-> 1140 verify_splits(self.info.splits, split_dict)
1142 # Update the info object with the splits.
1143 self.info.splits = split_dict
File ~/.python/current/lib/python3.10/site-packages/datasets/utils/info_utils.py:101, in verify_splits(expected_splits, recorded_splits)
95 bad_splits = [
96 {"expected": expected_splits[name], "recorded": recorded_splits[name]}
97 for name in expected_splits
98 if expected_splits[name].num_examples != recorded_splits[name].num_examples
99 ]
100 if len(bad_splits) > 0:
--> 101 raise NonMatchingSplitsSizesError(str(bad_splits))
102 logger.info("All the splits matched successfully.")
NonMatchingSplitsSizesError: [{'expected': SplitInfo(name='train', num_bytes=212, num_examples=10, shard_lengths=None, dataset_name=None), 'recorded': SplitInfo(name='train', num_bytes=106, num_examples=5, shard_lengths=None, dataset_name='hf-bug-temp')}]
By contrast, this loads the data from both data1/train.parquet and data2/train.parquet without any error message:
from datasets import load_dataset
dataset = load_dataset("srehaag/hf-bug-temp")
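As a sanity check, that combined load yields a train split of 10 examples, matching the "expected" count in the error above:

# The combined split should report 10 examples (5 rows from each directory)
print(dataset["train"].num_rows)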
Expected behavior
It should load the 5 x 2 table from data1/train.parquet without raising an error.
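Until this is fixed, pointing data_files directly at the parquet file may also sidestep the check, since explicit data_files appear to bypass the repository's recorded split metadata (my understanding, not verified against the datasets internals):

from datasets import load_dataset
# Possible alternative: select the file explicitly instead of using data_dir
dataset = load_dataset("srehaag/hf-bug-temp", data_files="data1/train.parquet")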
Environment info
Used Codespaces to simplify the environment (versions below), but the bug is present across various configurations.
datasets version: 2.19.1
huggingface_hub version: 0.23.1
fsspec version: 2024.3.1