Refac module factory + avoid etag requests for hub datasets (#2986)
* refac module factory + avoid etag requests for hub datasets

* fix tests

* typing

* fixes

* prepare timeout

* fix offline simulator with huggingface_hub

* add module factory tests (1/N)

* add module factory test (2/N)

* add data files tests (1/N)

* add data files tests (2/N)

* add data files tests (3/N)

* style

* docstrings

* don't update counts when running tests

* bump huggingface_hub

* add timeouts for offline mode

* minor

* minor bis

* install ruamel-yaml properly in the CI for windows

* fix windows test

* style

* fix comet intensive calls patcher

* warning message when loading from the master branch

* style

* albert's comments

* remove unnecessary check

* don't use master if HF_SCRIPTS_VERSION is specified
lhoestq authored Oct 11, 2021
1 parent 93c828b commit d86c7fb
Showing 12 changed files with 1,741 additions and 784 deletions.
8 changes: 4 additions & 4 deletions metrics/comet/comet.py
@@ -35,7 +35,7 @@
```
"""

from comet.models import download_model # From: unbabel-comet
import comet.models # From: unbabel-comet

import datasets

@@ -121,7 +121,7 @@ def _info(self):
"references": datasets.Value("string", id="sequence"),
}
),
codebase_urls=["hhttps://github.com/Unbabel/COMET"],
codebase_urls=["https://github.com/Unbabel/COMET"],
reference_urls=[
"https://github.com/Unbabel/COMET",
"https://www.aclweb.org/anthology/2020.emnlp-main.213/",
@@ -131,9 +131,9 @@

def _download_and_prepare(self, dl_manager):
if self.config_name == "default":
self.scorer = download_model("wmt-large-da-estimator-1719")
self.scorer = comet.models.download_model("wmt-large-da-estimator-1719")
else:
self.scorer = download_model(self.config_name)
self.scorer = comet.models.download_model(self.config_name)

def _compute(self, sources, predictions, references, cuda=True, show_progress=False):
data = {"src": sources, "mt": predictions, "ref": references}
2 changes: 1 addition & 1 deletion setup.py
@@ -102,7 +102,7 @@
# for data streaming via http
"aiohttp",
# To get datasets from the Datasets Hub on huggingface.co
"huggingface_hub>=0.0.14,<0.1.0",
"huggingface_hub>=0.0.18,<0.1.0",
# Utilities from PyPA to e.g., compare versions
"packaging",
]
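The bumped pin accepts any later `0.0.x` patch release while still excluding the `0.1.0` line. A quick sketch of the range semantics, using the `packaging` utilities that this same dependency list pulls in:

```python
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=0.0.18,<0.1.0")

print(Version("0.0.17") in spec)  # False: older than the new minimum
print(Version("0.0.19") in spec)  # True: later 0.0.x releases are fine
print(Version("0.1.0") in spec)   # False: the next minor line is excluded
```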
93 changes: 27 additions & 66 deletions src/datasets/builder.py
@@ -26,12 +26,10 @@
import urllib
from dataclasses import dataclass
from functools import partial
from pathlib import PurePath
from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union
from typing import Dict, Mapping, Optional, Tuple, Union

from datasets.features import Features
from datasets.utils.mock_download_manager import MockDownloadManager
from datasets.utils.py_utils import map_nested

from . import config, utils
from .arrow_dataset import Dataset
@@ -43,6 +41,7 @@
ReadInstruction,
)
from .arrow_writer import ArrowWriter, BeamWriter
from .data_files import DataFilesDict, _sanitize_patterns
from .dataset_dict import DatasetDict, IterableDatasetDict
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
@@ -51,7 +50,7 @@
from .splits import Split, SplitDict, SplitGenerator
from .utils import logging
from .utils.download_manager import DownloadManager, GenerateMode
from .utils.file_utils import DownloadConfig, is_relative_path, is_remote_url, request_etags, url_or_path_join
from .utils.file_utils import DownloadConfig, is_remote_url
from .utils.filelock import FileLock
from .utils.info_utils import get_size_checksum_dict, verify_checksums, verify_splits
from .utils.streaming_download_manager import StreamingDownloadManager
@@ -90,7 +89,7 @@ class BuilderConfig:
name: str = "default"
version: Optional[Union[str, utils.Version]] = "0.0.0"
data_dir: Optional[str] = None
data_files: Optional[Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]] = None
data_files: Optional[DataFilesDict] = None
description: Optional[str] = None

def __post_init__(self):
@@ -99,12 +98,11 @@ def __post_init__(self):
for invalid_char in invalid_windows_characters:
if invalid_char in self.name:
raise InvalidConfigName(
(
"Bad characters from black list '{}' found in '{}'. "
"They could create issues when creating a directory "
"for this config on Windows filesystem."
).format(invalid_windows_characters, self.name)
f"Bad characters from black list '{invalid_windows_characters}' found in '{self.name}'. "
f"They could create issues when creating a directory for this config on Windows filesystem."
)
if self.data_files is not None and not isinstance(self.data_files, DataFilesDict):
raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")

def __eq__(self, o):
# we need to override the default dataclass __eq__ since it doesn't check for
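Together with the field's new `DataFilesDict` type, the `__post_init__` guard above means raw patterns can no longer reach a config unresolved; the resolution now happens upstream in the `DatasetBuilder` constructor (see the `from_local_or_remote` call further down). A small sketch of the failure mode the guard introduces (hypothetical usage, not code from this commit):

```python
from datasets.builder import BuilderConfig

# Raw patterns are now rejected at config construction time; callers are
# expected to resolve them into a DataFilesDict first.
try:
    BuilderConfig(name="demo", data_files="data/*.csv")
except ValueError as err:
    print(err)  # Expected a DataFilesDict in data_files but got data/*.csv
```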
@@ -117,8 +115,6 @@ def create_config_id(
self,
config_kwargs: dict,
custom_features: Optional[Features] = None,
use_auth_token: Optional[Union[bool, str]] = None,
base_path: Optional[Union[bool, str]] = None,
) -> str:
"""
The config id is used to build the cache directory.
@@ -136,14 +132,12 @@
# name and version are already used to build the cache directory
config_kwargs_to_add_to_suffix.pop("name", None)
config_kwargs_to_add_to_suffix.pop("version", None)
# data files are handled differently
config_kwargs_to_add_to_suffix.pop("data_files", None)
# data dir handling (when specified it points to the manually downloaded data):
# it was previously ignored before the introduction of config id because we didn't want
# to change the config name. Now it's fine to take it into account for the config id.
# config_kwargs_to_add_to_suffix.pop("data_dir", None)
if "data_dir" in config_kwargs_to_add_to_suffix and config_kwargs_to_add_to_suffix["data_dir"] is None:
del config_kwargs_to_add_to_suffix["data_dir"]
config_kwargs_to_add_to_suffix.pop("data_dir", None)
if config_kwargs_to_add_to_suffix:
# we don't care about the order of the kwargs
config_kwargs_to_add_to_suffix = {
@@ -158,49 +152,6 @@
else:
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)

if self.data_files is not None:
m = Hasher()
if suffix:
m.update(suffix)
if isinstance(self.data_files, str):
data_files = {"train": [self.data_files]}
elif isinstance(self.data_files, (tuple, list)):
data_files = {"train": self.data_files}
elif isinstance(self.data_files, dict):
data_files = {
str(key): files if isinstance(files, (tuple, list)) else [files]
for key, files in self.data_files.items()
}
else:
raise ValueError("Please provide a valid `data_files` in `DatasetBuilder`")

def abspath(data_file) -> str:
data_file = data_file.as_posix() if isinstance(data_file, PurePath) else str(data_file)
return url_or_path_join(base_path, data_file) if is_relative_path(data_file) else data_file

data_files: Dict[str, List[str]] = map_nested(abspath, data_files)
remote_urls = [
data_file for key in data_files for data_file in data_files[key] if is_remote_url(data_file)
]
etags = dict(
zip(
remote_urls,
request_etags(
remote_urls, use_auth_token=use_auth_token, tqdm_kwargs={"desc": "Check remote data files"}
),
)
)
for key in sorted(data_files.keys()):
m.update(key)
for data_file in data_files[key]:
if is_remote_url(data_file):
m.update(data_file)
m.update(etags[data_file])
else:
m.update(os.path.abspath(data_file))
m.update(str(os.path.getmtime(data_file)))
suffix = m.hexdigest()

if custom_features is not None:
m = Hasher()
if suffix:
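The block deleted above is the heart of the "avoid etag requests" part of this commit: building the cache key used to cost one ETag request per remote data file, before any data was downloaded. A condensed sketch of the removed logic (simplified names; the `request_etag` callable stands in for the batched `request_etags` helper, and `sha256` for the internal `Hasher`):

```python
import os
from hashlib import sha256

def old_config_suffix(data_files: dict, request_etag) -> str:
    # data_files: {"train": [...], ...} after normalization.
    # request_etag: callable returning the ETag string for a URL.
    m = sha256()
    for key in sorted(data_files):
        m.update(key.encode())
        for data_file in data_files[key]:
            if data_file.startswith(("http://", "https://")):
                m.update(data_file.encode())
                m.update(request_etag(data_file).encode())  # one HTTP round-trip per file
            else:
                m.update(os.path.abspath(data_file).encode())        # local files are cheap:
                m.update(str(os.path.getmtime(data_file)).encode())  # path + mtime, no network
    return m.hexdigest()
```

After this commit the suffix is derived from the resolved `DataFilesDict` itself, so hub datasets no longer pay a per-file round-trip here.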
@@ -255,6 +206,8 @@ def __init__(
features: Optional[Features] = None,
use_auth_token: Optional[Union[bool, str]] = None,
namespace: Optional[str] = None,
data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
data_dir: Optional[str] = None,
**config_kwargs,
):
"""Constructs a DatasetBuilder.
@@ -274,6 +227,12 @@
It can be used to change the :obj:`datasets.Features` description of a dataset, for example.
use_auth_token (:obj:`str` or :obj:`bool`, optional): Optional string or boolean to use as Bearer token
for remote files on the Datasets Hub. If True, will get token from ``"~/.huggingface"``.
namespace: `str`, used to separate builders with the same name but not coming from the same namespace.
For example to separate "squad" from "lhoestq/squad" (the builder name would be "lhoestq___squad").
data_files: for builders like "csv" or "json" that need the user to specify data files. They can be either
local or remote files. For convenience you can use a DataFilesDict.
data_dir: `str`, for builders that require manual download. It must be the path to the local directory containing
the manually downloaded data.
config_kwargs: will override the defaults kwargs in config
"""
@@ -284,14 +243,18 @@
self.use_auth_token = use_auth_token
self.namespace = namespace

if data_files is not None and not isinstance(data_files, DataFilesDict):
data_files = DataFilesDict.from_local_or_remote(
_sanitize_patterns(data_files), base_path=base_path, use_auth_token=use_auth_token
)

# Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
if "features" in inspect.signature(self.BUILDER_CONFIG_CLASS.__init__).parameters and features is not None:
config_kwargs["features"] = features
# Discard default config parameters
if "data_files" in config_kwargs and config_kwargs["data_files"] is None:
del config_kwargs["data_files"]
if "data_dir" in config_kwargs and config_kwargs["data_dir"] is None:
del config_kwargs["data_dir"]
if data_files is not None:
config_kwargs["data_files"] = data_files
if data_dir is not None:
config_kwargs["data_dir"] = data_dir
self.config, self.config_id = self._create_builder_config(
name,
custom_features=features,
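The constructor keeps accepting the same loose shapes as before (`str`, `list`, `dict`), but now funnels them through `_sanitize_patterns` and resolves them once into a `DataFilesDict`. A simplified sketch of the normalization step, mirroring the shapes the deleted `create_config_id` code used to handle inline (a hypothetical stand-in for `_sanitize_patterns`, not its actual source):

```python
from typing import Dict, List, Union

def sanitize_patterns(data_files: Union[str, list, tuple, dict]) -> Dict[str, List[str]]:
    # Normalize every accepted shape to {split_name: [patterns]}.
    if isinstance(data_files, str):
        return {"train": [data_files]}      # single pattern -> default "train" split
    if isinstance(data_files, (tuple, list)):
        return {"train": list(data_files)}  # flat list -> default "train" split
    if isinstance(data_files, dict):
        return {
            str(key): list(files) if isinstance(files, (tuple, list)) else [files]
            for key, files in data_files.items()
        }
    raise ValueError(f"Expected str, list or dict for data_files, got {type(data_files)}")
```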
@@ -412,8 +375,6 @@ def _create_builder_config(self, name=None, custom_features=None, **config_kwarg
config_id = builder_config.create_config_id(
config_kwargs,
custom_features=custom_features,
use_auth_token=self.use_auth_token,
base_path=self.base_path if self.base_path is not None else "",
)
is_custom = config_id not in self.builder_configs
if is_custom:
@@ -1193,7 +1154,7 @@ def _prepare_split(self, split_generator):
generator = self._generate_tables(**split_generator.gen_kwargs)
with ArrowWriter(features=self.info.features, path=fpath) as writer:
for key, table in utils.tqdm(
generator, unit=" tables", leave=False, disable=bool(logging.get_verbosity() == logging.NOTSET)
generator, unit=" tables", leave=False, disable=True # bool(logging.get_verbosity() == logging.NOTSET)
):
writer.write_table(table)
num_examples, num_bytes = writer.finalize()
13 changes: 7 additions & 6 deletions src/datasets/config.py
@@ -110,7 +110,7 @@
USE_BEAM = os.environ.get("USE_BEAM", "AUTO").upper()
BEAM_VERSION = "N/A"
BEAM_AVAILABLE = False
if USE_BEAM in ("1", "ON", "YES", "AUTO"):
if USE_BEAM in ENV_VARS_TRUE_AND_AUTO_VALUES:
try:
BEAM_VERSION = version.parse(importlib_metadata.version("apache_beam"))
BEAM_AVAILABLE = True
@@ -150,6 +150,11 @@
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))

# Download count for the website
HF_UPDATE_DOWNLOAD_COUNTS = (
os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)

# Batch size constants. For more info, see:
# https://github.com/apache/arrow/blob/master/docs/source/cpp/arrays.rst#size-limitations-and-recommendations)
DEFAULT_MAX_BATCH_SIZE = 10_000
@@ -159,11 +164,7 @@
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30

# Offline mode
HF_DATASETS_OFFLINE = os.environ.get("HF_DATASETS_OFFLINE", "AUTO").upper()
if HF_DATASETS_OFFLINE in ("1", "ON", "YES"):
HF_DATASETS_OFFLINE = True
else:
HF_DATASETS_OFFLINE = False
HF_DATASETS_OFFLINE = os.environ.get("HF_DATASETS_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES

# In-memory
DEFAULT_IN_MEMORY_MAX_SIZE = 0 # Disabled
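Both rewrites above lean on the shared truthiness sets defined near the top of config.py. A sketch of the convention they encode (the exact set contents here are an assumption based on the names, not copied from this diff):

```python
import os

# Assumed definitions from earlier in config.py:
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})

# HF_DATASETS_OFFLINE tests against the true-only set, so its "AUTO" default
# leaves offline mode OFF unless the user opts in explicitly:
offline = os.environ.get("HF_DATASETS_OFFLINE", "AUTO").upper() in ENV_VARS_TRUE_VALUES

# HF_UPDATE_DOWNLOAD_COUNTS tests against the true-and-auto set, so it defaults
# to ON (the commit then disables it when running the test suite):
update_counts = os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
```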

1 comment on commit d86c7fb

@github-actions

PyArrow==3.0.0


Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.013707 / 0.011353 (0.002354) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005296 / 0.011008 (-0.005712) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.046515 / 0.038508 (0.008006) |
| read_batch_unformated after write_array2d | 0.052562 / 0.023109 (0.029453) |
| read_batch_unformated after write_flattened_sequence | 0.374353 / 0.275898 (0.098455) |
| read_batch_unformated after write_nested_sequence | 0.431381 / 0.323480 (0.107901) |
| read_col_formatted_as_numpy after write_array2d | 0.011333 / 0.007986 (0.003348) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005625 / 0.004328 (0.001297) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.012527 / 0.004250 (0.008276) |
| read_col_unformated after write_array2d | 0.052494 / 0.037052 (0.015441) |
| read_col_unformated after write_flattened_sequence | 0.376668 / 0.258489 (0.118179) |
| read_col_unformated after write_nested_sequence | 0.425490 / 0.293841 (0.131649) |
| read_formatted_as_numpy after write_array2d | 0.038780 / 0.128546 (-0.089766) |
| read_formatted_as_numpy after write_flattened_sequence | 0.013903 / 0.075646 (-0.061743) |
| read_formatted_as_numpy after write_nested_sequence | 0.392101 / 0.419271 (-0.027170) |
| read_unformated after write_array2d | 0.061380 / 0.043533 (0.017848) |
| read_unformated after write_flattened_sequence | 0.384461 / 0.255139 (0.129322) |
| read_unformated after write_nested_sequence | 0.442918 / 0.283200 (0.159718) |
| write_array2d | 0.107773 / 0.141683 (-0.033910) |
| write_flattened_sequence | 2.185674 / 1.452155 (0.733520) |
| write_nested_sequence | 2.252641 / 1.492716 (0.759925) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.300608 / 0.018006 (0.282601) |
| get_batch_of_1024_rows | 0.708919 / 0.000490 (0.708430) |
| get_first_row | 0.017104 / 0.000200 (0.016904) |
| get_last_row | 0.000720 / 0.000054 (0.000666) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.044925 / 0.037411 (0.007513) |
| shard | 0.029474 / 0.014526 (0.014949) |
| shuffle | 0.042798 / 0.176557 (-0.133758) |
| sort | 0.155161 / 0.737135 (-0.581974) |
| train_test_split | 0.035459 / 0.296338 (-0.260879) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.501560 / 0.215209 (0.286351) |
| read 50000 | 4.900143 / 2.077655 (2.822488) |
| read_batch 50000 10 | 2.357814 / 1.504120 (0.853694) |
| read_batch 50000 100 | 2.061142 / 1.541195 (0.519947) |
| read_batch 50000 1000 | 2.061240 / 1.468490 (0.592750) |
| read_formatted numpy 5000 | 0.520887 / 4.584777 (-4.063890) |
| read_formatted pandas 5000 | 6.773882 / 3.745712 (3.028170) |
| read_formatted tensorflow 5000 | 1.502592 / 5.269862 (-3.767270) |
| read_formatted torch 5000 | 1.386804 / 4.565676 (-3.178872) |
| read_formatted_batch numpy 5000 10 | 0.060715 / 0.424275 (-0.363560) |
| read_formatted_batch numpy 5000 1000 | 0.006073 / 0.007607 (-0.001534) |
| shuffled read 5000 | 0.636648 / 0.226044 (0.410604) |
| shuffled read 50000 | 6.703033 / 2.268929 (4.434105) |
| shuffled read_batch 50000 10 | 3.187333 / 55.444624 (-52.257291) |
| shuffled read_batch 50000 100 | 2.908307 / 6.876477 (-3.968170) |
| shuffled read_batch 50000 1000 | 2.735220 / 2.142072 (0.593148) |
| shuffled read_formatted numpy 5000 | 0.701273 / 4.805227 (-4.103954) |
| shuffled read_formatted_batch numpy 5000 10 | 0.154261 / 6.500664 (-6.346403) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.078503 / 0.075469 (0.003034) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.255591 / 1.841788 (-0.586197) |
| map fast-tokenizer batched | 18.331032 / 8.074308 (10.256724) |
| map identity | 36.878939 / 10.191392 (26.687547) |
| map identity batched | 1.063622 / 0.680424 (0.383198) |
| map no-op batched | 0.688899 / 0.534201 (0.154698) |
| map no-op batched numpy | 0.291041 / 0.579283 (-0.288242) |
| map no-op batched pandas | 0.742795 / 0.434364 (0.308431) |
| map no-op batched pytorch | 0.244284 / 0.540337 (-0.296054) |
| map no-op batched tensorflow | 0.253090 / 1.386936 (-1.133846) |
PyArrow==latest

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.011462 / 0.011353 (0.000109) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.004170 / 0.011008 (-0.006838) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.037338 / 0.038508 (-0.001170) |
| read_batch_unformated after write_array2d | 0.035610 / 0.023109 (0.012501) |
| read_batch_unformated after write_flattened_sequence | 0.327434 / 0.275898 (0.051536) |
| read_batch_unformated after write_nested_sequence | 0.357336 / 0.323480 (0.033856) |
| read_col_formatted_as_numpy after write_array2d | 0.008314 / 0.007986 (0.000329) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005151 / 0.004328 (0.000823) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.010414 / 0.004250 (0.006164) |
| read_col_unformated after write_array2d | 0.045100 / 0.037052 (0.008048) |
| read_col_unformated after write_flattened_sequence | 0.327824 / 0.258489 (0.069335) |
| read_col_unformated after write_nested_sequence | 0.381647 / 0.293841 (0.087806) |
| read_formatted_as_numpy after write_array2d | 0.032715 / 0.128546 (-0.095831) |
| read_formatted_as_numpy after write_flattened_sequence | 0.011789 / 0.075646 (-0.063857) |
| read_formatted_as_numpy after write_nested_sequence | 0.296289 / 0.419271 (-0.122983) |
| read_unformated after write_array2d | 0.056885 / 0.043533 (0.013352) |
| read_unformated after write_flattened_sequence | 0.356342 / 0.255139 (0.101203) |
| read_unformated after write_nested_sequence | 0.391682 / 0.283200 (0.108482) |
| write_array2d | 0.088525 / 0.141683 (-0.053157) |
| write_flattened_sequence | 1.829910 / 1.452155 (0.377756) |
| write_nested_sequence | 2.043972 / 1.492716 (0.551256) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.220336 / 0.018006 (0.202330) |
| get_batch_of_1024_rows | 0.510172 / 0.000490 (0.509682) |
| get_first_row | 0.012486 / 0.000200 (0.012286) |
| get_last_row | 0.000093 / 0.000054 (0.000038) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042658 / 0.037411 (0.005247) |
| shard | 0.029973 / 0.014526 (0.015447) |
| shuffle | 0.038549 / 0.176557 (-0.138008) |
| sort | 0.147973 / 0.737135 (-0.589163) |
| train_test_split | 0.038967 / 0.296338 (-0.257371) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.560686 / 0.215209 (0.345477) |
| read 50000 | 5.166777 / 2.077655 (3.089122) |
| read_batch 50000 10 | 2.355011 / 1.504120 (0.850891) |
| read_batch 50000 100 | 2.126085 / 1.541195 (0.584890) |
| read_batch 50000 1000 | 2.140159 / 1.468490 (0.671668) |
| read_formatted numpy 5000 | 0.532240 / 4.584777 (-4.052537) |
| read_formatted pandas 5000 | 6.719808 / 3.745712 (2.974096) |
| read_formatted tensorflow 5000 | 1.477421 / 5.269862 (-3.792440) |
| read_formatted torch 5000 | 1.384164 / 4.565676 (-3.181512) |
| read_formatted_batch numpy 5000 10 | 0.062177 / 0.424275 (-0.362098) |
| read_formatted_batch numpy 5000 1000 | 0.005989 / 0.007607 (-0.001618) |
| shuffled read 5000 | 0.654042 / 0.226044 (0.427998) |
| shuffled read 50000 | 6.569254 / 2.268929 (4.300326) |
| shuffled read_batch 50000 10 | 3.085953 / 55.444624 (-52.358671) |
| shuffled read_batch 50000 100 | 2.447469 / 6.876477 (-4.429008) |
| shuffled read_batch 50000 1000 | 2.467565 / 2.142072 (0.325493) |
| shuffled read_formatted numpy 5000 | 0.718062 / 4.805227 (-4.087166) |
| shuffled read_formatted_batch numpy 5000 10 | 0.162212 / 6.500664 (-6.338452) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.068412 / 0.075469 (-0.007058) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.024061 / 1.841788 (-0.817727) |
| map fast-tokenizer batched | 13.672041 / 8.074308 (5.597733) |
| map identity | 30.616974 / 10.191392 (20.425582) |
| map identity batched | 0.879203 / 0.680424 (0.198779) |
| map no-op batched | 0.644682 / 0.534201 (0.110481) |
| map no-op batched numpy | 0.263682 / 0.579283 (-0.315601) |
| map no-op batched pandas | 0.715525 / 0.434364 (0.281161) |
| map no-op batched pytorch | 0.237246 / 0.540337 (-0.303091) |
| map no-op batched tensorflow | 0.268090 / 1.386936 (-1.118846) |

