[GH->HF] Load datasets from the Hub #4973

Closed · wants to merge 3 commits
1 change: 0 additions & 1 deletion .github/workflows/ci.yml
@@ -9,7 +9,6 @@ on:
       - main
 
 env:
-  HF_SCRIPTS_VERSION: main
   HF_ALLOW_CODE_EVAL: 1
 
 jobs:
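With this deletion, HF_SCRIPTS_VERSION is no longer consulted by the code paths touched in this PR (the os.getenv lookups that honored it are removed below). A minimal sketch of the override that stops working, for illustration only:

    import os

    # Previously selected the GitHub branch/tag that dataset and metric
    # scripts were fetched from; after this PR the variable is ignored.
    os.environ["HF_SCRIPTS_VERSION"] = "main"
    import datasets  # noqa: F401  -- dataset scripts now resolve on the Hub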
2 changes: 0 additions & 2 deletions src/datasets/__init__.py
@@ -36,8 +36,6 @@
     "If you are running this in a Google Colab, you should probably just restart the runtime to use the right version of `pyarrow`."
 )
 
-SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
-
 del platform
 del pyarrow
 del version
196 changes: 49 additions & 147 deletions src/datasets/load.py
@@ -424,93 +424,6 @@ def get_module(self) -> MetricModule:
         raise NotImplementedError
 
 
-class GithubDatasetModuleFactory(_DatasetModuleFactory):
-    """
-    Get the module of a dataset from GitHub (legacy).
-    The dataset script is downloaded from GitHub.
-    This class will eventually be removed and a HubDatasetModuleFactory will be used instead.
-    """
-
-    def __init__(
-        self,
-        name: str,
-        revision: Optional[Union[str, Version]] = None,
-        download_config: Optional[DownloadConfig] = None,
-        download_mode: Optional[DownloadMode] = None,
-        dynamic_modules_path: Optional[str] = None,
-    ):
-        self.name = name
-        self.revision = revision
-        self.download_config = download_config.copy() if download_config else DownloadConfig()
-        if self.download_config.max_retries < 3:
-            self.download_config.max_retries = 3
-        self.download_mode = download_mode
-        self.dynamic_modules_path = dynamic_modules_path
-        assert self.name.count("/") == 0
-        increase_load_count(name, resource_type="dataset")
-
-    def download_loading_script(self, revision: Optional[str]) -> str:
-        file_path = hf_github_url(path=self.name, name=self.name + ".py", revision=revision)
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading builder script"
-        return cached_path(file_path, download_config=download_config)
-
-    def download_dataset_infos_file(self, revision: Optional[str]) -> str:
-        dataset_infos = hf_github_url(path=self.name, name=config.DATASETDICT_INFOS_FILENAME, revision=revision)
-        # Download the dataset infos file if available
-        download_config = self.download_config.copy()
-        if download_config.download_desc is None:
-            download_config.download_desc = "Downloading metadata"
-        try:
-            return cached_path(
-                dataset_infos,
-                download_config=download_config,
-            )
-        except (FileNotFoundError, ConnectionError):
-            return None
-
-    def get_module(self) -> DatasetModule:
-        # get script and other files
-        revision = self.revision
-        try:
-            local_path = self.download_loading_script(revision)
-        except FileNotFoundError:
-            if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
-                raise
-            else:
-                revision = "main"
-                local_path = self.download_loading_script(revision)
-                logger.warning(
-                    f"Couldn't find a directory or a dataset named '{self.name}' in this version. "
-                    f"It was picked from the main branch on github instead."
-                )
-        dataset_infos_path = self.download_dataset_infos_file(revision)
-        imports = get_imports(local_path)
-        local_imports = _download_additional_modules(
-            name=self.name,
-            base_path=hf_github_url(path=self.name, name="", revision=revision),
-            imports=imports,
-            download_config=self.download_config,
-        )
-        additional_files = [(config.DATASETDICT_INFOS_FILENAME, dataset_infos_path)] if dataset_infos_path else []
-        # copy the script and the files in an importable directory
-        dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
-        module_path, hash = _create_importable_file(
-            local_path=local_path,
-            local_imports=local_imports,
-            additional_files=additional_files,
-            dynamic_modules_path=dynamic_modules_path,
-            module_namespace="datasets",
-            name=self.name,
-            download_mode=self.download_mode,
-        )
-        # make the new module to be noticed by the import system
-        importlib.invalidate_caches()
-        builder_kwargs = {"hash": hash, "base_path": hf_hub_url(self.name, "", revision=self.revision)}
-        return DatasetModule(module_path, hash, builder_kwargs)
-
-
 class GithubMetricModuleFactory(_MetricModuleFactory):
     """Get the module of a metric. The metric script is downloaded from GitHub.
 
@@ -554,7 +467,7 @@ def get_module(self) -> MetricModule:
             local_path = self.download_loading_script(revision)
             revision = self.revision
         except FileNotFoundError:
-            if revision is not None or os.getenv("HF_SCRIPTS_VERSION", None) is not None:
+            if revision is not None:
                 raise
             else:
                 revision = "main"
@@ -826,7 +739,7 @@ def __init__(
         self.data_dir = data_dir
         self.download_config = download_config or DownloadConfig()
         self.download_mode = download_mode
-        assert self.name.count("/") == 1
+        assert self.name.count("/") <= 1
         increase_load_count(name, resource_type="dataset")
 
     def get_module(self) -> DatasetModule:
@@ -917,11 +830,11 @@ def __init__(
         self.download_config = download_config or DownloadConfig()
         self.download_mode = download_mode
         self.dynamic_modules_path = dynamic_modules_path
-        assert self.name.count("/") == 1
+        assert self.name.count("/") <= 1
         increase_load_count(name, resource_type="dataset")
 
     def download_loading_script(self) -> str:
-        file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[1] + ".py", revision=self.revision)
+        file_path = hf_hub_url(repo_id=self.name, path=self.name.split("/")[-1] + ".py", revision=self.revision)
         download_config = self.download_config.copy()
         if download_config.download_desc is None:
             download_config.download_desc = "Downloading builder script"
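The switch from split("/")[1] to split("/")[-1] is what lets this factory serve repos without a namespace (canonical datasets) as well; a tiny sketch of the difference, with illustrative repo ids:

    for repo_id in ("squad", "user/my_dataset"):
        script_name = repo_id.split("/")[-1] + ".py"  # "squad.py", "my_dataset.py"
        print(script_name)
    # repo_id.split("/")[1] would raise IndexError for "squad" (no "/")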
@@ -1197,67 +1110,57 @@ def dataset_module_factory(
     elif is_relative_path(path) and path.count("/") <= 1:
         try:
             _raise_if_offline_mode_is_enabled()
-            if path.count("/") == 0:  # even though the dataset is on the Hub, we get it from GitHub for now
-                # TODO(QL): use a Hub dataset module factory instead of GitHub
-                return GithubDatasetModuleFactory(
-                    path,
-                    revision=revision,
-                    download_config=download_config,
-                    download_mode=download_mode,
-                    dynamic_modules_path=dynamic_modules_path,
-                ).get_module()
+            hf_api = HfApi(config.HF_ENDPOINT)
+            try:
+                if isinstance(download_config.use_auth_token, bool):
+                    token = HfFolder.get_token() if download_config.use_auth_token else None
+                else:
+                    token = download_config.use_auth_token
+                dataset_info = hf_api.dataset_info(
+                    repo_id=path,
+                    revision=revision,
+                    token=token if token else "no-token",
+                    timeout=100.0,
+                )
+            except Exception as e:  # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
+                if isinstance(
+                    e,
+                    (
+                        OfflineModeIsEnabled,
+                        requests.exceptions.ConnectTimeout,
+                        requests.exceptions.ConnectionError,
+                    ),
+                ):
+                    raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
+                elif "404" in str(e):
+                    msg = f"Dataset '{path}' doesn't exist on the Hub"
+                    raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
+                elif "401" in str(e):
+                    msg = f"Dataset '{path}' doesn't exist on the Hub"
+                    msg = msg + f" at revision '{revision}'" if revision else msg
+                    raise FileNotFoundError(
+                        msg
+                        + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
+                    )
+                else:
+                    raise e
+            if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
+                return HubDatasetModuleFactoryWithScript(
+                    path,
+                    revision=revision,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                    dynamic_modules_path=dynamic_modules_path,
+                ).get_module()
-            elif path.count("/") == 1:  # community dataset on the Hub
-                hf_api = HfApi(config.HF_ENDPOINT)
-                try:
-                    if isinstance(download_config.use_auth_token, bool):
-                        token = HfFolder.get_token() if download_config.use_auth_token else None
-                    else:
-                        token = download_config.use_auth_token
-                    dataset_info = hf_api.dataset_info(
-                        repo_id=path,
-                        revision=revision,
-                        token=token if token else "no-token",
-                        timeout=100.0,
-                    )
-                except Exception as e:  # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
-                    if isinstance(
-                        e,
-                        (
-                            OfflineModeIsEnabled,
-                            requests.exceptions.ConnectTimeout,
-                            requests.exceptions.ConnectionError,
-                        ),
-                    ):
-                        raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({type(e).__name__})")
-                    elif "404" in str(e):
-                        msg = f"Dataset '{path}' doesn't exist on the Hub"
-                        raise FileNotFoundError(msg + f" at revision '{revision}'" if revision else msg)
-                    elif "401" in str(e):
-                        msg = f"Dataset '{path}' doesn't exist on the Hub"
-                        msg = msg + f" at revision '{revision}'" if revision else msg
-                        raise FileNotFoundError(
-                            msg
-                            + ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
-                        )
-                    else:
-                        raise e
-                if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
-                    return HubDatasetModuleFactoryWithScript(
-                        path,
-                        revision=revision,
-                        download_config=download_config,
-                        download_mode=download_mode,
-                        dynamic_modules_path=dynamic_modules_path,
-                    ).get_module()
-                else:
-                    return HubDatasetModuleFactoryWithoutScript(
-                        path,
-                        revision=revision,
-                        data_dir=data_dir,
-                        data_files=data_files,
-                        download_config=download_config,
-                        download_mode=download_mode,
-                    ).get_module()
+            else:
+                return HubDatasetModuleFactoryWithoutScript(
+                    path,
+                    revision=revision,
+                    data_dir=data_dir,
+                    data_files=data_files,
+                    download_config=download_config,
+                    download_mode=download_mode,
+                ).get_module()
         except Exception as e1:  # noqa: all the attempts failed, before raising the error we should check if the module is already cached.
             try:
                 return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
@@ -1624,7 +1527,6 @@ def load_dataset(
     Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
     contain the path or URL to the original data files and the code to load examples from the original data files.
 
-    You can find some of the scripts here: https://github.com/huggingface/datasets/tree/main/datasets
     You can find the complete list of datasets in the Datasets Hub at https://huggingface.co/datasets
 
     2. Run the dataset script which will:
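To make the new control flow concrete, here is a hedged sketch of how dataset_module_factory now resolves any Hub path, with or without a namespace (the endpoint and repo id are illustrative; the factory names match the diff above):

    from huggingface_hub import HfApi

    api = HfApi("https://huggingface.co")  # config.HF_ENDPOINT in the real code
    info = api.dataset_info("squad")  # canonical datasets are looked up on the Hub, not GitHub
    has_script = any(s.rfilename == "squad.py" for s in info.siblings)
    # has_script -> HubDatasetModuleFactoryWithScript
    # otherwise  -> HubDatasetModuleFactoryWithoutScript

In other words, load_dataset("squad") and load_dataset("user/dataset") now go through the same Hub lookup, differing only in whether a <name>.py script is found among the repo files.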
5 changes: 3 additions & 2 deletions src/datasets/utils/file_utils.py
@@ -23,6 +23,7 @@
 from urllib.parse import urljoin, urlparse
 
 import requests
+from packaging import version
 
 from .. import __version__, config
 from ..download.download_config import DownloadConfig
@@ -97,9 +98,9 @@ def head_hf_s3(
 
 
 def hf_github_url(path: str, name: str, dataset=True, revision: Optional[str] = None) -> str:
-    from .. import SCRIPTS_VERSION
+    default_revision = "main" if version.parse(__version__).is_devrelease else __version__
 
-    revision = revision or os.getenv("HF_SCRIPTS_VERSION", SCRIPTS_VERSION)
+    revision = revision or default_revision
     if dataset:
         return config.REPO_DATASETS_URL.format(revision=revision, path=path, name=name)
     else:
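The fallback mirrors the SCRIPTS_VERSION logic deleted from __init__.py: dev installs track main, releases pin to their own tag. A quick sketch of the check, with illustrative version strings:

    from packaging import version

    for v in ("2.4.0", "2.5.0.dev0"):
        default_revision = "main" if version.parse(v).is_devrelease else v
        print(v, "->", default_revision)
    # 2.4.0 -> 2.4.0        (a release pins scripts to its matching tag)
    # 2.5.0.dev0 -> main    (a dev install follows the main branch)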