Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move info_utils errors to exceptions module #6952

Merged
merged 6 commits into from
Jun 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,9 @@
# Deprecated modules
from . import arrow_dataset as _arrow_dataset
from . import utils as _utils
from .exceptions import ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
from .utils import download_manager as _deprecated_download_manager
from .utils import info_utils as _deprecated_info_utils


_arrow_dataset.concatenate_datasets = concatenate_datasets
Expand All @@ -68,5 +70,10 @@
_deprecated_download_manager.DownloadConfig = DownloadConfig
_deprecated_download_manager.DownloadMode = DownloadMode
_deprecated_download_manager.DownloadManager = DownloadManager
# Backward compatibility: attach the legacy exception names to the old
# `datasets.utils.info_utils` module, since code may still look them up there.
_deprecated_info_utils.ExpectedMoreDownloadedFiles = ExpectedMoreDownloadedFiles
_deprecated_info_utils.ExpectedMoreSplits = ExpectedMoreSplits
_deprecated_info_utils.UnexpectedDownloadedFile = UnexpectedDownloadedFile
_deprecated_info_utils.UnexpectedSplits = UnexpectedSplits

# Drop the temporary module aliases and the legacy exception names from this module's namespace.
del _arrow_dataset, _utils, _deprecated_download_manager
del _deprecated_info_utils, ExpectedMoreDownloadedFiles, ExpectedMoreSplits, UnexpectedDownloadedFile, UnexpectedSplits
111 changes: 111 additions & 0 deletions src/datasets/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from . import config
from .table import CastError
from .utils.deprecation_utils import deprecated
from .utils.track import TrackedIterable, tracked_list, tracked_str


Expand Down Expand Up @@ -83,3 +84,113 @@ def from_cast_error(
explanation_message += f"\n\nThis happened while the {builder_name} dataset builder was generating data using\n\n{', '.join(formatted_tracked_gen_kwargs)}"
help_message = "\n\nPlease either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)"
return cls("An error occurred while generating the dataset" + explanation_message + help_message)


@deprecated("Use 'ChecksumVerificationError' instead.")
class ChecksumVerificationException(Exception):
    """Legacy base class for exceptions raised while verifying checksums of downloaded files.

    `ChecksumVerificationError` subclasses it, so handlers written as
    `except ChecksumVerificationException` still catch the new error type.

    <Deprecated version="2.20.0">

    Use `ChecksumVerificationError` instead.

    </Deprecated>
    """


class ChecksumVerificationError(DatasetsError, ChecksumVerificationException):
    """Error raised during checksums verifications of downloaded files.

    Also inherits from the deprecated `ChecksumVerificationException` so that code catching the
    legacy exception type keeps working.
    """

    def __init__(self, *args, **kwargs):
        # Initialize through `DatasetsError` explicitly so that creating this (non-deprecated)
        # error does not emit the deprecation warning attached to `ChecksumVerificationException`.
        # NOTE(review): presumably `@deprecated` hooks the decorated class's `__init__` - confirm.
        DatasetsError.__init__(self, *args, **kwargs)


@deprecated("Use 'UnexpectedDownloadedFileError' instead.")
class UnexpectedDownloadedFile(ChecksumVerificationException):
    """Some downloaded files were not expected.

    Legacy name; `UnexpectedDownloadedFileError` subclasses it, so existing `except` clauses
    that name this class still catch the new error.

    <Deprecated version="2.20.0">

    Use `UnexpectedDownloadedFileError` instead.

    </Deprecated>
    """


class UnexpectedDownloadedFileError(ChecksumVerificationError, UnexpectedDownloadedFile):
    """Some downloaded files were not expected.

    Raised by `verify_checksums` when the recorded checksums contain entries that are absent
    from the expected checksums.
    """


@deprecated("Use 'ExpectedMoreDownloadedFilesError' instead.")
class ExpectedMoreDownloadedFiles(ChecksumVerificationException):
    """Some files were supposed to be downloaded but were not.

    Legacy name; `ExpectedMoreDownloadedFilesError` subclasses it, so existing `except` clauses
    that name this class still catch the new error.

    <Deprecated version="2.20.0">

    Use `ExpectedMoreDownloadedFilesError` instead.

    </Deprecated>
    """


class ExpectedMoreDownloadedFilesError(ChecksumVerificationError, ExpectedMoreDownloadedFiles):
    """Some files were supposed to be downloaded but were not.

    Raised by `verify_checksums` when some expected checksums have no matching entry in the
    recorded checksums.
    """


class NonMatchingChecksumError(ChecksumVerificationError):
    """The checksums of the downloaded files don't match the expected checksums."""


@deprecated("Use 'SplitsVerificationError' instead.")
class SplitsVerificationException(Exception):
    """Legacy base class for exceptions raised during splits verifications.

    `SplitsVerificationError` subclasses it, so handlers written as
    `except SplitsVerificationException` still catch the new error type.

    <Deprecated version="2.20.0">

    Use `SplitsVerificationError` instead.

    </Deprecated>
    """


class SplitsVerificationError(DatasetsError, SplitsVerificationException):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it also marked as deprecated, since it inherits from SplitsVerificationException?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have renamed SplitsVerificationException to SplitsVerificationError and deprecated the former. The new SplitsVerificationError is not deprecated.

It inherits from SplitsVerificationException for backward compatibility: if a user is catching SplitsVerificationException, now it will also catch SplitsVerificationError.

Copy link
Member

@lhoestq lhoestq Jun 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but since you subclass the deprecated class, the subclass is also marked as deprecated:

In [1]: from datasets.exceptions import *

In [2]: ChecksumVerificationError()
<ipython-input-2-0953a165cf8e>:1: FutureWarning: ChecksumVerificationException is deprecated and will be removed in the next major version of datasets. Use 'ChecksumVerificationError' instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops! Thanks for letting me know... 😅

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lhoestq I fixed it.

"""Error raised during splits verifications."""

def __init__(self, *args, **kwargs):
    # Initialize through `DatasetsError` explicitly so that creating this (non-deprecated)
    # error does not emit the deprecation warning attached to `SplitsVerificationException`.
    # NOTE(review): presumably `@deprecated` hooks the decorated class's `__init__` - confirm.
    DatasetsError.__init__(self, *args, **kwargs)


@deprecated("Use 'UnexpectedSplitsError' instead.")
class UnexpectedSplits(SplitsVerificationException):
    """Some recorded splits were not expected.

    Legacy name; `UnexpectedSplitsError` subclasses it, so existing `except` clauses that name
    this class still catch the new error.

    <Deprecated version="2.20.0">

    Use `UnexpectedSplitsError` instead.

    </Deprecated>
    """


class UnexpectedSplitsError(SplitsVerificationError, UnexpectedSplits):
    """Some recorded splits were not expected.

    Raised by `verify_splits` when the recorded splits contain names that are absent from the
    expected splits.
    """


@deprecated("Use 'ExpectedMoreSplitsError' instead.")
class ExpectedMoreSplits(SplitsVerificationException):
    """Some expected splits are missing from the recorded splits.

    Legacy name; `ExpectedMoreSplitsError` subclasses it, so existing `except` clauses that name
    this class still catch the new error.

    <Deprecated version="2.20.0">

    Use `ExpectedMoreSplitsError` instead.

    </Deprecated>
    """


class ExpectedMoreSplitsError(SplitsVerificationError, ExpectedMoreSplits):
    """Some expected splits are missing from the recorded splits.

    Raised by `verify_splits` when some expected split names have no matching entry in the
    recorded splits.
    """


class NonMatchingSplitsSizesError(SplitsVerificationError):
    """The split sizes don't match the expected split sizes."""
48 changes: 12 additions & 36 deletions src/datasets/utils/info_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
from huggingface_hub.utils import insecure_hashlib

from .. import config
from ..exceptions import (
ExpectedMoreDownloadedFilesError,
ExpectedMoreSplitsError,
NonMatchingChecksumError,
NonMatchingSplitsSizesError,
UnexpectedDownloadedFileError,
UnexpectedSplitsError,
)
from .logging import get_logger


Expand Down Expand Up @@ -33,30 +41,14 @@ class VerificationMode(enum.Enum):
NO_CHECKS = "no_checks"


class ChecksumVerificationException(Exception):
    """Base class for exceptions raised during checksum verification of downloaded files."""


class UnexpectedDownloadedFile(ChecksumVerificationException):
    """Some downloaded files were not expected (recorded, but absent from the expected checksums)."""


class ExpectedMoreDownloadedFiles(ChecksumVerificationException):
    """Some files were supposed to be downloaded but were not (expected, but never recorded)."""


class NonMatchingChecksumError(ChecksumVerificationException):
    """The checksums of the downloaded files don't match the expected checksums."""


def verify_checksums(expected_checksums: Optional[dict], recorded_checksums: dict, verification_name=None):
if expected_checksums is None:
logger.info("Unable to verify checksums.")
return
if len(set(expected_checksums) - set(recorded_checksums)) > 0:
raise ExpectedMoreDownloadedFiles(str(set(expected_checksums) - set(recorded_checksums)))
raise ExpectedMoreDownloadedFilesError(str(set(expected_checksums) - set(recorded_checksums)))
if len(set(recorded_checksums) - set(expected_checksums)) > 0:
raise UnexpectedDownloadedFile(str(set(recorded_checksums) - set(expected_checksums)))
raise UnexpectedDownloadedFileError(str(set(recorded_checksums) - set(expected_checksums)))
bad_urls = [url for url in expected_checksums if expected_checksums[url] != recorded_checksums[url]]
for_verification_name = " for " + verification_name if verification_name is not None else ""
if len(bad_urls) > 0:
Expand All @@ -68,30 +60,14 @@ def verify_checksums(expected_checksums: Optional[dict], recorded_checksums: dic
logger.info("All the checksums matched successfully" + for_verification_name)


class SplitsVerificationException(Exception):
    """Base class for exceptions raised during splits verifications."""


class UnexpectedSplits(SplitsVerificationException):
    """Some recorded splits were not expected."""


class ExpectedMoreSplits(SplitsVerificationException):
    """Some expected splits are missing from the recorded splits."""


class NonMatchingSplitsSizesError(SplitsVerificationException):
    """The split sizes don't match the expected split sizes."""


def verify_splits(expected_splits: Optional[dict], recorded_splits: dict):
if expected_splits is None:
logger.info("Unable to verify splits sizes.")
return
if len(set(expected_splits) - set(recorded_splits)) > 0:
raise ExpectedMoreSplits(str(set(expected_splits) - set(recorded_splits)))
raise ExpectedMoreSplitsError(str(set(expected_splits) - set(recorded_splits)))
if len(set(recorded_splits) - set(expected_splits)) > 0:
raise UnexpectedSplits(str(set(recorded_splits) - set(expected_splits)))
raise UnexpectedSplitsError(str(set(recorded_splits) - set(expected_splits)))
bad_splits = [
{"expected": expected_splits[name], "recorded": recorded_splits[name]}
for name in expected_splits
Expand Down
58 changes: 58 additions & 0 deletions tests/test_exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import warnings

import pytest

import datasets.utils.deprecation_utils
from datasets.exceptions import (
ChecksumVerificationError,
ChecksumVerificationException,
ExpectedMoreDownloadedFiles,
ExpectedMoreDownloadedFilesError,
ExpectedMoreSplits,
ExpectedMoreSplitsError,
NonMatchingChecksumError,
NonMatchingSplitsSizesError,
SplitsVerificationError,
SplitsVerificationException,
UnexpectedDownloadedFile,
UnexpectedDownloadedFileError,
UnexpectedSplits,
UnexpectedSplitsError,
)


@pytest.mark.parametrize(
    "error",
    [
        ChecksumVerificationException,
        UnexpectedDownloadedFile,
        ExpectedMoreDownloadedFiles,
        SplitsVerificationException,
        UnexpectedSplits,
        ExpectedMoreSplits,
    ],
)
def test_error_deprecated(error, monkeypatch):
    """Check that instantiating each legacy exception class emits a deprecation warning."""
    # Start from an empty set for this test.
    # NOTE(review): presumably this set de-duplicates warnings already emitted - confirm.
    monkeypatch.setattr(datasets.utils.deprecation_utils, "_emitted_deprecation_warnings", set())
    with pytest.deprecated_call():
        error()


@pytest.mark.parametrize(
    "error",
    [
        ChecksumVerificationError,
        UnexpectedDownloadedFileError,
        ExpectedMoreDownloadedFilesError,
        NonMatchingChecksumError,
        SplitsVerificationError,
        UnexpectedSplitsError,
        ExpectedMoreSplitsError,
        NonMatchingSplitsSizesError,
    ],
)
def test_error_not_deprecated(error, monkeypatch):
    """Check that instantiating each new `*Error` class emits no warning at all."""
    # Start from an empty set for this test.
    # NOTE(review): presumably this set de-duplicates warnings already emitted - confirm.
    monkeypatch.setattr(datasets.utils.deprecation_utils, "_emitted_deprecation_warnings", set())
    with warnings.catch_warnings():
        # Turn every warning into an exception so that any warning fails the test.
        warnings.simplefilter("error")
        error()
Loading