Skip to content

Commit

Permalink
use huggingface_hub auth
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq committed Feb 8, 2021
1 parent 36bf6e2 commit e1760d0
Showing 1 changed file with 32 additions and 6 deletions.
38 changes: 32 additions & 6 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,11 +228,22 @@ def is_rarfile_available():
return _rarfile_available


def is_remote_url(url_or_filename):
def is_remote_url(url_or_filename: str) -> bool:
    """Tell whether the argument uses one of the recognised remote URL schemes.

    Args:
        url_or_filename: A URL or a filesystem path.

    Returns:
        True when the scheme extracted by ``urlparse`` is one of
        ``http``, ``https``, ``s3``, ``gs``, ``hdfs`` or ``ftp``; False otherwise.
    """
    remote_schemes = ("http", "https", "s3", "gs", "hdfs", "ftp")
    scheme = urlparse(url_or_filename).scheme
    return scheme in remote_schemes


def is_local_path(url_or_filename: str) -> bool:
    """Tell whether the argument looks like a local filesystem path.

    On unix the scheme parsed from a local path is empty, for both absolute
    and relative paths. On windows the scheme of an absolute path is the
    drive name (ex: "c"), so a non-empty scheme is also accepted when
    ``<scheme>:/`` is a mount point.
    For details on the windows behavior, see https://bugs.python.org/issue42215

    Args:
        url_or_filename: A URL or a filesystem path.

    Returns:
        True if the scheme is empty or names a mounted drive, False otherwise.
    """
    scheme = urlparse(url_or_filename).scheme
    if scheme == "":
        # No scheme at all: a plain unix-style (or relative) path.
        return True
    # Non-empty scheme: only local if it is actually a mounted drive letter.
    return os.path.ismount(scheme + ":/")


def is_relative_path(url_or_filename: str) -> bool:
    """Tell whether the argument is a scheme-less, non-absolute path.

    Args:
        url_or_filename: A URL or a filesystem path.

    Returns:
        True only when ``urlparse`` finds no scheme and ``os.path.isabs``
        reports the value as not absolute; False otherwise.
    """
    if urlparse(url_or_filename).scheme != "":
        # Anything carrying a scheme is not treated as a relative path.
        return False
    return not os.path.isabs(url_or_filename)


def hf_bucket_url(identifier: str, filename: str, use_cdn=False, dataset=True) -> str:
if dataset:
endpoint = CLOUDFRONT_DATASETS_DISTRIB_PREFIX if use_cdn else S3_DATASETS_BUCKET_PREFIX
Expand Down Expand Up @@ -323,7 +334,7 @@ def cached_path(
url_or_filename,
download_config=None,
**download_kwargs,
) -> Optional[str]:
) -> str:
"""
Given something that might be a URL (or might be a local path),
determine which. If it's a URL, download the file and cache it, and
Expand Down Expand Up @@ -366,10 +377,8 @@ def cached_path(
elif os.path.exists(url_or_filename):
# File, and it exists.
output_path = url_or_filename
elif urlparse(url_or_filename).scheme == "" or os.path.ismount(urlparse(url_or_filename).scheme + ":/"):
elif is_local_path(url_or_filename):
# File, but it doesn't exist.
# On unix the scheme of a local path is empty, while on windows the scheme is the drive name (ex: "c")
# for details on the windows behavior, see https://bugs.python.org/issue42215
raise FileNotFoundError("Local file {} doesn't exist".format(url_or_filename))
else:
# Something unknown
Expand Down Expand Up @@ -454,6 +463,21 @@ def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> st
return ua


def get_authentication_headers_for_url(url: str) -> dict:
    """Build the HF authentication headers for a request to ``url``.

    Only URLs starting with ``https://huggingface.co/`` are considered. For
    those, the token is read through ``huggingface_hub`` and, when it is
    truthy, returned as a ``Bearer`` value under the ``authorization`` key.

    Args:
        url: The URL that is about to be requested.

    Returns:
        A dict holding the ``authorization`` header, or an empty dict when the
        URL is not a huggingface.co one, ``huggingface_hub`` cannot be
        imported, or no token is available.
    """
    auth_headers = {}
    if not url.startswith("https://huggingface.co/"):
        return auth_headers
    try:
        # huggingface_hub is optional: without it we simply send no auth header.
        from huggingface_hub import hf_api

        hf_token = hf_api.HfFolder.get_token()
    except ImportError:
        return auth_headers
    if hf_token:
        auth_headers["authorization"] = "Bearer {}".format(hf_token)
    return auth_headers


def _request_with_retry(
verb: str, url: str, max_retries: int = 0, base_wait_time: float = 0.5, max_wait_time: float = 2, **params
) -> requests.Response:
Expand Down Expand Up @@ -504,6 +528,7 @@ def ftp_get(url, temp_file, proxies=None, resume_size=0, user_agent=None, cookie

def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None, cookies=None, max_retries=0):
headers = {"user-agent": get_datasets_user_agent(user_agent=user_agent)}
headers.update(get_authentication_headers_for_url(url))
if resume_size > 0:
headers["Range"] = "bytes=%d-" % (resume_size,)
response = _request_with_retry(
Expand Down Expand Up @@ -533,6 +558,7 @@ def http_head(
url, proxies=None, user_agent=None, cookies=None, allow_redirects=True, timeout=10, max_retries=0
) -> requests.Response:
headers = {"user-agent": get_datasets_user_agent(user_agent=user_agent)}
headers.update(get_authentication_headers_for_url(url))
response = _request_with_retry(
verb="HEAD",
url=url,
Expand All @@ -557,7 +583,7 @@ def get_from_cache(
local_files_only=False,
use_etag=True,
max_retries=0,
) -> Optional[str]:
) -> str:
"""
Given a URL, look for the corresponding file in the local cache.
If it's not there, download it. Then return the path to the cached file.
Expand Down

1 comment on commit e1760d0

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==0.17.1

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.019881 / 0.011353 (0.008528) 0.018471 / 0.011008 (0.007462) 0.050711 / 0.038508 (0.012203) 0.034283 / 0.023109 (0.011174) 0.223995 / 0.275898 (-0.051903) 0.259043 / 0.323480 (-0.064437) 0.006275 / 0.007986 (-0.001710) 0.005114 / 0.004328 (0.000785) 0.007139 / 0.004250 (0.002889) 0.048380 / 0.037052 (0.011328) 0.239638 / 0.258489 (-0.018851) 0.258525 / 0.293841 (-0.035316) 0.179120 / 0.128546 (0.050574) 0.138246 / 0.075646 (0.062599) 0.481540 / 0.419271 (0.062268) 0.667781 / 0.043533 (0.624248) 0.239812 / 0.255139 (-0.015327) 0.254677 / 0.283200 (-0.028522) 6.738861 / 0.141683 (6.597178) 1.960762 / 1.452155 (0.508607) 1.981168 / 1.492716 (0.488452)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.044067 / 0.037411 (0.006656) 0.021986 / 0.014526 (0.007460) 0.029917 / 0.176557 (-0.146640) 0.048778 / 0.737135 (-0.688357) 0.049090 / 0.296338 (-0.247248)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.272020 / 0.215209 (0.056811) 2.837633 / 2.077655 (0.759978) 1.335653 / 1.504120 (-0.168467) 1.222447 / 1.541195 (-0.318747) 1.244357 / 1.468490 (-0.224133) 7.457275 / 4.584777 (2.872498) 6.459023 / 3.745712 (2.713311) 9.309979 / 5.269862 (4.040118) 8.081842 / 4.565676 (3.516165) 0.747774 / 0.424275 (0.323499) 0.012216 / 0.007607 (0.004609) 0.291894 / 0.226044 (0.065849) 3.322992 / 2.268929 (1.054064) 1.819667 / 55.444624 (-53.624958) 1.531338 / 6.876477 (-5.345139) 1.572998 / 2.142072 (-0.569074) 7.508278 / 4.805227 (2.703050) 7.512719 / 6.500664 (1.012055) 10.555206 / 0.075469 (10.479737)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 12.627027 / 1.841788 (10.785240) 14.044656 / 8.074308 (5.970348) 24.907326 / 10.191392 (14.715934) 0.524197 / 0.680424 (-0.156227) 0.338936 / 0.534201 (-0.195265) 0.913451 / 0.579283 (0.334168) 0.735984 / 0.434364 (0.301620) 0.834953 / 0.540337 (0.294616) 1.774469 / 1.386936 (0.387533)
PyArrow==1.0
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.018750 / 0.011353 (0.007398) 0.015959 / 0.011008 (0.004951) 0.058861 / 0.038508 (0.020353) 0.034436 / 0.023109 (0.011327) 0.342908 / 0.275898 (0.067010) 0.439975 / 0.323480 (0.116495) 0.005682 / 0.007986 (-0.002303) 0.005158 / 0.004328 (0.000829) 0.006972 / 0.004250 (0.002722) 0.045494 / 0.037052 (0.008441) 0.388651 / 0.258489 (0.130162) 0.427845 / 0.293841 (0.134004) 0.173313 / 0.128546 (0.044767) 0.145424 / 0.075646 (0.069778) 0.506387 / 0.419271 (0.087115) 0.441051 / 0.043533 (0.397518) 0.334889 / 0.255139 (0.079750) 0.405060 / 0.283200 (0.121861) 1.857868 / 0.141683 (1.716185) 1.865114 / 1.452155 (0.412960) 2.002421 / 1.492716 (0.509705)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.047090 / 0.037411 (0.009679) 0.021073 / 0.014526 (0.006547) 0.024070 / 0.176557 (-0.152486) 0.049740 / 0.737135 (-0.687396) 0.026752 / 0.296338 (-0.269586)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.323159 / 0.215209 (0.107950) 3.276463 / 2.077655 (1.198809) 1.989649 / 1.504120 (0.485529) 1.715265 / 1.541195 (0.174071) 1.830055 / 1.468490 (0.361565) 7.229014 / 4.584777 (2.644237) 6.691690 / 3.745712 (2.945978) 9.600162 / 5.269862 (4.330301) 8.233358 / 4.565676 (3.667681) 0.740046 / 0.424275 (0.315771) 0.011579 / 0.007607 (0.003972) 0.414334 / 0.226044 (0.188289) 3.927819 / 2.268929 (1.658890) 2.614024 / 55.444624 (-52.830600) 2.300683 / 6.876477 (-4.575794) 2.442311 / 2.142072 (0.300238) 7.558122 / 4.805227 (2.752894) 6.268684 / 6.500664 (-0.231980) 9.865256 / 0.075469 (9.789786)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 13.351951 / 1.841788 (11.510163) 15.282051 / 8.074308 (7.207743) 25.849531 / 10.191392 (15.658139) 1.130894 / 0.680424 (0.450470) 0.621316 / 0.534201 (0.087115) 0.857921 / 0.579283 (0.278638) 0.684086 / 0.434364 (0.249722) 0.764748 / 0.540337 (0.224410) 1.732781 / 1.386936 (0.345845)

CML watermark

Please sign in to comment.