Skip to content

Commit

Permalink
better error message when downloading (#3343)
Browse files Browse the repository at this point in the history
  • Loading branch information
lhoestq authored Dec 1, 2021
1 parent e2c3cea commit 8f08bee
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
3 changes: 3 additions & 0 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,9 @@ def dataset_module_factory(
download_config = DownloadConfig(**download_kwargs)
download_config.extract_compressed_file = True
download_config.force_extract = True
+ download_config.force_download = download_mode = (
+ GenerateMode(download_mode or GenerateMode.REUSE_DATASET_IF_EXISTS) == GenerateMode.FORCE_REDOWNLOAD
+ )

filename = list(filter(lambda x: x, path.replace(os.sep, "/").split("/")))[-1]
if not filename.endswith(".py"):
Expand Down
17 changes: 14 additions & 3 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,6 +538,7 @@ def get_from_cache(
response = None
cookies = None
etag = None
+ head_error = None

# Try a first time to find the file on the local file system without eTag (None)
# if we don't ask for 'force_download' then we spare a request
Expand Down Expand Up @@ -588,14 +589,19 @@ def get_from_cache(
):
connected = True
logger.info(f"Couldn't get ETag version for url {url}")
- except (EnvironmentError, requests.exceptions.Timeout):
+ elif response.status_code == 401 and config.HF_ENDPOINT in url and use_auth_token is None:
+ raise ConnectionError(
+ f"Unauthorized for URL {url}. Please use the parameter ``use_auth_token=True`` after logging in with ``huggingface-cli login``"
+ )
+ except (EnvironmentError, requests.exceptions.Timeout) as e:
# not connected
+ head_error = e
pass

# connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
# try to get the last downloaded one
if not connected:
- if os.path.exists(cache_path):
+ if os.path.exists(cache_path) and not force_download:
return cache_path
if local_files_only:
raise FileNotFoundError(
Expand All @@ -605,7 +611,12 @@ def get_from_cache(
elif response is not None and response.status_code == 404:
raise FileNotFoundError(f"Couldn't find file at {url}")
_raise_if_offline_mode_is_enabled(f"Tried to reach {url}")
- raise ConnectionError(f"Couldn't reach {url}")
+ if head_error is not None:
+ raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})")
+ elif response is not None:
+ raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})")
+ else:
+ raise ConnectionError(f"Couldn't reach {url}")

# Try a second time
filename = hash_url_to_filename(cached_url, etag)
Expand Down

1 comment on commit 8f08bee

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.068681 / 0.011353 (0.057328) 0.004332 / 0.011008 (-0.006676) 0.032360 / 0.038508 (-0.006148) 0.033711 / 0.023109 (0.010602) 0.309099 / 0.275898 (0.033200) 0.330410 / 0.323480 (0.006930) 0.071774 / 0.007986 (0.063788) 0.003916 / 0.004328 (-0.000412) 0.009532 / 0.004250 (0.005282) 0.034690 / 0.037052 (-0.002362) 0.299066 / 0.258489 (0.040577) 0.337405 / 0.293841 (0.043564) 0.093017 / 0.128546 (-0.035529) 0.011736 / 0.075646 (-0.063911) 0.261696 / 0.419271 (-0.157575) 0.055065 / 0.043533 (0.011532) 0.313129 / 0.255139 (0.057990) 0.329643 / 0.283200 (0.046443) 0.080046 / 0.141683 (-0.061637) 1.828850 / 1.452155 (0.376696) 1.898121 / 1.492716 (0.405405)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.357619 / 0.018006 (0.339612) 0.488978 / 0.000490 (0.488488) 0.056312 / 0.000200 (0.056112) 0.000821 / 0.000054 (0.000766)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.038042 / 0.037411 (0.000631) 0.025848 / 0.014526 (0.011322) 0.025659 / 0.176557 (-0.150897) 0.213415 / 0.737135 (-0.523721) 0.031812 / 0.296338 (-0.264527)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.552799 / 0.215209 (0.337590) 5.535388 / 2.077655 (3.457734) 2.196760 / 1.504120 (0.692640) 1.781753 / 1.541195 (0.240558) 1.936043 / 1.468490 (0.467552) 0.787853 / 4.584777 (-3.796924) 6.624697 / 3.745712 (2.878984) 4.819850 / 5.269862 (-0.450011) 1.356893 / 4.565676 (-3.208783) 0.078156 / 0.424275 (-0.346119) 0.011853 / 0.007607 (0.004245) 0.738503 / 0.226044 (0.512458) 7.192697 / 2.268929 (4.923769) 2.827876 / 55.444624 (-52.616748) 2.145814 / 6.876477 (-4.730662) 2.163693 / 2.142072 (0.021621) 0.844186 / 4.805227 (-3.961041) 0.166545 / 6.500664 (-6.334119) 0.068250 / 0.075469 (-0.007219)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.866529 / 1.841788 (0.024742) 12.908237 / 8.074308 (4.833929) 37.319950 / 10.191392 (27.128558) 0.883624 / 0.680424 (0.203200) 0.560202 / 0.534201 (0.026001) 0.428115 / 0.579283 (-0.151168) 0.668160 / 0.434364 (0.233796) 0.298067 / 0.540337 (-0.242271) 0.308170 / 1.386936 (-1.078766)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.063738 / 0.011353 (0.052385) 0.003902 / 0.011008 (-0.007107) 0.029136 / 0.038508 (-0.009372) 0.029950 / 0.023109 (0.006841) 0.305247 / 0.275898 (0.029349) 0.341936 / 0.323480 (0.018456) 0.074379 / 0.007986 (0.066393) 0.004760 / 0.004328 (0.000431) 0.007093 / 0.004250 (0.002843) 0.034883 / 0.037052 (-0.002169) 0.301682 / 0.258489 (0.043193) 0.333549 / 0.293841 (0.039708) 0.082754 / 0.128546 (-0.045792) 0.011340 / 0.075646 (-0.064306) 0.263971 / 0.419271 (-0.155300) 0.054634 / 0.043533 (0.011102) 0.302171 / 0.255139 (0.047032) 0.346391 / 0.283200 (0.063191) 0.079917 / 0.141683 (-0.061766) 1.676343 / 1.452155 (0.224188) 1.785250 / 1.492716 (0.292533)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.258419 / 0.018006 (0.240413) 0.495247 / 0.000490 (0.494757) 0.009502 / 0.000200 (0.009302) 0.000134 / 0.000054 (0.000079)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.038572 / 0.037411 (0.001161) 0.024271 / 0.014526 (0.009746) 0.025847 / 0.176557 (-0.150710) 0.228654 / 0.737135 (-0.508482) 0.029500 / 0.296338 (-0.266838)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.544154 / 0.215209 (0.328945) 5.600183 / 2.077655 (3.522528) 2.179388 / 1.504120 (0.675268) 1.775274 / 1.541195 (0.234079) 1.937917 / 1.468490 (0.469427) 0.652674 / 4.584777 (-3.932103) 6.325540 / 3.745712 (2.579828) 2.776910 / 5.269862 (-2.492951) 1.307425 / 4.565676 (-3.258251) 0.077675 / 0.424275 (-0.346600) 0.012629 / 0.007607 (0.005022) 0.757270 / 0.226044 (0.531225) 7.379059 / 2.268929 (5.110131) 2.964557 / 55.444624 (-52.480068) 2.195592 / 6.876477 (-4.680884) 2.278718 / 2.142072 (0.136645) 0.848500 / 4.805227 (-3.956727) 0.153447 / 6.500664 (-6.347217) 0.056625 / 0.075469 (-0.018844)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.819393 / 1.841788 (-0.022395) 13.255176 / 8.074308 (5.180868) 39.998034 / 10.191392 (29.806642) 0.863176 / 0.680424 (0.182752) 0.590711 / 0.534201 (0.056510) 0.435211 / 0.579283 (-0.144073) 0.685461 / 0.434364 (0.251097) 0.317474 / 0.540337 (-0.222864) 0.320269 / 1.386936 (-1.066667)

CML watermark

Please sign in to comment.