test use_auth_token
lhoestq committed Feb 12, 2021
1 parent 2406e1d commit 0517553
Showing 1 changed file with 27 additions and 0 deletions.
tests/test_load.py: 27 additions & 0 deletions
@@ -5,6 +5,7 @@
 import time
 from hashlib import sha256
 from unittest import TestCase
+from unittest.mock import patch
 
 import pyarrow as pa
 import pytest
@@ -42,6 +43,9 @@ def _generate_examples(self, filepath, **kwargs):
                 yield i, {"text": line.strip()}
 """
 
+SAMPLE_DATASET_IDENTIFIER = "lhoestq/test"
+SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER = "lhoestq/_dummy"
+
 
 @pytest.fixture
 def data_dir(tmp_path):
@@ -192,3 +196,26 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory
     with pytest.raises(FileNotFoundError) as exc_info:
         datasets.load_dataset("_dummy")
     assert "at " + os.path.join("_dummy", "_dummy.py") in str(exc_info.value)
+
+
+def test_loading_from_the_datasets_hub():
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER, cache_dir=tmp_dir)
+        assert len(dataset["train"]) == 2
+        assert len(dataset["validation"]) == 3
+
+
+def test_loading_from_the_datasets_hub_with_use_auth_token():
+    from datasets.utils.file_utils import http_head
+
+    def assert_auth(url, *args, headers, **kwargs):
+        assert headers["authorization"] == "Bearer foo"
+        return http_head(url, *args, headers=headers, **kwargs)
+
+    with patch("datasets.utils.file_utils.http_head") as mock_head:
+        mock_head.side_effect = assert_auth
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with offline():
+                with pytest.raises(ConnectionError):
+                    load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, use_auth_token="foo")
+                mock_head.assert_called()
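
For context, the second test only verifies the plumbing: the token passed as `use_auth_token="foo"` must reach the mocked `http_head` call as a `Bearer` authorization header, even though the request itself fails while offline. Below is a minimal sketch of that token-to-header mapping; the helper name `build_hub_auth_headers` is hypothetical, and the `use_auth_token=True` branch is stubbed out since the real logic lives in `datasets.utils.file_utils`.

```python
# Minimal sketch (not the datasets implementation): how a use_auth_token
# value could be turned into the header the test above asserts on.
from typing import Dict, Optional, Union


def build_hub_auth_headers(use_auth_token: Optional[Union[str, bool]] = None) -> Dict[str, str]:
    """Map a use_auth_token argument to the HTTP headers sent to the Hub."""
    if isinstance(use_auth_token, str):
        # An explicit token string becomes a Bearer authorization header,
        # which is exactly what assert_auth checks for ("Bearer foo").
        return {"authorization": f"Bearer {use_auth_token}"}
    if use_auth_token is True:
        # The real library would look up the locally stored Hugging Face
        # token here; a placeholder keeps this sketch self-contained.
        raise NotImplementedError("local token lookup is out of scope for this sketch")
    # use_auth_token=None or False: send the request unauthenticated.
    return {}


if __name__ == "__main__":
    assert build_hub_auth_headers("foo") == {"authorization": "Bearer foo"}
    assert build_hub_auth_headers() == {}
```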

1 comment on commit 0517553

@github-actions

Show benchmarks

PyArrow==0.17.1

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.018288 / 0.011353 (0.006935) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.015617 / 0.011008 (0.004609) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.045114 / 0.038508 (0.006606) |
| read_batch_unformated after write_array2d | 0.030091 / 0.023109 (0.006982) |
| read_batch_unformated after write_flattened_sequence | 0.188590 / 0.275898 (-0.087308) |
| read_batch_unformated after write_nested_sequence | 0.215610 / 0.323480 (-0.107870) |
| read_col_formatted_as_numpy after write_array2d | 0.006015 / 0.007986 (-0.001971) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004793 / 0.004328 (0.000464) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.006354 / 0.004250 (0.002103) |
| read_col_unformated after write_array2d | 0.045954 / 0.037052 (0.008901) |
| read_col_unformated after write_flattened_sequence | 0.187581 / 0.258489 (-0.070908) |
| read_col_unformated after write_nested_sequence | 0.223838 / 0.293841 (-0.070003) |
| read_formatted_as_numpy after write_array2d | 0.152265 / 0.128546 (0.023718) |
| read_formatted_as_numpy after write_flattened_sequence | 0.140678 / 0.075646 (0.065032) |
| read_formatted_as_numpy after write_nested_sequence | 0.395637 / 0.419271 (-0.023634) |
| read_unformated after write_array2d | 0.416532 / 0.043533 (0.372999) |
| read_unformated after write_flattened_sequence | 0.213821 / 0.255139 (-0.041318) |
| read_unformated after write_nested_sequence | 0.200496 / 0.283200 (-0.082704) |
| write_array2d | 1.651248 / 0.141683 (1.509566) |
| write_flattened_sequence | 1.752940 / 1.452155 (0.300785) |
| write_nested_sequence | 1.690323 / 1.492716 (0.197607) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.035238 / 0.037411 (-0.002174) |
| shard | 0.019452 / 0.014526 (0.004927) |
| shuffle | 0.067528 / 0.176557 (-0.109029) |
| sort | 0.042303 / 0.737135 (-0.694832) |
| train_test_split | 0.038821 / 0.296338 (-0.257517) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.259671 / 0.215209 (0.044462) |
| read 50000 | 2.578084 / 2.077655 (0.500429) |
| read_batch 50000 10 | 1.225901 / 1.504120 (-0.278219) |
| read_batch 50000 100 | 1.087908 / 1.541195 (-0.453287) |
| read_batch 50000 1000 | 1.073782 / 1.468490 (-0.394708) |
| read_formatted numpy 5000 | 7.354524 / 4.584777 (2.769747) |
| read_formatted pandas 5000 | 6.529800 / 3.745712 (2.784088) |
| read_formatted tensorflow 5000 | 8.562076 / 5.269862 (3.292214) |
| read_formatted torch 5000 | 7.750584 / 4.565676 (3.184908) |
| read_formatted_batch numpy 5000 10 | 0.707399 / 0.424275 (0.283124) |
| read_formatted_batch numpy 5000 1000 | 0.010086 / 0.007607 (0.002479) |
| shuffled read 5000 | 0.286848 / 0.226044 (0.060803) |
| shuffled read 50000 | 3.026995 / 2.268929 (0.758066) |
| shuffled read_batch 50000 10 | 1.706283 / 55.444624 (-53.738342) |
| shuffled read_batch 50000 100 | 1.377461 / 6.876477 (-5.499016) |
| shuffled read_batch 50000 1000 | 1.436589 / 2.142072 (-0.705483) |
| shuffled read_formatted numpy 5000 | 7.013599 / 4.805227 (2.208371) |
| shuffled read_formatted_batch numpy 5000 10 | 6.012145 / 6.500664 (-0.488519) |
| shuffled read_formatted_batch numpy 5000 1000 | 9.693644 / 0.075469 (9.618175) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 11.753292 / 1.841788 (9.911504) |
| map fast-tokenizer batched | 13.279978 / 8.074308 (5.205670) |
| map identity | 22.630348 / 10.191392 (12.438956) |
| map identity batched | 0.847245 / 0.680424 (0.166822) |
| map no-op batched | 0.264720 / 0.534201 (-0.269481) |
| map no-op batched numpy | 0.789335 / 0.579283 (0.210052) |
| map no-op batched pandas | 0.625651 / 0.434364 (0.191287) |
| map no-op batched pytorch | 0.713586 / 0.540337 (0.173249) |
| map no-op batched tensorflow | 1.463721 / 1.386936 (0.076785) |
PyArrow==1.0
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.017181 / 0.011353 (0.005828) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.016858 / 0.011008 (0.005850) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.045168 / 0.038508 (0.006659) |
| read_batch_unformated after write_array2d | 0.031109 / 0.023109 (0.007999) |
| read_batch_unformated after write_flattened_sequence | 0.312594 / 0.275898 (0.036696) |
| read_batch_unformated after write_nested_sequence | 0.315473 / 0.323480 (-0.008007) |
| read_col_formatted_as_numpy after write_array2d | 0.008233 / 0.007986 (0.000248) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.006430 / 0.004328 (0.002101) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.006581 / 0.004250 (0.002331) |
| read_col_unformated after write_array2d | 0.041787 / 0.037052 (0.004734) |
| read_col_unformated after write_flattened_sequence | 0.318320 / 0.258489 (0.059831) |
| read_col_unformated after write_nested_sequence | 0.361697 / 0.293841 (0.067857) |
| read_formatted_as_numpy after write_array2d | 0.152115 / 0.128546 (0.023569) |
| read_formatted_as_numpy after write_flattened_sequence | 0.124147 / 0.075646 (0.048500) |
| read_formatted_as_numpy after write_nested_sequence | 0.392094 / 0.419271 (-0.027178) |
| read_unformated after write_array2d | 0.423644 / 0.043533 (0.380112) |
| read_unformated after write_flattened_sequence | 0.309588 / 0.255139 (0.054449) |
| read_unformated after write_nested_sequence | 0.314568 / 0.283200 (0.031369) |
| write_array2d | 1.636495 / 0.141683 (1.494812) |
| write_flattened_sequence | 1.609883 / 1.452155 (0.157728) |
| write_nested_sequence | 1.698536 / 1.492716 (0.205820) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.037585 / 0.037411 (0.000174) |
| shard | 0.019283 / 0.014526 (0.004757) |
| shuffle | 0.027511 / 0.176557 (-0.149045) |
| sort | 0.043471 / 0.737135 (-0.693665) |
| train_test_split | 0.026330 / 0.296338 (-0.270008) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.303742 / 0.215209 (0.088533) |
| read 50000 | 3.007955 / 2.077655 (0.930300) |
| read_batch 50000 10 | 1.841560 / 1.504120 (0.337440) |
| read_batch 50000 100 | 1.702918 / 1.541195 (0.161723) |
| read_batch 50000 1000 | 1.808778 / 1.468490 (0.340288) |
| read_formatted numpy 5000 | 6.879511 / 4.584777 (2.294735) |
| read_formatted pandas 5000 | 6.122738 / 3.745712 (2.377026) |
| read_formatted tensorflow 5000 | 8.321623 / 5.269862 (3.051761) |
| read_formatted torch 5000 | 7.463530 / 4.565676 (2.897853) |
| read_formatted_batch numpy 5000 10 | 0.675583 / 0.424275 (0.251308) |
| read_formatted_batch numpy 5000 1000 | 0.009545 / 0.007607 (0.001938) |
| shuffled read 5000 | 0.346199 / 0.226044 (0.120155) |
| shuffled read 50000 | 3.438250 / 2.268929 (1.169322) |
| shuffled read_batch 50000 10 | 2.223174 / 55.444624 (-53.221450) |
| shuffled read_batch 50000 100 | 1.988927 / 6.876477 (-4.887550) |
| shuffled read_batch 50000 1000 | 2.102433 / 2.142072 (-0.039640) |
| shuffled read_formatted numpy 5000 | 7.232131 / 4.805227 (2.426904) |
| shuffled read_formatted_batch numpy 5000 10 | 6.446010 / 6.500664 (-0.054654) |
| shuffled read_formatted_batch numpy 5000 1000 | 5.285393 / 0.075469 (5.209924) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 11.346538 / 1.841788 (9.504750) |
| map fast-tokenizer batched | 12.574747 / 8.074308 (4.500439) |
| map identity | 22.146042 / 10.191392 (11.954650) |
| map identity batched | 0.786591 / 0.680424 (0.106167) |
| map no-op batched | 0.607221 / 0.534201 (0.073021) |
| map no-op batched numpy | 0.823890 / 0.579283 (0.244606) |
| map no-op batched pandas | 0.655822 / 0.434364 (0.221458) |
| map no-op batched pytorch | 0.725364 / 0.540337 (0.185026) |
| map no-op batched tensorflow | 1.481410 / 1.386936 (0.094474) |

