From fb621b9630a69643255d25f192fdb011935122b1 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 27 Sep 2023 09:32:06 +0200 Subject: [PATCH 1/5] Increase precision in tmp repo ids to avoid collisions --- tests/test_upstream_hub.py | 56 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 1c722c65446..8d133988099 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -43,7 +43,7 @@ def test_push_dataset_dict_to_hub_no_token(self, temporary_repo, set_ci_hub_acce local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -65,7 +65,7 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -88,7 +88,7 @@ def test_push_dataset_dict_to_hub_datasets_with_different_features(self, cleanup local_ds = DatasetDict({"train": ds_train, "test": ds_test}) - ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}" + ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}" try: with pytest.raises(ValueError): local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token) @@ -101,7 +101,7 @@ def test_push_dataset_dict_to_hub_private(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token, private=True) hub_ds = load_dataset(ds_name, download_mode="force_redownload", token=self._token) @@ -123,7 +123,7 @@ def test_push_dataset_dict_to_hub(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -145,7 +145,7 @@ def test_push_dataset_dict_to_hub_multiple_files(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: with patch("datasets.config.MAX_SHARD_SIZE", "16KB"): local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -174,7 +174,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_max_shard_size(self, tempo local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token, max_shard_size="16KB") hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -202,7 +202,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_num_shards(self, temporary local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token, num_shards={"train": 2}) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -231,7 +231,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo): local_ds = DatasetDict({"train": ds, "random": ds2}) - ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}" + ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}" # Push to hub two times, but the second time with a larger amount of files. # Verify that the new files contain the correct dataset. @@ -332,7 +332,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo): def test_push_dataset_to_hub(self, temporary_repo): local_ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, split="train", token=self._token) local_ds_dict = {"train": local_ds} hub_ds_dict = load_dataset(ds_name, download_mode="force_redownload") @@ -350,7 +350,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo): features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])}) ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -367,7 +367,7 @@ def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -391,7 +391,7 @@ def test_push_dataset_to_hub_custom_features_image(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -413,7 +413,7 @@ def test_push_dataset_to_hub_custom_features_image_list(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -433,7 +433,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo): local_ds = DatasetDict({"test": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -444,7 +444,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo): def test_push_dataset_to_hub_custom_splits(self, temporary_repo): ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, split="random", token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -454,7 +454,7 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo): def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo): ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api: # Initial push ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB") @@ -479,7 +479,7 @@ def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo): def test_push_dataset_to_hub_multiple_splits_one_by_one(self, temporary_repo): ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds.push_to_hub(ds_name, split="train", token=self._token) ds.push_to_hub(ds_name, split="test", token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -493,7 +493,7 @@ def test_push_dataset_dict_to_hub_custom_splits(self, temporary_repo): local_ds = DatasetDict({"random": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -509,7 +509,7 @@ def test_push_streaming_dataset_dict_to_hub(self, temporary_repo): local_ds.save_to_disk(tmp) local_ds = load_dataset(tmp, streaming=True) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -522,7 +522,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset_builder(self, tempora ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -556,7 +556,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset(self, temporary_repo) ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -600,7 +600,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -639,7 +639,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset_builder(self, te ds_config1 = DatasetDict({"random": ds_config1}) ds_config2 = DatasetDict({"random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -676,7 +676,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset(self, temporary_ ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1}) ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -729,7 +729,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self, ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1}) ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -770,7 +770,7 @@ def test_push_dataset_to_hub_with_config_no_metadata_configs(self, temporary_rep ds.to_parquet(parquet_buf) parquet_content = parquet_buf.getvalue() - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: self._api.create_repo(ds_name, token=self._token, repo_type="dataset") # old push_to_hub was uploading the parquet files only - without metadata configs self._api.upload_file( @@ -804,7 +804,7 @@ def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporar local_ds_another_config = DatasetDict({"random": ds_another_config}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name: + with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: self._api.create_repo(ds_name, token=self._token, repo_type="dataset") # old push_to_hub was uploading the parquet files only - without metadata configs self._api.upload_file( From 0082342ac792a05f4a615e4985d1c791e155115a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 27 Sep 2023 09:44:11 +0200 Subject: [PATCH 2/5] Refactor temporary_repo fixture --- tests/fixtures/hub.py | 4 ++- tests/test_upstream_hub.py | 58 ++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index bda110406c0..821aa380400 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -1,6 +1,7 @@ import time from contextlib import contextmanager from pathlib import Path +from typing import Optional import pytest import requests @@ -63,7 +64,8 @@ def _cleanup_repo(repo_id): @pytest.fixture def temporary_repo(cleanup_repo): @contextmanager - def _temporary_repo(repo_id): + def _temporary_repo(repo_id: Optional[str] = None): + repo_id = repo_id or f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}" try: yield repo_id finally: diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 8d133988099..6d4b33f85c5 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -43,7 +43,7 @@ def test_push_dataset_dict_to_hub_no_token(self, temporary_repo, set_ci_hub_acce local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -65,7 +65,7 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -101,7 +101,7 @@ def test_push_dataset_dict_to_hub_private(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token, private=True) hub_ds = load_dataset(ds_name, download_mode="force_redownload", token=self._token) @@ -123,7 +123,7 @@ def test_push_dataset_dict_to_hub(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -145,7 +145,7 @@ def test_push_dataset_dict_to_hub_multiple_files(self, temporary_repo): local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: with patch("datasets.config.MAX_SHARD_SIZE", "16KB"): local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -174,7 +174,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_max_shard_size(self, tempo local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token, max_shard_size="16KB") hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -202,7 +202,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_num_shards(self, temporary local_ds = DatasetDict({"train": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token, num_shards={"train": 2}) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -231,11 +231,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo): local_ds = DatasetDict({"train": ds, "random": ds2}) - ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}" - # Push to hub two times, but the second time with a larger amount of files. # Verify that the new files contain the correct dataset. - with temporary_repo(ds_name) as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token) with tempfile.TemporaryDirectory() as tmp: @@ -284,7 +282,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo): # Push to hub two times, but the second time with fewer files. # Verify that the new files contain the correct dataset and that non-necessary files have been deleted. - with temporary_repo(ds_name) as ds_name: + with temporary_repo(ds_name): local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5) with tempfile.TemporaryDirectory() as tmp: @@ -332,7 +330,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo): def test_push_dataset_to_hub(self, temporary_repo): local_ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, split="train", token=self._token) local_ds_dict = {"train": local_ds} hub_ds_dict = load_dataset(ds_name, download_mode="force_redownload") @@ -350,7 +348,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo): features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])}) ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -367,7 +365,7 @@ def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -391,7 +389,7 @@ def test_push_dataset_to_hub_custom_features_image(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -413,7 +411,7 @@ def test_push_dataset_to_hub_custom_features_image_list(self, temporary_repo): ds = Dataset.from_dict(data, features=features) for embed_external_files in [True, False]: - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token) hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload") @@ -433,7 +431,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo): local_ds = DatasetDict({"test": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -444,7 +442,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo): def test_push_dataset_to_hub_custom_splits(self, temporary_repo): ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, split="random", token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -454,7 +452,7 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo): def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo): ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api: # Initial push ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB") @@ -479,7 +477,7 @@ def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo): def test_push_dataset_to_hub_multiple_splits_one_by_one(self, temporary_repo): ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds.push_to_hub(ds_name, split="train", token=self._token) ds.push_to_hub(ds_name, split="test", token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -493,7 +491,7 @@ def test_push_dataset_dict_to_hub_custom_splits(self, temporary_repo): local_ds = DatasetDict({"random": ds}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -509,7 +507,7 @@ def test_push_streaming_dataset_dict_to_hub(self, temporary_repo): local_ds.save_to_disk(tmp) local_ds = load_dataset(tmp, streaming=True) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: local_ds.push_to_hub(ds_name, token=self._token) hub_ds = load_dataset(ds_name, download_mode="force_redownload") @@ -522,7 +520,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset_builder(self, tempora ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -556,7 +554,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset(self, temporary_repo) ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -600,7 +598,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}) ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -639,7 +637,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset_builder(self, te ds_config1 = DatasetDict({"random": ds_config1}) ds_config2 = DatasetDict({"random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -676,7 +674,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset(self, temporary_ ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1}) ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -729,7 +727,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self, ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1}) ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: ds_default.push_to_hub(ds_name, token=self._token) ds_config1.push_to_hub(ds_name, "config1", token=self._token) ds_config2.push_to_hub(ds_name, "config2", token=self._token) @@ -770,7 +768,7 @@ def test_push_dataset_to_hub_with_config_no_metadata_configs(self, temporary_rep ds.to_parquet(parquet_buf) parquet_content = parquet_buf.getvalue() - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: self._api.create_repo(ds_name, token=self._token, repo_type="dataset") # old push_to_hub was uploading the parquet files only - without metadata configs self._api.upload_file( @@ -804,7 +802,7 @@ def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporar local_ds_another_config = DatasetDict({"random": ds_another_config}) - with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}") as ds_name: + with temporary_repo() as ds_name: self._api.create_repo(ds_name, token=self._token, repo_type="dataset") # old push_to_hub was uploading the parquet files only - without metadata configs self._api.upload_file( From c722eb75a6cc56eac530c44a17ff679ca805aa89 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 27 Sep 2023 10:43:50 +0200 Subject: [PATCH 3/5] Catch RepositoryNotFoundError --- tests/fixtures/hub.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index 821aa380400..5171fe56c37 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -5,7 +5,7 @@ import pytest import requests -from huggingface_hub.hf_api import HfApi, HfFolder +from huggingface_hub.hf_api import HfApi, HfFolder, RepositoryNotFoundError CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__" @@ -69,7 +69,10 @@ def _temporary_repo(repo_id: Optional[str] = None): try: yield repo_id finally: - cleanup_repo(repo_id) + try: + cleanup_repo(repo_id) + except RepositoryNotFoundError: + pass return _temporary_repo From ef5751522c424c758df0647ff9a449b8b0404b6a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 27 Sep 2023 11:17:55 +0200 Subject: [PATCH 4/5] Increase precision in private fixtures --- tests/fixtures/hub.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index 5171fe56c37..31f6cae330e 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -79,7 +79,7 @@ def _temporary_repo(repo_id: Optional[str] = None): @pytest.fixture(scope="session") def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file): - repo_name = f"repo_txt_data-{int(time.time() * 10e3)}" + repo_name = f"repo_txt_data-{int(time.time() * 10e6)}" repo_id = f"{CI_HUB_USER}/{repo_name}" hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) hf_api.upload_file( @@ -103,7 +103,7 @@ def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_c @pytest.fixture(scope="session") def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path): - repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e3)}" + repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e6)}" repo_id = f"{CI_HUB_USER}/{repo_name}" hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) hf_api.upload_file( @@ -129,7 +129,7 @@ def hf_private_dataset_repo_zipped_txt_data( @pytest.fixture(scope="session") def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path): - repo_name = f"repo_zipped_img_data-{int(time.time() * 10e3)}" + repo_name = f"repo_zipped_img_data-{int(time.time() * 10e6)}" repo_id = f"{CI_HUB_USER}/{repo_name}" hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True) hf_api.upload_file( From ad876e8908188dcd56759a35c4da182bf121535a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 28 Sep 2023 08:42:01 +0200 Subject: [PATCH 5/5] Align with huggingface-hub --- tests/fixtures/hub.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index 31f6cae330e..9bd8a162da5 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -1,4 +1,5 @@ import time +import uuid from contextlib import contextmanager from pathlib import Path from typing import Optional @@ -65,7 +66,7 @@ def _cleanup_repo(repo_id): def temporary_repo(cleanup_repo): @contextmanager def _temporary_repo(repo_id: Optional[str] = None): - repo_id = repo_id or f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}" + repo_id = repo_id or f"{CI_HUB_USER}/test-dataset-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}" try: yield repo_id finally: