Fix CI 404 errors #6262

Merged Sep 28, 2023 · 6 commits
tests/fixtures/hub.py (18 changes: 12 additions & 6 deletions)
@@ -1,10 +1,12 @@
 import time
+import uuid
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Optional
 
 import pytest
 import requests
-from huggingface_hub.hf_api import HfApi, HfFolder
+from huggingface_hub.hf_api import HfApi, HfFolder, RepositoryNotFoundError
 
 
 CI_HUB_USER = "__DUMMY_TRANSFORMERS_USER__"
@@ -63,18 +65,22 @@ def _cleanup_repo(repo_id):
 @pytest.fixture
 def temporary_repo(cleanup_repo):
     @contextmanager
-    def _temporary_repo(repo_id):
+    def _temporary_repo(repo_id: Optional[str] = None):
+        repo_id = repo_id or f"{CI_HUB_USER}/test-dataset-{uuid.uuid4().hex[:6]}-{int(time.time() * 10e3)}"
         try:
             yield repo_id
         finally:
-            cleanup_repo(repo_id)
+            try:
+                cleanup_repo(repo_id)
+            except RepositoryNotFoundError:
+                pass
 
     return _temporary_repo


 @pytest.fixture(scope="session")
 def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file):
-    repo_name = f"repo_txt_data-{int(time.time() * 10e3)}"
+    repo_name = f"repo_txt_data-{int(time.time() * 10e6)}"
     repo_id = f"{CI_HUB_USER}/{repo_name}"
     hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
     hf_api.upload_file(
@@ -98,7 +104,7 @@ def hf_private_dataset_repo_txt_data(hf_private_dataset_repo_txt_data_, ci_hub_c

 @pytest.fixture(scope="session")
 def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_with_dir_path):
-    repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e3)}"
+    repo_name = f"repo_zipped_txt_data-{int(time.time() * 10e6)}"
     repo_id = f"{CI_HUB_USER}/{repo_name}"
     hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
     hf_api.upload_file(
@@ -124,7 +130,7 @@ def hf_private_dataset_repo_zipped_txt_data(

 @pytest.fixture(scope="session")
 def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_path):
-    repo_name = f"repo_zipped_img_data-{int(time.time() * 10e3)}"
+    repo_name = f"repo_zipped_img_data-{int(time.time() * 10e6)}"
     repo_id = f"{CI_HUB_USER}/{repo_name}"
     hf_api.create_repo(repo_id, token=hf_token, repo_type="dataset", private=True)
     hf_api.upload_file(
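The fixture changes above follow two defensive patterns: generated repo names gain entropy, and cleanup now tolerates a repo that is already gone (the RepositoryNotFoundError catch), so a test that deletes its own repo no longer fails teardown with a 404. A standalone sketch of the naming arithmetic (illustrative only, not code from the PR):

import time
import uuid

# 10e3 == 10_000 and 10e6 == 10_000_000, so the multiplier sets the timestamp
# granularity: ~0.1 ms per step before this PR, ~100 ns per step after.
t = time.time()
coarse = int(t * 10e3)  # two fixtures created within 0.1 ms could collide
fine = int(t * 10e6)    # a collision now needs sub-microsecond timing

# The generated default in _temporary_repo combines both entropy sources,
# a random hex fragment plus the timestamp:
print(f"__DUMMY_TRANSFORMERS_USER__/test-dataset-{uuid.uuid4().hex[:6]}-{coarse}")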
tests/test_upstream_hub.py (60 changes: 29 additions & 31 deletions)
@@ -43,7 +43,7 @@ def test_push_dataset_dict_to_hub_no_token(self, temporary_repo, set_ci_hub_acce
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -65,7 +65,7 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self, temporary_repo):
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -88,7 +88,7 @@ def test_push_dataset_dict_to_hub_datasets_with_different_features(self, cleanup
 
         local_ds = DatasetDict({"train": ds_train, "test": ds_test})
 
-        ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}"
+        ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e6)}"
         try:
             with pytest.raises(ValueError):
                 local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
@@ -101,7 +101,7 @@ def test_push_dataset_dict_to_hub_private(self, temporary_repo):
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token, private=True)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload", token=self._token)

@@ -123,7 +123,7 @@ def test_push_dataset_dict_to_hub(self, temporary_repo):
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -145,7 +145,7 @@ def test_push_dataset_dict_to_hub_multiple_files(self, temporary_repo):
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             with patch("datasets.config.MAX_SHARD_SIZE", "16KB"):
                 local_ds.push_to_hub(ds_name, token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")
@@ -174,7 +174,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_max_shard_size(self, tempo
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size="16KB")
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -202,7 +202,7 @@ def test_push_dataset_dict_to_hub_multiple_files_with_num_shards(self, temporary
 
         local_ds = DatasetDict({"train": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token, num_shards={"train": 2})
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -231,11 +231,9 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
 
         local_ds = DatasetDict({"train": ds, "random": ds2})
 
-        ds_name = f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}"
-
         # Push to hub two times, but the second time with a larger amount of files.
         # Verify that the new files contain the correct dataset.
-        with temporary_repo(ds_name) as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)
 
             with tempfile.TemporaryDirectory() as tmp:
@@ -284,7 +282,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
 
         # Push to hub two times, but the second time with fewer files.
         # Verify that the new files contain the correct dataset and that non-necessary files have been deleted.
-        with temporary_repo(ds_name) as ds_name:
+        with temporary_repo(ds_name):
             local_ds.push_to_hub(ds_name, token=self._token, max_shard_size=500 << 5)
 
             with tempfile.TemporaryDirectory() as tmp:
@@ -332,7 +330,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self, temporary_repo):
     def test_push_dataset_to_hub(self, temporary_repo):
         local_ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, split="train", token=self._token)
             local_ds_dict = {"train": local_ds}
             hub_ds_dict = load_dataset(ds_name, download_mode="force_redownload")
@@ -350,7 +348,7 @@ def test_push_dataset_to_hub_custom_features(self, temporary_repo):
         features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds.push_to_hub(ds_name, token=self._token)
             hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

@@ -367,7 +365,7 @@ def test_push_dataset_to_hub_custom_features_audio(self, temporary_repo):
         ds = Dataset.from_dict(data, features=features)
 
         for embed_external_files in [True, False]:
-            with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+            with temporary_repo() as ds_name:
                 ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                 hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

@@ -391,7 +389,7 @@ def test_push_dataset_to_hub_custom_features_image(self, temporary_repo):
         ds = Dataset.from_dict(data, features=features)
 
         for embed_external_files in [True, False]:
-            with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+            with temporary_repo() as ds_name:
                 ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                 hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

@@ -413,7 +411,7 @@ def test_push_dataset_to_hub_custom_features_image_list(self, temporary_repo):
         ds = Dataset.from_dict(data, features=features)
 
         for embed_external_files in [True, False]:
-            with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+            with temporary_repo() as ds_name:
                 ds.push_to_hub(ds_name, embed_external_files=embed_external_files, token=self._token)
                 hub_ds = load_dataset(ds_name, split="train", download_mode="force_redownload")

@@ -433,7 +431,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo):
 
         local_ds = DatasetDict({"test": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -444,7 +442,7 @@ def test_push_dataset_dict_to_hub_custom_features(self, temporary_repo):
     def test_push_dataset_to_hub_custom_splits(self, temporary_repo):
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds.push_to_hub(ds_name, split="random", token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -454,7 +452,7 @@ def test_push_dataset_to_hub_custom_splits(self, temporary_repo):
 
     def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo):
         ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             with patch("datasets.arrow_dataset.HfApi.upload_file", side_effect=self._api.upload_file) as mock_hf_api:
                 # Initial push
                 ds.push_to_hub(ds_name, token=self._token, max_shard_size="1KB")
@@ -479,7 +477,7 @@ def test_push_dataset_to_hub_skip_identical_files(self, temporary_repo):
 
     def test_push_dataset_to_hub_multiple_splits_one_by_one(self, temporary_repo):
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds.push_to_hub(ds_name, split="train", token=self._token)
             ds.push_to_hub(ds_name, split="test", token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")
@@ -493,7 +491,7 @@ def test_push_dataset_dict_to_hub_custom_splits(self, temporary_repo):
 
         local_ds = DatasetDict({"random": ds})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             local_ds.push_to_hub(ds_name, token=self._token)
             hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -509,7 +507,7 @@ def test_push_streaming_dataset_dict_to_hub(self, temporary_repo):
             local_ds.save_to_disk(tmp)
             local_ds = load_dataset(tmp, streaming=True)
 
-            with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+            with temporary_repo() as ds_name:
                 local_ds.push_to_hub(ds_name, token=self._token)
                 hub_ds = load_dataset(ds_name, download_mode="force_redownload")

@@ -522,7 +520,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset_builder(self, tempora
         ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
         ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -556,7 +554,7 @@ def test_push_multiple_dataset_configs_to_hub_load_dataset(self, temporary_repo)
         ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
         ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -600,7 +598,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(self, temp
         ds_config1 = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
         ds_config2 = Dataset.from_dict({"foo": [1, 2], "bar": [4, 5]})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -639,7 +637,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset_builder(self, te
         ds_config1 = DatasetDict({"random": ds_config1})
         ds_config2 = DatasetDict({"random": ds_config2})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -676,7 +674,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_load_dataset(self, temporary_
         ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1})
         ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -729,7 +727,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(self,
         ds_config1 = DatasetDict({"train": ds_config1, "random": ds_config1})
         ds_config2 = DatasetDict({"train": ds_config2, "random": ds_config2})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             ds_default.push_to_hub(ds_name, token=self._token)
             ds_config1.push_to_hub(ds_name, "config1", token=self._token)
             ds_config2.push_to_hub(ds_name, "config2", token=self._token)
@@ -770,7 +768,7 @@ def test_push_dataset_to_hub_with_config_no_metadata_configs(self, temporary_rep
         ds.to_parquet(parquet_buf)
         parquet_content = parquet_buf.getvalue()
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
             # old push_to_hub was uploading the parquet files only - without metadata configs
             self._api.upload_file(
@@ -804,7 +802,7 @@ def test_push_dataset_dict_to_hub_with_config_no_metadata_configs(self, temporar
 
         local_ds_another_config = DatasetDict({"random": ds_another_config})
 
-        with temporary_repo(f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}") as ds_name:
+        with temporary_repo() as ds_name:
             self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
             # old push_to_hub was uploading the parquet files only - without metadata configs
             self._api.upload_file(
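Taken together, the test-side change is mechanical: every hand-built f"{CI_HUB_USER}/test-{int(time.time() * 10e3)}" name becomes a bare temporary_repo() call, which now both names the repo and guarantees teardown. A simplified composite of the tests above (not a verbatim test from this PR; assumes the fixtures in tests/fixtures/hub.py are loaded via the suite's conftest):

from datasets import Dataset, DatasetDict, load_dataset


def test_push_roundtrip(temporary_repo):
    local_ds = DatasetDict({"train": Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})})
    with temporary_repo() as ds_name:  # unique name generated by the fixture
        local_ds.push_to_hub(ds_name)
        hub_ds = load_dataset(ds_name, download_mode="force_redownload")
        assert hub_ds["train"].num_rows == 3
    # teardown deletes the repo; RepositoryNotFoundError is swallowed, so a
    # test that already removed its repo no longer ends in a CI 404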