Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add --sort arg to delete-cache to sort by size #2815

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 61 additions & 19 deletions src/huggingface_hub/commands/delete_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
huggingface-cli delete-cache
huggingface-cli delete-cache --disable-tui
huggingface-cli delete-cache --dir ~/.cache/huggingface/hub
huggingface-cli delete-cache --sort
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
huggingface-cli delete-cache --sort
huggingface-cli delete-cache --sort=size

(to showcase how to use it)


NOTE:
This command is based on `InquirerPy` to build the multiselect menu in the terminal.
Expand Down Expand Up @@ -50,7 +51,6 @@
TODO: add support for `huggingface-cli delete-cache aaaaaa bbbbbb cccccc (...)` ?
TODO: add "--keep-last" arg to delete revisions that are not on `main` ref
TODO: add "--filter" arg to filter repositories by name ?
TODO: add "--sort" arg to sort by size ?
TODO: add "--limit" arg to limit to X repos ?
TODO: add "-y" arg for immediate deletion ?
See discussions in https://github.com/huggingface/huggingface_hub/issues/1025.
Expand Down Expand Up @@ -120,11 +120,26 @@ def register_subcommand(parser: _SubParsersAction):
),
)

delete_cache_parser.add_argument(
"--sort",
nargs="?",
choices=["size", "alphabetical", "lastUpdated", "lastUsed"],
const="size",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const="size",

By default, let's not sort

help=(
"Sort repositories by the specified criteria. Options: "
"'size' (largest first), "
"'alphabetical' (A-Z), "
"'lastUpdated' (newest first), "
"'lastUsed' (most recent first)"
),
)

delete_cache_parser.set_defaults(func=DeleteCacheCommand)

def __init__(self, args: Namespace) -> None:
self.cache_dir: Optional[str] = args.dir
self.disable_tui: bool = args.disable_tui
self.sort_by: Optional[str] = args.sort
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you define a SortingOption_T as the beginning of the module?

SortingOption_T = Literal["size", "alphabetical", "lastUpdated", "lastUsed"]

and then use it throughout the module like this?

Suggested change
self.sort_by: Optional[str] = args.sort
self.sort_by: Optional[SortingOption_T] = args.sort


def run(self):
"""Run `delete-cache` command with or without TUI."""
Expand All @@ -133,9 +148,9 @@ def run(self):

# Manual review from the user
if self.disable_tui:
selected_hashes = _manual_review_no_tui(hf_cache_info, preselected=[])
selected_hashes = _manual_review_no_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)
else:
selected_hashes = _manual_review_tui(hf_cache_info, preselected=[])
selected_hashes = _manual_review_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)

# If deletion is not cancelled
if len(selected_hashes) > 0 and _CANCEL_DELETION_STR not in selected_hashes:
Expand Down Expand Up @@ -164,13 +179,21 @@ def run(self):


@require_inquirer_py
def _manual_review_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) -> List[str]:
def _manual_review_tui(
hf_cache_info: HFCacheInfo,
preselected: List[str],
sort_by: Optional[str] = None,
) -> List[str]:
"""Ask the user for a manual review of the revisions to delete.

Displays a multi-select menu in the terminal (TUI).
"""
# Define multiselect list
choices = _get_tui_choices_from_scan(repos=hf_cache_info.repos, preselected=preselected)
choices = _get_tui_choices_from_scan(
repos=hf_cache_info.repos,
preselected=preselected,
sort_by=sort_by,
)
checkbox = inquirer.checkbox(
message="Select revisions to delete:",
choices=choices, # List of revisions with some pre-selection
Expand Down Expand Up @@ -213,22 +236,27 @@ def _ask_for_confirmation_tui(message: str, default: bool = True) -> bool:
return inquirer.confirm(message, default=default).execute()


def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: List[str]) -> List:
def _get_tui_choices_from_scan(
repos: Iterable[CachedRepoInfo],
preselected: List[str],
sort_by: Optional[str] = None,
) -> List:
"""Build a list of choices from the scanned repos.

Args:
repos (*Iterable[`CachedRepoInfo`]*):
List of scanned repos on which we want to delete revisions.
preselected (*List[`str`]*):
List of revision hashes that will be preselected.
sort_by (*Optional[str]*):
Sorting direction. Choices: "size", "alphabetical", "lastUpdated", "lastUsed".

Return:
The list of choices to pass to `inquirer.checkbox`.
"""
choices: List[Union[Choice, Separator]] = []

# First choice is to cancel the deletion. If selected, nothing will be deleted,
# no matter the other selected items.
# First choice is to cancel the deletion
choices.append(
Choice(
_CANCEL_DELETION_STR,
Expand All @@ -237,8 +265,19 @@ def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: Lis
)
)

# Display a separator per repo and a Choice for each revisions of the repo
for repo in sorted(repos, key=_repo_sorting_order):
# Sort repos based on specified criteria
sorted_repos = sorted(
repos,
key=lambda repo: {
"size": lambda r: -r.size_on_disk, # largest first
"alphabetical": lambda r: (r.repo_type, r.repo_id.lower()), # by type then name
"lastUpdated": lambda r: -max(rev.last_modified for rev in r.revisions), # newest first
"lastUsed": lambda r: -r.last_accessed, # most recently used first
None: lambda r: (r.repo_type, r.repo_id), # default stable order
}[sort_by](repo),
)

for repo in sorted_repos:
# Repo as separator
choices.append(
Separator(
Expand All @@ -264,7 +303,11 @@ def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: Lis
return choices


def _manual_review_no_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) -> List[str]:
def _manual_review_no_tui(
hf_cache_info: HFCacheInfo,
preselected: List[str],
sort_by: Optional[str] = None,
) -> List[str]:
"""Ask the user for a manual review of the revisions to delete.

Used when TUI is disabled. Manual review happens in a separate tmp file that the
Expand All @@ -275,7 +318,11 @@ def _manual_review_no_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) ->
os.close(fd)

lines = []
for repo in sorted(hf_cache_info.repos, key=_repo_sorting_order):
sorted_repos = sorted(
hf_cache_info.repos,
key=lambda repo: -repo.size_on_disk if sort_by == "size" else 1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move the sorting logic defined above in the TUI part to a separate method so that it can be reused here? So that the "no tui" path can also support all sorting options.

)
for repo in sorted_repos:
lines.append(
f"\n# {repo.repo_type.capitalize()} {repo.repo_id} ({repo.size_on_disk_str},"
f" used {repo.last_accessed_str})"
Expand Down Expand Up @@ -314,9 +361,9 @@ def _manual_review_no_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) ->
):
break

# 4. Return selected_hashes
# 4. Return selected_hashes sorted to maintain stable order
os.remove(tmp_path)
return selected_hashes
return sorted(selected_hashes) # Sort to maintain stable order


def _ask_for_confirmation_no_tui(message: str, default: bool = True) -> bool:
Expand Down Expand Up @@ -418,11 +465,6 @@ def _read_manual_review_tmp_file(tmp_path: str) -> List[str]:
""".strip()


def _repo_sorting_order(repo: CachedRepoInfo) -> Any:
# First split by Dataset/Model, then sort by last accessed (oldest first)
return (repo.repo_type, repo.last_accessed)


def _revision_sorting_order(revision: CachedRevisionInfo) -> Any:
# Sort by last modified (oldest first)
return revision.last_modified
96 changes: 88 additions & 8 deletions tests/test_command_delete_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

class TestDeleteCacheHelpers(unittest.TestCase):
def test_get_tui_choices_from_scan_empty(self) -> None:
choices = _get_tui_choices_from_scan(repos={}, preselected=[])
choices = _get_tui_choices_from_scan(repos={}, preselected=[], sort_by=None)
self.assertEqual(len(choices), 1)
self.assertIsInstance(choices[0], Choice)
self.assertEqual(choices[0].value, _CANCEL_DELETION_STR)
Expand All @@ -38,6 +38,7 @@ def test_get_tui_choices_from_scan_with_preselection(self) -> None:
"a_revision_id_that_does_not_exist", # unknown but will not complain
"older_hash_id", # only the oldest revision from model_2
],
sort_by=None, # Don't sort to maintain original order
)
self.assertEqual(len(choices), 8)

Expand Down Expand Up @@ -87,6 +88,20 @@ def test_get_tui_choices_from_scan_with_preselection(self) -> None:
self.assertEqual(choices[7].name, "abcdef12: main, refs/pr/1 # modified 2 years ago")
self.assertFalse(choices[7].enabled)

def test_get_tui_choices_from_scan_with_sort_size(self) -> None:
"""Test sorting by size."""
choices = _get_tui_choices_from_scan(repos=_get_cache_mock().repos, preselected=[], sort_by="size")

# Verify repo order: gpt2 (3.6G) -> dummy_dataset (8M) -> dummy_model (1.4K)
self.assertIsInstance(choices[1], Separator)
self.assertIn("gpt2", choices[1]._line)

self.assertIsInstance(choices[3], Separator)
self.assertIn("dummy_dataset", choices[3]._line)

self.assertIsInstance(choices[5], Separator)
self.assertIn("dummy_model", choices[5]._line)

def test_get_expectations_str_on_no_deletion_item(self) -> None:
"""Test `_get_instructions` when `_CANCEL_DELETION_STR` is passed."""
self.assertEqual(
Expand Down Expand Up @@ -191,8 +206,7 @@ def _input_answers():
# Run manual review
with capture_output() as output:
selected_hashes = _manual_review_no_tui(
hf_cache_info=cache_mock,
preselected=["abcdef123456789", "older_hash_id"],
hf_cache_info=cache_mock, preselected=["abcdef123456789", "older_hash_id"], sort_by=None
)

# Tmp file has been created but is now deleted
Expand Down Expand Up @@ -236,6 +250,50 @@ def test_ask_for_confirmation_no_tui(self, mock_input: Mock) -> None:
"Invalid input. Must be one of ('y', 'yes', '1', 'n', 'no', '0', '')\n",
)

def test_get_tui_choices_from_scan_with_different_sorts(self) -> None:
"""Test different sorting modes."""
cache_mock = _get_cache_mock()

# Test size sorting (largest first) - order: gpt2 (3.6G) -> dummy_dataset (8M) -> dummy_model (1.4K)
size_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="size")
# Separators at positions 1, 3, 5
self.assertIsInstance(size_choices[1], Separator)
self.assertIn("gpt2", size_choices[1]._line)
self.assertIsInstance(size_choices[3], Separator)
self.assertIn("dummy_dataset", size_choices[3]._line)
self.assertIsInstance(size_choices[5], Separator)
self.assertIn("dummy_model", size_choices[5]._line)

# Test alphabetical sorting - order: dummy_dataset -> dummy_model -> gpt2
alpha_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="alphabetical")
# Separators at positions 1, 3, 6 (dummy_model has 2 revisions)
self.assertIsInstance(alpha_choices[1], Separator)
self.assertIn("dummy_dataset", alpha_choices[1]._line)
self.assertIsInstance(alpha_choices[3], Separator)
self.assertIn("dummy_model", alpha_choices[3]._line)
self.assertIsInstance(alpha_choices[6], Separator)
self.assertIn("gpt2", alpha_choices[6]._line)

# Test lastUpdated sorting - order: dummy_dataset (1 day) -> gpt2 (2 years) -> dummy_model (3 years)
updated_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="lastUpdated")
# Separators at positions 1, 3, 5
self.assertIsInstance(updated_choices[1], Separator)
self.assertIn("dummy_dataset", updated_choices[1]._line)
self.assertIsInstance(updated_choices[3], Separator)
self.assertIn("gpt2", updated_choices[3]._line)
self.assertIsInstance(updated_choices[5], Separator)
self.assertIn("dummy_model", updated_choices[5]._line)

# Test lastUsed sorting - order: gpt2 (2h) -> dummy_dataset (2w) -> dummy_model (2y)
used_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="lastUsed")
# Separators at positions 1, 3, 5
self.assertIsInstance(used_choices[1], Separator)
self.assertIn("gpt2", used_choices[1]._line)
self.assertIsInstance(used_choices[3], Separator)
self.assertIn("dummy_dataset", used_choices[3]._line)
self.assertIsInstance(used_choices[5], Separator)
self.assertIn("dummy_model", used_choices[5]._line)


@patch("huggingface_hub.commands.delete_cache._ask_for_confirmation_no_tui")
@patch("huggingface_hub.commands.delete_cache._get_expectations_str")
Expand All @@ -254,6 +312,7 @@ class TestMockedDeleteCacheCommand(unittest.TestCase):

def setUp(self) -> None:
self.args = Mock()
self.args.sort = None
self.command = DeleteCacheCommand(self.args)

def test_run_and_delete_with_tui(
Expand All @@ -280,7 +339,7 @@ def test_run_and_delete_with_tui(
cache_mock = mock_scan_cache_dir.return_value

# Step 2: manual review
mock__manual_review_tui.assert_called_once_with(cache_mock, preselected=[])
mock__manual_review_tui.assert_called_once_with(cache_mock, preselected=[], sort_by=None)

# Step 3: ask confirmation
mock__get_expectations_str.assert_called_once_with(cache_mock, ["hash_1", "hash_2"])
Expand Down Expand Up @@ -352,7 +411,7 @@ def test_run_and_delete_no_tui(
cache_mock = mock_scan_cache_dir.return_value

# Step 2: manual review
mock__manual_review_no_tui.assert_called_once_with(cache_mock, preselected=[])
mock__manual_review_no_tui.assert_called_once_with(cache_mock, preselected=[], sort_by=None)

# Step 3: ask confirmation
mock__get_expectations_str.assert_called_once_with(cache_mock, ["hash_1", "hash_2"])
Expand All @@ -369,6 +428,23 @@ def test_run_and_delete_no_tui(
"Start deletion.\nDone. Deleted 0 repo(s) and 0 revision(s) for a total of 5.1M.\n",
)

def test_run_with_sorting(self):
"""Test command run with sorting enabled."""
self.args.sort = "size"
self.command = DeleteCacheCommand(self.args)

mock_scan_cache_dir = Mock()
mock_scan_cache_dir.return_value = _get_cache_mock()

with (
patch("huggingface_hub.commands.delete_cache.scan_cache_dir", mock_scan_cache_dir),
patch("huggingface_hub.commands.delete_cache._manual_review_tui") as mock_review,
):
self.command.disable_tui = False
self.command.run()

mock_review.assert_called_once_with(mock_scan_cache_dir.return_value, preselected=[], sort_by="size")


def _get_cache_mock() -> Mock:
# First model with 1 revision
Expand All @@ -378,11 +454,12 @@ def _get_cache_mock() -> Mock:
model_1.size_on_disk_str = "3.6G"
model_1.last_accessed = 1660000000
model_1.last_accessed_str = "2 hours ago"
model_1.size_on_disk = 3.6 * 1024**3 # 3.6 GiB

model_1_revision_1 = Mock()
model_1_revision_1.commit_hash = "abcdef123456789"
model_1_revision_1.refs = {"main", "refs/pr/1"}
# model_1_revision_1.last_modified = 123456789 # timestamp
model_1_revision_1.last_modified = 123456789 # 2 years ago
model_1_revision_1.last_modified_str = "2 years ago"

model_1.revisions = {model_1_revision_1}
Expand All @@ -394,17 +471,18 @@ def _get_cache_mock() -> Mock:
model_2.size_on_disk_str = "1.4K"
model_2.last_accessed = 1550000000
model_2.last_accessed_str = "2 years ago"
model_2.size_on_disk = 1.4 * 1024 # 1.4K

model_2_revision_1 = Mock()
model_2_revision_1.commit_hash = "recent_hash_id"
model_2_revision_1.refs = {"main"}
model_2_revision_1.last_modified = 123456789 # newer timestamp
model_2_revision_1.last_modified = 123456789 # 2 years ago
model_2_revision_1.last_modified_str = "2 years ago"

model_2_revision_2 = Mock()
model_2_revision_2.commit_hash = "older_hash_id"
model_2_revision_2.refs = {}
model_2_revision_2.last_modified = 12345678 # older timestamp
model_2_revision_2.last_modified = 12345678 # 3 years ago
model_2_revision_2.last_modified_str = "3 years ago"

model_2.revisions = {model_2_revision_1, model_2_revision_2}
Expand All @@ -416,10 +494,12 @@ def _get_cache_mock() -> Mock:
dataset_1.size_on_disk_str = "8M"
dataset_1.last_accessed = 1660000000
dataset_1.last_accessed_str = "2 weeks ago"
dataset_1.size_on_disk = 8 * 1024**2 # 8 MiB

dataset_1_revision_1 = Mock()
dataset_1_revision_1.commit_hash = "dataset_revision_hash_id"
dataset_1_revision_1.refs = {}
dataset_1_revision_1.last_modified = 1234567890 # 1 day ago (newest)
dataset_1_revision_1.last_modified_str = "1 day ago"

dataset_1.revisions = {dataset_1_revision_1}
Expand Down