diff --git a/src/huggingface_hub/commands/delete_cache.py b/src/huggingface_hub/commands/delete_cache.py
index b2fc44d31c..efe1a819cc 100644
--- a/src/huggingface_hub/commands/delete_cache.py
+++ b/src/huggingface_hub/commands/delete_cache.py
@@ -18,6 +18,7 @@
     huggingface-cli delete-cache
     huggingface-cli delete-cache --disable-tui
     huggingface-cli delete-cache --dir ~/.cache/huggingface/hub
+    huggingface-cli delete-cache --sort size
 
 NOTE:
     This command is based on `InquirerPy` to build the multiselect menu in the terminal.
@@ -50,7 +51,6 @@
 TODO: add support for `huggingface-cli delete-cache aaaaaa bbbbbb cccccc (...)` ?
 TODO: add "--keep-last" arg to delete revisions that are not on `main` ref
 TODO: add "--filter" arg to filter repositories by name ?
-TODO: add "--sort" arg to sort by size ?
 TODO: add "--limit" arg to limit to X repos ?
 TODO: add "-y" arg for immediate deletion ?
 See discussions in https://github.com/huggingface/huggingface_hub/issues/1025.
@@ -120,11 +120,26 @@ def register_subcommand(parser: _SubParsersAction):
             ),
         )
 
+        delete_cache_parser.add_argument(
+            "--sort",
+            nargs="?",
+            choices=["size", "alphabetical", "lastUpdated", "lastUsed"],
+            const="size",
+            help=(
+                "Sort repositories by the specified criteria. Options: "
+                "'size' (largest first), "
+                "'alphabetical' (A-Z), "
+                "'lastUpdated' (newest first), "
+                "'lastUsed' (most recent first)"
+            ),
+        )
+
         delete_cache_parser.set_defaults(func=DeleteCacheCommand)
 
     def __init__(self, args: Namespace) -> None:
         self.cache_dir: Optional[str] = args.dir
         self.disable_tui: bool = args.disable_tui
+        self.sort_by: Optional[str] = args.sort
 
     def run(self):
         """Run `delete-cache` command with or without TUI."""
@@ -133,9 +148,9 @@ def run(self):
 
         # Manual review from the user
         if self.disable_tui:
-            selected_hashes = _manual_review_no_tui(hf_cache_info, preselected=[])
+            selected_hashes = _manual_review_no_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)
         else:
-            selected_hashes = _manual_review_tui(hf_cache_info, preselected=[])
+            selected_hashes = _manual_review_tui(hf_cache_info, preselected=[], sort_by=self.sort_by)
 
         # If deletion is not cancelled
         if len(selected_hashes) > 0 and _CANCEL_DELETION_STR not in selected_hashes:
@@ -164,13 +179,21 @@ def run(self):
 
 @require_inquirer_py
-def _manual_review_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) -> List[str]:
+def _manual_review_tui(
+    hf_cache_info: HFCacheInfo,
+    preselected: List[str],
+    sort_by: Optional[str] = None,
+) -> List[str]:
     """Ask the user for a manual review of the revisions to delete.
 
     Displays a multi-select menu in the terminal.
     """
     # Define multiselect list
-    choices = _get_tui_choices_from_scan(repos=hf_cache_info.repos, preselected=preselected)
+    choices = _get_tui_choices_from_scan(
+        repos=hf_cache_info.repos,
+        preselected=preselected,
+        sort_by=sort_by,
+    )
     checkbox = inquirer.checkbox(
         message="Select revisions to delete:",
         choices=choices,  # List of revisions with some pre-selection
         cycle=False,  # No loop between top and bottom
@@ -213,7 +236,11 @@ def _ask_for_confirmation_tui(message: str, default: bool = True) -> bool:
     return inquirer.confirm(message, default=default).execute()
 
 
-def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: List[str]) -> List:
+def _get_tui_choices_from_scan(
+    repos: Iterable[CachedRepoInfo],
+    preselected: List[str],
+    sort_by: Optional[str] = None,
+) -> List:
     """Build a list of choices from the scanned repos.
 
     Args:
@@ -221,14 +248,15 @@ def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: Lis
             List of scanned repos on which we want to delete revisions.
         preselected (*List[`str`]*):
             List of revision hashes that will be preselected.
+        sort_by (*Optional[str]*):
+            Sorting criteria. Choices: "size", "alphabetical", "lastUpdated", "lastUsed".
 
     Return:
         The list of choices to pass to `inquirer.checkbox`.
     """
     choices: List[Union[Choice, Separator]] = []
 
-    # First choice is to cancel the deletion. If selected, nothing will be deleted,
-    # no matter the other selected items.
+    # First choice is to cancel the deletion. If selected, nothing will be deleted, no matter the other items.
     choices.append(
         Choice(
             _CANCEL_DELETION_STR,
@@ -237,8 +265,9 @@ def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: Lis
         )
     )
 
-    # Display a separator per repo and a Choice for each revisions of the repo
-    for repo in sorted(repos, key=_repo_sorting_order):
+    # Display a separator per repo and a Choice for each revisions of the repo,
+    # sorted according to the requested criteria.
+    for repo in sorted(repos, key=lambda repo: _repo_sorting_order(repo, sort_by)):
         # Repo as separator
         choices.append(
             Separator(
@@ -264,7 +293,11 @@ def _get_tui_choices_from_scan(repos: Iterable[CachedRepoInfo], preselected: Lis
     return choices
 
 
-def _manual_review_no_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) -> List[str]:
+def _manual_review_no_tui(
+    hf_cache_info: HFCacheInfo,
+    preselected: List[str],
+    sort_by: Optional[str] = None,
+) -> List[str]:
     """Ask the user for a manual review of the revisions to delete.
 
     Used when TUI is disabled. Manual review happens in a separate tmp file that the
@@ -275,7 +308,8 @@ def _manual_review_no_tui(hf_cache_info: HFCacheInfo, preselected: List[str]) ->
     os.close(fd)
 
     lines = []
-    for repo in sorted(hf_cache_info.repos, key=_repo_sorting_order):
+    # Same ordering as in the TUI so both review modes stay consistent.
+    for repo in sorted(hf_cache_info.repos, key=lambda repo: _repo_sorting_order(repo, sort_by)):
         lines.append(
             f"\n# {repo.repo_type.capitalize()} {repo.repo_id} ({repo.size_on_disk_str},"
             f" used {repo.last_accessed_str})"
@@ -418,11 +452,24 @@ def _read_manual_review_tmp_file(tmp_path: str) -> List[str]:
 """.strip()
 
 
-def _repo_sorting_order(repo: CachedRepoInfo) -> Any:
-    # First split by Dataset/Model, then sort by last accessed (oldest first)
-    return (repo.repo_type, repo.last_accessed)
+def _repo_sorting_order(repo: CachedRepoInfo, sort_by: Optional[str] = None) -> Any:
+    """Return a sort key for `repo` according to `sort_by`.
+
+    Supported criteria: "size" (largest first), "alphabetical" (A-Z by type then name),
+    "lastUpdated" (newest revision first) and "lastUsed" (most recently accessed first).
+    By default, split by Dataset/Model, then sort by last accessed (oldest first).
+    """
+    if sort_by == "size":
+        return -repo.size_on_disk
+    if sort_by == "alphabetical":
+        return (repo.repo_type, repo.repo_id.lower())
+    if sort_by == "lastUpdated":
+        return -max(rev.last_modified for rev in repo.revisions)
+    if sort_by == "lastUsed":
+        return -repo.last_accessed
+    return (repo.repo_type, repo.last_accessed)
 
 
 def _revision_sorting_order(revision: CachedRevisionInfo) -> Any:
     # Sort by last modified (oldest first)
     return revision.last_modified
diff --git a/tests/test_command_delete_cache.py b/tests/test_command_delete_cache.py
index f5d3d90208..5550934136 100644
--- a/tests/test_command_delete_cache.py
+++ b/tests/test_command_delete_cache.py
@@ -23,7 +23,7 @@ class TestDeleteCacheHelpers(unittest.TestCase):
     def test_get_tui_choices_from_scan_empty(self) -> None:
-        choices = _get_tui_choices_from_scan(repos={}, preselected=[])
+        choices = _get_tui_choices_from_scan(repos={}, preselected=[], sort_by=None)
         self.assertEqual(len(choices), 1)
         self.assertIsInstance(choices[0], Choice)
         self.assertEqual(choices[0].value, _CANCEL_DELETION_STR)
 
     def test_get_tui_choices_from_scan_with_preselection(self) -> None:
@@ -38,6 +38,7 @@ def test_get_tui_choices_from_scan_with_preselection(self) -> None:
                 "a_revision_id_that_does_not_exist",  # unknown but will not complain
                 "older_hash_id",  # only the oldest revision from model_2
             ],
+            sort_by=None,  # default order: by repo type, then last accessed
         )
 
         self.assertEqual(len(choices), 8)
@@ -87,6 +88,20 @@ def test_get_tui_choices_from_scan_with_preselection(self) -> None:
         self.assertEqual(choices[7].name, "abcdef12: main, refs/pr/1 # modified 2 years ago")
         self.assertFalse(choices[7].enabled)
 
+    def test_get_tui_choices_from_scan_with_sort_size(self) -> None:
+        """Test sorting by size."""
+        choices = _get_tui_choices_from_scan(repos=_get_cache_mock().repos, preselected=[], sort_by="size")
+
+        # Verify repo order: gpt2 (3.6G) -> dummy_dataset (8M) -> dummy_model (1.4K)
+        self.assertIsInstance(choices[1], Separator)
+        self.assertIn("gpt2", choices[1]._line)
+
+        self.assertIsInstance(choices[3], Separator)
+        self.assertIn("dummy_dataset", choices[3]._line)
+
+        self.assertIsInstance(choices[5], Separator)
+        self.assertIn("dummy_model", choices[5]._line)
+
     def test_get_expectations_str_on_no_deletion_item(self) -> None:
         """Test `_get_instructions` when `_CANCEL_DELETION_STR` is passed."""
         self.assertEqual(
@@ -191,8 +206,7 @@ def _input_answers():
         # Run manual review
         with capture_output() as output:
             selected_hashes = _manual_review_no_tui(
-                hf_cache_info=cache_mock,
-                preselected=["abcdef123456789", "older_hash_id"],
+                hf_cache_info=cache_mock, preselected=["abcdef123456789", "older_hash_id"], sort_by=None
             )
 
         # Tmp file has been created but is now deleted
@@ -236,6 +250,50 @@ def test_ask_for_confirmation_no_tui(self, mock_input: Mock) -> None:
             "Invalid input. Must be one of ('y', 'yes', '1', 'n', 'no', '0', '')\n",
         )
 
+    def test_get_tui_choices_from_scan_with_different_sorts(self) -> None:
+        """Test different sorting modes."""
+        cache_mock = _get_cache_mock()
+
+        # Test size sorting (largest first) - order: gpt2 (3.6G) -> dummy_dataset (8M) -> dummy_model (1.4K)
+        size_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="size")
+        # Separators at positions 1, 3, 5
+        self.assertIsInstance(size_choices[1], Separator)
+        self.assertIn("gpt2", size_choices[1]._line)
+        self.assertIsInstance(size_choices[3], Separator)
+        self.assertIn("dummy_dataset", size_choices[3]._line)
+        self.assertIsInstance(size_choices[5], Separator)
+        self.assertIn("dummy_model", size_choices[5]._line)
+
+        # Test alphabetical sorting - order: dummy_dataset -> dummy_model -> gpt2
+        alpha_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="alphabetical")
+        # Separators at positions 1, 3, 6 (dummy_model has 2 revisions)
+        self.assertIsInstance(alpha_choices[1], Separator)
+        self.assertIn("dummy_dataset", alpha_choices[1]._line)
+        self.assertIsInstance(alpha_choices[3], Separator)
+        self.assertIn("dummy_model", alpha_choices[3]._line)
+        self.assertIsInstance(alpha_choices[6], Separator)
+        self.assertIn("gpt2", alpha_choices[6]._line)
+
+        # Test lastUpdated sorting - order: dummy_dataset (1 day) -> gpt2 (2 years) -> dummy_model (3 years)
+        updated_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="lastUpdated")
+        # Separators at positions 1, 3, 5
+        self.assertIsInstance(updated_choices[1], Separator)
+        self.assertIn("dummy_dataset", updated_choices[1]._line)
+        self.assertIsInstance(updated_choices[3], Separator)
+        self.assertIn("gpt2", updated_choices[3]._line)
+        self.assertIsInstance(updated_choices[5], Separator)
+        self.assertIn("dummy_model", updated_choices[5]._line)
+
+        # Test lastUsed sorting - order: gpt2 (2h) -> dummy_dataset (2w) -> dummy_model (2y)
+        used_choices = _get_tui_choices_from_scan(cache_mock.repos, [], sort_by="lastUsed")
+        # Separators at positions 1, 3, 5
+        self.assertIsInstance(used_choices[1], Separator)
+        self.assertIn("gpt2", used_choices[1]._line)
+        self.assertIsInstance(used_choices[3], Separator)
+        self.assertIn("dummy_dataset", used_choices[3]._line)
+        self.assertIsInstance(used_choices[5], Separator)
+        self.assertIn("dummy_model", used_choices[5]._line)
+
 
 @patch("huggingface_hub.commands.delete_cache._ask_for_confirmation_no_tui")
 @patch("huggingface_hub.commands.delete_cache._get_expectations_str")
@@ -254,6 +312,7 @@ class TestMockedDeleteCacheCommand(unittest.TestCase):
     def setUp(self) -> None:
         self.args = Mock()
+        self.args.sort = None
         self.command = DeleteCacheCommand(self.args)
 
     def test_run_and_delete_with_tui(
         self,
@@ -280,7 +339,7 @@ def test_run_and_delete_with_tui(
         cache_mock = mock_scan_cache_dir.return_value
 
         # Step 2: manual review
-        mock__manual_review_tui.assert_called_once_with(cache_mock, preselected=[])
+        mock__manual_review_tui.assert_called_once_with(cache_mock, preselected=[], sort_by=None)
 
         # Step 3: ask confirmation
         mock__get_expectations_str.assert_called_once_with(cache_mock, ["hash_1", "hash_2"])
@@ -352,7 +411,7 @@ def test_run_and_delete_no_tui(
         cache_mock = mock_scan_cache_dir.return_value
 
         # Step 2: manual review
-        mock__manual_review_no_tui.assert_called_once_with(cache_mock, preselected=[])
+        mock__manual_review_no_tui.assert_called_once_with(cache_mock, preselected=[], sort_by=None)
 
         # Step 3: ask confirmation
         mock__get_expectations_str.assert_called_once_with(cache_mock, ["hash_1", "hash_2"])
@@ -369,6 +428,30 @@ def test_run_and_delete_no_tui(
             "Start deletion.\nDone. Deleted 0 repo(s) and 0 revision(s) for a total of 5.1M.\n",
         )
 
+    def test_run_with_sorting(
+        self,
+        mock_scan_cache_dir: Mock,
+        mock__manual_review_tui: Mock,
+        mock__manual_review_no_tui: Mock,
+        mock__get_expectations_str: Mock,
+        mock__ask_for_confirmation_no_tui: Mock,
+    ) -> None:
+        """Test that `--sort` is forwarded to the manual review step."""
+        # Rebuild the command with a sort criteria set on the (mocked) CLI args.
+        self.args.sort = "size"
+        self.command = DeleteCacheCommand(self.args)
+        self.command.disable_tui = False
+
+        # Nothing selected during the review -> deletion step is skipped.
+        mock_scan_cache_dir.return_value = _get_cache_mock()
+        mock__manual_review_tui.return_value = []
+
+        self.command.run()
+
+        mock__manual_review_tui.assert_called_once_with(
+            mock_scan_cache_dir.return_value, preselected=[], sort_by="size"
+        )
+
 
 def _get_cache_mock() -> Mock:
     # First model with 1 revision
@@ -378,11 +461,12 @@ def _get_cache_mock() -> Mock:
     model_1.size_on_disk_str = "3.6G"
     model_1.last_accessed = 1660000000
     model_1.last_accessed_str = "2 hours ago"
+    model_1.size_on_disk = 3.6 * 1024**3  # 3.6 GiB
 
     model_1_revision_1 = Mock()
     model_1_revision_1.commit_hash = "abcdef123456789"
     model_1_revision_1.refs = {"main", "refs/pr/1"}
-    # model_1_revision_1.last_modified = 123456789  # timestamp
+    model_1_revision_1.last_modified = 123456789  # 2 years ago
     model_1_revision_1.last_modified_str = "2 years ago"
 
     model_1.revisions = {model_1_revision_1}
@@ -394,17 +478,18 @@ def _get_cache_mock() -> Mock:
     model_2.size_on_disk_str = "1.4K"
     model_2.last_accessed = 1550000000
     model_2.last_accessed_str = "2 years ago"
+    model_2.size_on_disk = 1.4 * 1024  # 1.4K
 
     model_2_revision_1 = Mock()
     model_2_revision_1.commit_hash = "recent_hash_id"
     model_2_revision_1.refs = {"main"}
-    model_2_revision_1.last_modified = 123456789  # newer timestamp
+    model_2_revision_1.last_modified = 123450000  # ~2 years ago, strictly older than gpt2's revision
     model_2_revision_1.last_modified_str = "2 years ago"
 
     model_2_revision_2 = Mock()
     model_2_revision_2.commit_hash = "older_hash_id"
     model_2_revision_2.refs = {}
-    model_2_revision_2.last_modified = 12345678  # older timestamp
+    model_2_revision_2.last_modified = 12345678  # 3 years ago
     model_2_revision_2.last_modified_str = "3 years ago"
 
     model_2.revisions = {model_2_revision_1, model_2_revision_2}
@@ -416,10 +501,12 @@ def _get_cache_mock() -> Mock:
     dataset_1.size_on_disk_str = "8M"
-    dataset_1.last_accessed = 1660000000
+    dataset_1.last_accessed = 1658800000  # "2 weeks ago": strictly older than gpt2's last access
     dataset_1.last_accessed_str = "2 weeks ago"
+    dataset_1.size_on_disk = 8 * 1024**2  # 8 MiB
 
     dataset_1_revision_1 = Mock()
     dataset_1_revision_1.commit_hash = "dataset_revision_hash_id"
     dataset_1_revision_1.refs = {}
+    dataset_1_revision_1.last_modified = 1234567890  # 1 day ago (newest)
    dataset_1_revision_1.last_modified_str = "1 day ago"
 
     dataset_1.revisions = {dataset_1_revision_1}