From 382dab9d011989fe62661f20a3e701790d69c6bf Mon Sep 17 00:00:00 2001
From: ianwal <52143079+ianwal@users.noreply.github.com>
Date: Sun, 21 Jul 2024 17:44:21 -0700
Subject: [PATCH] Add keyboard interrupt handling to the tree-building and
 search stages; move the custom query to the constructor

---
 src/hydrusvideodeduplicator/__main__.py    |  5 +-
 src/hydrusvideodeduplicator/db/DedupeDB.py | 28 +++++----
 src/hydrusvideodeduplicator/dedup.py       | 66 ++++++++++++++--------
 3 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/src/hydrusvideodeduplicator/__main__.py b/src/hydrusvideodeduplicator/__main__.py
index 83896b4..7779116 100644
--- a/src/hydrusvideodeduplicator/__main__.py
+++ b/src/hydrusvideodeduplicator/__main__.py
@@ -165,7 +165,9 @@ def exit_from_failure() -> NoReturn:
         db.commit()
         db_stats = DedupeDB.get_db_stats(db)
 
-    deduper = HydrusVideoDeduplicator(db, client=hvdclient, job_count=job_count, failed_page_name=failed_page_name)
+    deduper = HydrusVideoDeduplicator(
+        db, client=hvdclient, job_count=job_count, failed_page_name=failed_page_name, custom_query=query
+    )
 
     if debug:
         deduper.hydlog.setLevel(logging.DEBUG)
@@ -181,7 +183,6 @@ def exit_from_failure() -> NoReturn:
 
     deduper.deduplicate(
         overwrite=overwrite,
-        custom_query=query,
         skip_hashing=skip_hashing,
     )
 
diff --git a/src/hydrusvideodeduplicator/db/DedupeDB.py b/src/hydrusvideodeduplicator/db/DedupeDB.py
index 640c9b0..32a5012 100644
--- a/src/hydrusvideodeduplicator/db/DedupeDB.py
+++ b/src/hydrusvideodeduplicator/db/DedupeDB.py
@@ -566,6 +566,7 @@ def print_upgrade(version: str, new_version: str):
         if not self.does_need_upgrade():
             return
 
+        # TODO: Change 0.6.9 back to 0.7.0 before release.
         if SemanticVersion(version) < SemanticVersion("0.6.9"):
             print_upgrade(version, "0.6.9")
 
@@ -597,8 +598,12 @@ def print_upgrade(version: str, new_version: str):
                 "CREATE TABLE IF NOT EXISTS phashed_file_queue ( file_hash BLOB_BYTES NOT NULL UNIQUE, phash BLOB_BYTES NOT NULL, PRIMARY KEY ( file_hash, phash ) )"  # noqa: E501
             )
 
-            # Insert the files from the old videos table into the DB and the newly added vptree.
+            # Insert the files from the old videos table into the hash queue.
             old_videos_data = []
+            print(
+                "Migrating perceptually hashed videos from the old table.\n"
+                "This may take a few minutes, depending on the size of your database."
+            )
             with SqliteDict(
                 get_db_file_path(), tablename="videos", flag="c", autocommit=False, outer_stack=False
             ) as videos_table:
@@ -607,21 +612,14 @@ def print_upgrade(version: str, new_version: str):
                     row = videos_table[video_hash]
                     if "perceptual_hash" in row:
                         old_videos_data.append((video_hash, row["perceptual_hash"]))
-            # TODO: Should we move the farthest search index as well?
-
-            with tqdm(
-                dynamic_ncols=True,
-                total=len(old_videos_data),
-                desc="Migrating phashes to vptree...",
-                unit="file",
-                colour="BLUE",
-            ) as pbar:
-                for video_hash, perceptual_hash in old_videos_data:
-                    # TODO: If these functions change this upgrade may not work! We need to be careful about updating them.  # noqa: E501
-                    # An upgrade cutoff at some point to prevent bitrot is a good idea, which is what Hydrus does.
-                    self.add_to_phashed_files_queue(video_hash, perceptual_hash)
-                    pbar.update(1)
+            # The farthest search index will not be moved.
+
+            for video_hash, perceptual_hash in old_videos_data:
+                # TODO: If these functions change, this upgrade may not work! We need to be careful about updating them.  # noqa: E501
+                # An upgrade cutoff at some point to prevent bitrot is a good idea, which is what Hydrus does.
+                self.add_to_phashed_files_queue(video_hash, perceptual_hash)
 
+            # TODO: Change 0.6.9 back to 0.7.0 before release.
             self.set_version("0.6.9")
             # Note: We need to keep re-running get_version so that we can progressively upgrade.
             version = self.get_version()
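Note on the migration above: it now only refills the phashed_file_queue table;
the vptree itself is rebuilt later by process_phashed_file_queue() in dedup.py
(below). The body of add_to_phashed_files_queue() is not part of this patch; a
minimal sketch of what it is assumed to do, given the UNIQUE constraint on
file_hash in the CREATE TABLE above (an idempotent upsert, so re-running an
interrupted migration cannot duplicate rows):

    def add_to_phashed_files_queue(self, file_hash, phash) -> None:
        # file_hash is UNIQUE in phashed_file_queue, so OR REPLACE makes this
        # safe to call repeatedly; the newest phash for a file wins.
        self.execute(
            "INSERT OR REPLACE INTO phashed_file_queue ( file_hash, phash ) VALUES ( :file_hash, :phash )",
            {"file_hash": file_hash, "phash": phash},
        )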
diff --git a/src/hydrusvideodeduplicator/dedup.py b/src/hydrusvideodeduplicator/dedup.py
index 52149ca..adcc2b8 100644
--- a/src/hydrusvideodeduplicator/dedup.py
+++ b/src/hydrusvideodeduplicator/dedup.py
@@ -127,26 +127,15 @@ def __init__(
         client: HVDClient,
         job_count: int = -2,
         failed_page_name: str | None = None,
+        custom_query: Sequence[str] | None = None,
     ):
         self.db = db
         self.client = client
         self.job_count = job_count
         self.page_logger = None if failed_page_name is None else HydrusPageLogger(self.client, failed_page_name)
+        self.search_tags = self.get_search_tags(custom_query)
 
-    def deduplicate(
-        self,
-        overwrite: bool = False,
-        custom_query: Sequence[str] | None = None,
-        skip_hashing: bool = False,
-    ) -> None:
-        """
-        Run all deduplicate functions:
-        1. Retrieve video hashes
-        2. Calculate perceptual hashes
-        3. Find potential duplicates
-        """
-
-        # Add perceptual hashes to video files
+    def get_search_tags(self, custom_query: Sequence[str] | None) -> list[str]:
         # system:filetype tags are really inconsistent
         search_tags = [
             "system:filetype=video, gif, apng",
@@ -160,18 +149,39 @@
         if len(custom_query) > 0:
             search_tags.extend(custom_query)
             print(f"[yellow] Custom Query: {custom_query}")
+        return search_tags
+
+    def deduplicate(
+        self,
+        overwrite: bool = False,
+        skip_hashing: bool = False,
+    ) -> None:
+        """
+        Run the full deduplication process.
+
+        Dedupe Algorithm:
+        1. Perceptually hash the videos.
+        2. Insert the perceptual hashes into the vptree.
+        3. Search for similar videos in the vptree.
+        4. Mark the similar videos as potential duplicates in Hydrus.
+        """
         if skip_hashing:
             print("[yellow] Skipping perceptual hashing")
         else:
-            video_hashes = list(self.client.get_video_hashes(search_tags))
+            video_hashes = list(self.client.get_video_hashes(self.search_tags))
             if not overwrite:
                 video_hashes = self.filter_unhashed(video_hashes)
             print(f"[blue] Found {len(video_hashes)} eligible files to perceptually hash.")
             self.add_perceptual_hashes_to_db(video_hashes)
 
         # Insert the perceptual hashed files into the vptree.
-        self.process_phashed_file_queue()
+        try:
+            self.process_phashed_file_queue()
+        except KeyboardInterrupt:
+            print("[yellow] Building the search tree was interrupted! Progress was saved.")
+        else:
+            print("[green] Finished fully building the search tree.")
 
         # Number of potential duplicates before adding more.
         # This is just to print info for the user.
@@ -179,8 +189,10 @@
         # while this is running.
         pre_dedupe_count = self.client.get_potential_duplicate_count_hydrus()
 
-        # new:
-        self.find_potential_duplicates()
+        try:
+            self.find_potential_duplicates()
+        except KeyboardInterrupt:
+            print("[yellow] Searching for duplicates was interrupted! Progress was saved.")
 
         # Statistics for user
         post_dedupe_count = self.client.get_potential_duplicate_count_hydrus()
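Note on the constructor change: custom_query is now resolved once, at
construction time, and deduplicate() only takes behavior flags. A minimal usage
sketch (db and hvdclient as set up in __main__.py above; the query value here
is a hypothetical example, not from this patch):

    deduper = HydrusVideoDeduplicator(
        db,
        client=hvdclient,
        custom_query=["system:inbox"],  # hypothetical user-supplied query
    )
    deduper.deduplicate(overwrite=False, skip_hashing=False)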
@@ -214,7 +226,7 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
             total=len(video_hashes),
             desc="Perceptually hashing files",
             dynamic_ncols=True,
-            unit="video",
+            unit="file",
             colour="BLUE",
         ) as pbar:
             filehasher = FileHasher(self.client)
@@ -245,19 +257,22 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
                     pbar.update(1)
 
         except KeyboardInterrupt:
-            print("[yellow] Perceptual hash processing was interrupted!")
+            print("[yellow] Perceptual hash processing was interrupted! Progress was saved.")
 
         else:
             print("[green] Finished perceptual hash processing.")
 
         finally:
+            # Print some useful stats and info for users
             total_failures = failed_from_api_errors_count + failed_from_phash_count
             if total_failures > 0:
                 print(f"[yellow] Perceptual hash processing had {total_failures} total failed files.")
+
                 if failed_from_api_errors_count > 0:
                     print(
                         f"[yellow] {failed_from_api_errors_count} failures were due to API errors. Ensure Hydrus is running and accessible before trying again."  # noqa: E501
                     )
+
                 if failed_from_phash_count > 0:
                     print(
                         f"[yellow] {failed_from_phash_count} failures were from an error during perceptual hashing. Are the files corrupted?"  # noqa: E501
@@ -267,7 +282,7 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
                         "creating a Hydrus page with the name 'failed' and "
                         "running the program with '--failed-page-name=failed'\n"
                     )
-        print(f"[green] Added {success_hash_count} new videos to the perceptual hash database.")
+        print(f"[green] Added {success_hash_count} new perceptual hashes to the database.")
 
     def compare_videos(self, video1_hash: str, video2_hash: str, video1_phash: str, video2_phash: str) -> None:
         """Compare videos and mark them as potential duplicates in Hydrus if they are similar."""
@@ -302,7 +317,7 @@ def process_phashed_file_queue(self):
         """
         results = self.db.execute("SELECT file_hash, phash FROM phashed_file_queue").fetchall()
         for file_hash, perceptual_hash in tqdm(
-            results, dynamic_ncols=True, total=len(results), desc="Building vptree", unit="file", colour="BLUE"
+            results, dynamic_ncols=True, total=len(results), desc="Building search tree", unit="file", colour="BLUE"
         ):
             self.db.add_file(file_hash)
             self.db.add_perceptual_hash(perceptual_hash)
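Note on process_phashed_file_queue(): only the progress-bar line changes in
this patch, but this loop is what makes the KeyboardInterrupt handling in
deduplicate() safe. The sketch below assumes the rest of the loop body (outside
the visible hunk) keeps the existing pattern of popping each processed row from
phashed_file_queue and committing as it goes:

    for file_hash, perceptual_hash in results:
        self.db.add_file(file_hash)
        self.db.add_perceptual_hash(perceptual_hash)
        # Assumed remainder of the loop body, not shown in this hunk: drop the
        # row from the queue and commit, so an interrupt between iterations
        # never loses or repeats a file.
        self.db.execute(
            "DELETE FROM phashed_file_queue WHERE file_hash = :file_hash",
            {"file_hash": file_hash},
        )
        self.db.commit()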
+ print("[blue] Running search tree maintenance...") tree.maintain_tree() + self.db.commit() files = self.db.execute( "SELECT hash_id FROM shape_search_cache WHERE searched_distance is NULL or searched_distance < :threshold", @@ -331,7 +349,7 @@ def find_potential_duplicates( ).fetchall() with tqdm( - dynamic_ncols=True, total=len(files), desc="Finding potential duplicates", unit="video", colour="BLUE" + dynamic_ncols=True, total=len(files), desc="Finding potential duplicates", unit="file", colour="BLUE" ) as pbar: for hash_id in files: hash_id = hash_id[0]