Skip to content

Commit

Permalink
Add keyboard interrupt to building tree and search stage, move custom…
Browse files Browse the repository at this point in the history
… query to ctor,
  • Loading branch information
ianwal committed Jul 22, 2024
1 parent 7f7bee9 commit 382dab9
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 41 deletions.
5 changes: 3 additions & 2 deletions src/hydrusvideodeduplicator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,9 @@ def exit_from_failure() -> NoReturn:
db.commit()
db_stats = DedupeDB.get_db_stats(db)

deduper = HydrusVideoDeduplicator(db, client=hvdclient, job_count=job_count, failed_page_name=failed_page_name)
deduper = HydrusVideoDeduplicator(
db, client=hvdclient, job_count=job_count, failed_page_name=failed_page_name, custom_query=query
)

if debug:
deduper.hydlog.setLevel(logging.DEBUG)
Expand All @@ -181,7 +183,6 @@ def exit_from_failure() -> NoReturn:

deduper.deduplicate(
overwrite=overwrite,
custom_query=query,
skip_hashing=skip_hashing,
)

Expand Down
28 changes: 13 additions & 15 deletions src/hydrusvideodeduplicator/db/DedupeDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,6 +566,7 @@ def print_upgrade(version: str, new_version: str):
if not self.does_need_upgrade():
return

# TODO: Change 0.6.9 back to 0.7.0 before release.
if SemanticVersion(version) < SemanticVersion("0.6.9"):
print_upgrade(version, "0.6.9")

Expand Down Expand Up @@ -597,8 +598,12 @@ def print_upgrade(version: str, new_version: str):
"CREATE TABLE IF NOT EXISTS phashed_file_queue ( file_hash BLOB_BYTES NOT NULL UNIQUE, phash BLOB_BYTES NOT NULL, PRIMARY KEY ( file_hash, phash ) )" # noqa: E501
)

# Insert the files from the old videos table into the DB and the newly added vptree.
# Insert the files from the old videos table into the hash queue.
old_videos_data = []
print(
"Migrating perceptually hashed videos from the old table.\n"
                "This may take a few minutes, depending on your db length."
)
with SqliteDict(
get_db_file_path(), tablename="videos", flag="c", autocommit=False, outer_stack=False
) as videos_table:
Expand All @@ -607,21 +612,14 @@ def print_upgrade(version: str, new_version: str):
row = videos_table[video_hash]
if "perceptual_hash" in row:
old_videos_data.append((video_hash, row["perceptual_hash"]))
# TODO: Should we move the farthest search index as well?

with tqdm(
dynamic_ncols=True,
total=len(old_videos_data),
desc="Migrating phashes to vptree...",
unit="file",
colour="BLUE",
) as pbar:
for video_hash, perceptual_hash in old_videos_data:
# TODO: If these functions change this upgrade may not work! We need to be careful about updating them. # noqa: E501
# An upgrade cutoff at some point to prevent bitrot is a good idea, which is what Hydrus does.
self.add_to_phashed_files_queue(video_hash, perceptual_hash)
pbar.update(1)
# The farthest search index will not be moved.

for video_hash, perceptual_hash in old_videos_data:
# TODO: If these functions change this upgrade may not work! We need to be careful about updating them. # noqa: E501
# An upgrade cutoff at some point to prevent bitrot is a good idea, which is what Hydrus does.
self.add_to_phashed_files_queue(video_hash, perceptual_hash)

# TODO: Change 0.6.9 back to 0.7.0 before release.
self.set_version("0.6.9")
# Note: We need to keep re-running get_version so that we can progressively upgrade.
version = self.get_version()
Expand Down
66 changes: 42 additions & 24 deletions src/hydrusvideodeduplicator/dedup.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,26 +127,15 @@ def __init__(
client: HVDClient,
job_count: int = -2,
failed_page_name: str | None = None,
custom_query: Sequence[str] | None = None,
):
self.db = db
self.client = client
self.job_count = job_count
self.page_logger = None if failed_page_name is None else HydrusPageLogger(self.client, failed_page_name)
self.search_tags = self.get_search_tags(custom_query)

def deduplicate(
self,
overwrite: bool = False,
custom_query: Sequence[str] | None = None,
skip_hashing: bool = False,
) -> None:
"""
Run all deduplicate functions:
1. Retrieve video hashes
2. Calculate perceptual hashes
3. Find potential duplicates
"""

# Add perceptual hashes to video files
def get_search_tags(self, custom_query: Sequence[str] | None) -> list[str]:
# system:filetype tags are really inconsistent
search_tags = [
"system:filetype=video, gif, apng",
Expand All @@ -160,27 +149,50 @@ def deduplicate(
if len(custom_query) > 0:
search_tags.extend(custom_query)
print(f"[yellow] Custom Query: {custom_query}")
return search_tags

def deduplicate(
self,
overwrite: bool = False,
skip_hashing: bool = False,
) -> None:
"""
Run all deduplicate functions.
Dedupe Algorithm:
1. Perceptually hash the videos.
2. Insert the perceptual hashes into the vptree
3. Search for similar videos in the vptree.
4. Mark the similar videos as potential duplicates in Hydrus.
"""

if skip_hashing:
print("[yellow] Skipping perceptual hashing")
else:
video_hashes = list(self.client.get_video_hashes(search_tags))
video_hashes = list(self.client.get_video_hashes(self.search_tags))
if not overwrite:
video_hashes = self.filter_unhashed(video_hashes)
print(f"[blue] Found {len(video_hashes)} eligible files to perceptually hash.")
self.add_perceptual_hashes_to_db(video_hashes)

# Insert the perceptual hashed files into the vptree.
self.process_phashed_file_queue()
try:
self.process_phashed_file_queue()
except KeyboardInterrupt:
print("[yellow] Building the search tree was interrupted! Progress was saved.")
else:
print("[green] Finished fully building the search tree.")

# Number of potential duplicates before adding more.
# This is just to print info for the user.
# Note: This will be inaccurate if the user searches for duplicates in the Hydrus client
# while this is running.
pre_dedupe_count = self.client.get_potential_duplicate_count_hydrus()

# new:
self.find_potential_duplicates()
try:
self.find_potential_duplicates()
except KeyboardInterrupt:
print("[yellow] Searching for duplicates was interrupted! Progress was saved.")

# Statistics for user
post_dedupe_count = self.client.get_potential_duplicate_count_hydrus()
Expand Down Expand Up @@ -214,7 +226,7 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
total=len(video_hashes),
desc="Perceptually hashing files",
dynamic_ncols=True,
unit="video",
unit="file",
colour="BLUE",
) as pbar:
filehasher = FileHasher(self.client)
Expand Down Expand Up @@ -245,19 +257,22 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
pbar.update(1)

except KeyboardInterrupt:
print("[yellow] Perceptual hash processing was interrupted!")
print("[yellow] Perceptual hash processing was interrupted! Progress was saved.")

else:
print("[green] Finished perceptual hash processing.")

finally:
# Print some useful stats and info for users
total_failures = failed_from_api_errors_count + failed_from_phash_count
if total_failures > 0:
print(f"[yellow] Perceptual hash processing had {total_failures} total failed files.")

if failed_from_api_errors_count > 0:
print(
f"[yellow] {failed_from_api_errors_count} failures were due to API errors. Ensure Hydrus is running and accessible before trying again." # noqa: E501
)

if failed_from_phash_count > 0:
print(
f"[yellow] {failed_from_phash_count} failures were from an error during perceptual hashing. Are the files corrupted?" # noqa: E501
Expand All @@ -267,7 +282,7 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> None:
"creating a Hydrus page with the name 'failed' and "
"running the program with '--failed-page-name=failed'\n"
)
print(f"[green] Added {success_hash_count} new videos to the perceptual hash database.")
print(f"[green] Added {success_hash_count} new perceptual hashes to the database.")

def compare_videos(self, video1_hash: str, video2_hash: str, video1_phash: str, video2_phash: str) -> None:
"""Compare videos and mark them as potential duplicates in Hydrus if they are similar."""
Expand Down Expand Up @@ -302,7 +317,7 @@ def process_phashed_file_queue(self):
"""
results = self.db.execute("SELECT file_hash, phash FROM phashed_file_queue").fetchall()
for file_hash, perceptual_hash in tqdm(
results, dynamic_ncols=True, total=len(results), desc="Building vptree", unit="file", colour="BLUE"
results, dynamic_ncols=True, total=len(results), desc="Building search tree", unit="file", colour="BLUE"
):
self.db.add_file(file_hash)
self.db.add_perceptual_hash(perceptual_hash)
Expand All @@ -316,22 +331,25 @@ def process_phashed_file_queue(self):
def find_potential_duplicates(
self,
) -> None:
"""Find potential duplicates in the database and mark them in Hydrus."""
"""Find potential duplicates in the database and mark them as such in Hydrus."""
# TODO: Should we turn the inside of this function into a generator? It might make testing super easy.
tree = vptree.VpTreeManager(self.db)
search_threshold = vptree.fix_vpdq_similarity((self.threshold))
assert search_threshold > 0 and isinstance(search_threshold, int)

if tree.MaintenanceDue(search_threshold):
# TODO: Do further testing on this.
print("[blue] Running search tree maintenance...")
tree.maintain_tree()
self.db.commit()

files = self.db.execute(
"SELECT hash_id FROM shape_search_cache WHERE searched_distance is NULL or searched_distance < :threshold",
{"threshold": search_threshold},
).fetchall()

with tqdm(
dynamic_ncols=True, total=len(files), desc="Finding potential duplicates", unit="video", colour="BLUE"
dynamic_ncols=True, total=len(files), desc="Finding potential duplicates", unit="file", colour="BLUE"
) as pbar:
for hash_id in files:
hash_id = hash_id[0]
Expand Down

0 comments on commit 382dab9

Please sign in to comment.