Commit

Fix missing dupe pair comparisons, add VSCode dev stuff (#59)
* Fix missing search comparisons

* Add VSCode debug launch for module, fix deprecated VSCode setting
ianwal authored Jul 13, 2024
1 parent a9ed22d commit e11a5da
Showing 3 changed files with 46 additions and 19 deletions.
17 changes: 17 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Module",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "hydrusvideodeduplicator",
+            // You may want to customize these args or use a dotenv.
+            // Below API key may need to be configured to your own. This is the testdb api key.
+            "args": "--api-key='3b3cf10cc13862818ea95ddecfe434bed0828fb319b1ff56413917b471b566ab'"
+        }
+    ]
+}
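For anyone debugging outside VSCode: a "module"-type launch configuration is roughly equivalent to running the package with `python -m`. A minimal sketch of that equivalent invocation, assuming the CLI accepts `--api-key` exactly as written in the config above:

```python
# Rough command-line equivalent of the "module" debug configuration above.
# A sketch only: it assumes the CLI takes --api-key as shown in launch.json.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "-m",
        "hydrusvideodeduplicator",
        "--api-key=3b3cf10cc13862818ea95ddecfe434bed0828fb319b1ff56413917b471b566ab",
    ],
    check=True,
)
```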
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -12,7 +12,7 @@
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.formatOnSave": true,
         "editor.codeActionsOnSave": {
-            "source.organizeImports": true
+            "source.organizeImports": "explicit"
         },
     },
 }
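For context: VSCode deprecated boolean values for entries in `editor.codeActionsOnSave`; the setting now takes `"explicit"`, `"always"`, or `"never"`. `"explicit"` runs the action only on explicit (manual) saves, which matches the old `true` behavior, hence the one-line change here.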
46 changes: 28 additions & 18 deletions src/hydrusvideodeduplicator/dedup.py
@@ -67,9 +67,9 @@ def deduplicate(
         # Add perceptual hashes to video files
         # system:filetype tags are really inconsistent
         search_tags = [
-            'system:filetype=video, gif, apng',
-            'system:has duration',
-            'system:file service is not currently in trash',
+            "system:filetype=video, gif, apng",
+            "system:has duration",
+            "system:file service is not currently in trash",
         ]
 
         if custom_query is not None:
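(Aside: the quote-style changes in this file, here and in the hunk below, are consistent with Black's string normalization; `.vscode/settings.json` above sets `ms-python.black-formatter` as the default formatter.)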
@@ -203,7 +203,7 @@ def compare_videos(self, video1_hash: str, video2_hash: str, video1_phash: str,
                 # Getting the file names will be VERY slow because of the API call
                 # file_names = get_file_names_hydrus(self.client.client, [video1_hash, video2_hash])
                 # self.hydlog.info(f"Duplicates filenames: {file_names}")
-                self.hydlog.info(f"\"Similar {similarity}%: {video1_hash}\" and \"{video2_hash}\"")
+                self.hydlog.info(f'"Similar {similarity}%: {video1_hash}" and "{video2_hash}"')
 
             self.mark_videos_as_duplicates(video1_hash, video2_hash)
@@ -232,29 +232,39 @@ def _find_potential_duplicates(
         video_counter = 0
         with SqliteDict(
             str(DedupeDB.get_db_file_path()), tablename="videos", flag="c", autocommit=True, outer_stack=False
-        ) as hashdb:
+        ) as videos_table:
             current_hash = None
             try:
-                total = len(hashdb)
+                # Make a copy of the video hashes here so we can preserve their order because SqliteDict row order
+                # changes during writes for the farthest search index. This is a bandaid solution.
+                # This assumes SqliteDict row order is preserved when opened and closed, even if it's not preserved
+                # while modifying elements.
+                video_hashes = [video_hash for video_hash in videos_table]
+                total = len(video_hashes)
 
                 with tqdm(
                     dynamic_ncols=True, total=total, desc="Finding duplicates", unit="video", colour="BLUE"
                 ) as pbar:
                     # -1 is all cores, -2 is all cores but one
                     with Parallel(n_jobs=self.job_count) as parallel:
-                        for i, video1_hash in enumerate(hashdb):
+                        for i, video1_hash in enumerate(video_hashes):
                             current_hash = video1_hash
                             video_counter += 1
                             pbar.update(1)
 
-                            row = hashdb[video1_hash]
+                            row = videos_table[video1_hash]
 
+                            # We only care about combinations of pairs, not permutations,
+                            # so start at the next unique comparison.
+                            start_index = i + 1
+
-                            # Store last furthest searched position in the database for each element
-                            # This way you only have to start searching at that place instead of at i+1 if it exists
-                            farthest_search_index = row.setdefault("farthest_search_index", i + 1)
+                            # Start at the last furthest searched position in the database for each element.
+                            # This way you only have to start searching at that place instead of at i+1, if it exists
+                            if "farthest_search_index" in row:
+                                start_index = row["farthest_search_index"]
 
-                            assert farthest_search_index <= total
-                            if farthest_search_index == total:
+                            assert start_index <= total
+                            if start_index == total:
                                 # This file has already been searched for dupes against all other videos in the DB
                                 continue

@@ -263,15 +273,15 @@
                                     video1_hash,
                                     video2_hash,
                                     row["perceptual_hash"],
-                                    hashdb[video2_hash]["perceptual_hash"],
+                                    videos_table[video2_hash]["perceptual_hash"],
                                 )
-                                for video2_hash in islice(hashdb, row["farthest_search_index"], None)
+                                for video2_hash in islice(video_hashes, start_index, None)
                             )
 
                             # Video has now been compared against all other videos for dupes,
                             # so update farthest_search_index to the current length of the table
                             row["farthest_search_index"] = total
-                            hashdb[video1_hash] = row
+                            videos_table[video1_hash] = row
 
             except KeyboardInterrupt:
                 print("[yellow] Duplicate search was interrupted!")
@@ -280,9 +290,9 @@
                 if current_hash is not None:
                     # Set the last element farthest_search_index to the end of the
                     # table since it won't get hashed because of the islice optimization
-                    row = hashdb[current_hash]
+                    row = videos_table[current_hash]
                     row["farthest_search_index"] = total
-                    hashdb[current_hash] = row
+                    videos_table[current_hash] = row
 
         # Statistics for user
         post_dedupe_count = self.client.get_potential_duplicate_count_hydrus()
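The substance of the fix: SqliteDict row order can change when rows are written back during iteration, so slicing the live table with `islice` skipped some pairs; snapshotting the keys into a list keeps indices stable. A minimal sketch of the fixed comparison pattern, with illustrative names and a plain dict standing in for the SqliteDict table (not the module's real API):

```python
# Sketch of the fixed pair-search pattern, assuming a dict-like table
# of {hash: row}. Names here are illustrative, not the real module API.
from itertools import islice


def find_pairs(videos_table: dict) -> list[tuple[str, str]]:
    # Snapshot the keys so iteration order stays stable even if rows
    # are written back to the table mid-loop (the bug this commit fixes).
    video_hashes = list(videos_table)
    total = len(video_hashes)
    pairs = []
    for i, video1_hash in enumerate(video_hashes):
        row = videos_table[video1_hash]
        # Combinations, not permutations: start at the next element,
        # or resume from the saved position if this row was searched before.
        start_index = row.get("farthest_search_index", i + 1)
        if start_index >= total:
            continue  # already compared against every other video
        for video2_hash in islice(video_hashes, start_index, None):
            pairs.append((video1_hash, video2_hash))
        # Everything is now compared; persist the resume point.
        row["farthest_search_index"] = total
        videos_table[video1_hash] = row
    return pairs


if __name__ == "__main__":
    table = {"a": {}, "b": {}, "c": {}}
    print(find_pairs(table))  # [('a', 'b'), ('a', 'c'), ('b', 'c')]
```

Running it twice on the same table yields no pairs the second time, since every row's resume index already equals the table length; that is the resumability the `farthest_search_index` bookkeeping buys.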
