Commit

Fix missing dupe pair comparisons, add VSCode dev stuff (#59)
* Fix missing search comparisons

* Add VSCode debug launch for module, fix deprecated VSCode setting
ianwal authored Jul 13, 2024
1 parent a9ed22d commit e11a5da
Showing 3 changed files with 46 additions and 19 deletions.
17 changes: 17 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,17 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Module",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "hydrusvideodeduplicator",
+            // You may want to customize these args or use a dotenv.
+            // Below API key may need to be configured to your own. This is the testdb api key.
+            "args": "--api-key='3b3cf10cc13862818ea95ddecfe434bed0828fb319b1ff56413917b471b566ab'"
+        }
+    ]
+}
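For anyone debugging outside VSCode: a "module"-type launch configuration is roughly equivalent to running the package with `python -m`. A minimal sketch of that equivalent invocation, assuming the CLI accepts `--api-key` exactly as written in the config above:

```python
# Rough command-line equivalent of the "module" debug configuration above.
# A sketch only: it assumes the CLI takes --api-key as shown in launch.json.
import subprocess
import sys

subprocess.run(
    [
        sys.executable,
        "-m",
        "hydrusvideodeduplicator",
        "--api-key=3b3cf10cc13862818ea95ddecfe434bed0828fb319b1ff56413917b471b566ab",
    ],
    check=True,
)
```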
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -12,7 +12,7 @@
         "editor.defaultFormatter": "ms-python.black-formatter",
         "editor.formatOnSave": true,
         "editor.codeActionsOnSave": {
-            "source.organizeImports": true
+            "source.organizeImports": "explicit"
         },
     },
 }
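For context: VSCode deprecated boolean values for entries in `editor.codeActionsOnSave`; the setting now takes `"explicit"`, `"always"`, or `"never"`. `"explicit"` runs the action only on explicit (manual) saves, which matches the old `true` behavior, hence the one-line change here.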
46 changes: 28 additions & 18 deletions src/hydrusvideodeduplicator/dedup.py
@@ -67,9 +67,9 @@ def deduplicate(
         # Add perceptual hashes to video files
         # system:filetype tags are really inconsistent
         search_tags = [
-            'system:filetype=video, gif, apng',
-            'system:has duration',
-            'system:file service is not currently in trash',
+            "system:filetype=video, gif, apng",
+            "system:has duration",
+            "system:file service is not currently in trash",
         ]
 
         if custom_query is not None:
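(Aside: the quote-style changes in this file, here and in the hunk below, are consistent with Black's string normalization; `.vscode/settings.json` above sets `ms-python.black-formatter` as the default formatter.)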
@@ -203,7 +203,7 @@ def compare_videos(self, video1_hash: str, video2_hash: str, video1_phash: str,
                 # Getting the file names will be VERY slow because of the API call
                 # file_names = get_file_names_hydrus(self.client.client, [video1_hash, video2_hash])
                 # self.hydlog.info(f"Duplicates filenames: {file_names}")
-                self.hydlog.info(f"\"Similar {similarity}%: {video1_hash}\" and \"{video2_hash}\"")
+                self.hydlog.info(f'"Similar {similarity}%: {video1_hash}" and "{video2_hash}"')
 
             self.mark_videos_as_duplicates(video1_hash, video2_hash)
@@ -232,29 +232,39 @@ def _find_potential_duplicates(
         video_counter = 0
         with SqliteDict(
             str(DedupeDB.get_db_file_path()), tablename="videos", flag="c", autocommit=True, outer_stack=False
-        ) as hashdb:
+        ) as videos_table:
             current_hash = None
             try:
-                total = len(hashdb)
+                # Make a copy of the video hashes here so we can preserve their order because SqliteDict row order
+                # changes during writes for the farthest search index. This is a bandaid solution.
+                # This assumes SqliteDict row order is preserved when opened and closed, even if it's not preserved
+                # while modifying elements.
+                video_hashes = [video_hash for video_hash in videos_table]
+                total = len(video_hashes)
 
                 with tqdm(
                     dynamic_ncols=True, total=total, desc="Finding duplicates", unit="video", colour="BLUE"
                 ) as pbar:
                     # -1 is all cores, -2 is all cores but one
                     with Parallel(n_jobs=self.job_count) as parallel:
-                        for i, video1_hash in enumerate(hashdb):
+                        for i, video1_hash in enumerate(video_hashes):
                             current_hash = video1_hash
                             video_counter += 1
                             pbar.update(1)
 
-                            row = hashdb[video1_hash]
+                            row = videos_table[video1_hash]
 
+                            # We only care about combinations of pairs, not permutations,
+                            # so start at the next unique comparison.
+                            start_index = i + 1
+
-                            # Store last furthest searched position in the database for each element
-                            # This way you only have to start searching at that place instead of at i+1 if it exists
-                            farthest_search_index = row.setdefault("farthest_search_index", i + 1)
+                            # Start at the last furthest searched position in the database for each element.
+                            # This way you only have to start searching at that place instead of at i+1, if it exists
+                            if "farthest_search_index" in row:
+                                start_index = row["farthest_search_index"]
 
-                            assert farthest_search_index <= total
-                            if farthest_search_index == total:
+                            assert start_index <= total
+                            if start_index == total:
                                 # This file has already been searched for dupes against all other videos in the DB
                                 continue

@@ -263,15 +273,15 @@
                                     video1_hash,
                                     video2_hash,
                                     row["perceptual_hash"],
-                                    hashdb[video2_hash]["perceptual_hash"],
+                                    videos_table[video2_hash]["perceptual_hash"],
                                 )
-                                for video2_hash in islice(hashdb, row["farthest_search_index"], None)
+                                for video2_hash in islice(video_hashes, start_index, None)
                             )
 
                             # Video has now been compared against all other videos for dupes,
                             # so update farthest_search_index to the current length of the table
                             row["farthest_search_index"] = total
-                            hashdb[video1_hash] = row
+                            videos_table[video1_hash] = row
 
             except KeyboardInterrupt:
                 print("[yellow] Duplicate search was interrupted!")
@@ -280,9 +290,9 @@
                 if current_hash is not None:
                     # Set the last element farthest_search_index to the end of the
                     # table since it won't get hashed because of the islice optimization
-                    row = hashdb[current_hash]
+                    row = videos_table[current_hash]
                     row["farthest_search_index"] = total
-                    hashdb[current_hash] = row
+                    videos_table[current_hash] = row
 
         # Statistics for user
         post_dedupe_count = self.client.get_potential_duplicate_count_hydrus()
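The substance of the fix: SqliteDict row order can change when rows are written back during iteration, so slicing the live table with `islice` skipped some pairs; snapshotting the keys into a list keeps indices stable. A minimal sketch of the fixed comparison pattern, with illustrative names and a plain dict standing in for the SqliteDict table (not the module's real API):

```python
# Sketch of the fixed pair-search pattern, assuming a dict-like table
# of {hash: row}. Names here are illustrative, not the real module API.
from itertools import islice


def find_pairs(videos_table: dict) -> list[tuple[str, str]]:
    # Snapshot the keys so iteration order stays stable even if rows
    # are written back to the table mid-loop (the bug this commit fixes).
    video_hashes = list(videos_table)
    total = len(video_hashes)
    pairs = []
    for i, video1_hash in enumerate(video_hashes):
        row = videos_table[video1_hash]
        # Combinations, not permutations: start at the next element,
        # or resume from the saved position if this row was searched before.
        start_index = row.get("farthest_search_index", i + 1)
        if start_index >= total:
            continue  # already compared against every other video
        for video2_hash in islice(video_hashes, start_index, None):
            pairs.append((video1_hash, video2_hash))
        # Everything is now compared; persist the resume point.
        row["farthest_search_index"] = total
        videos_table[video1_hash] = row
    return pairs


if __name__ == "__main__":
    table = {"a": {}, "b": {}, "c": {}}
    print(find_pairs(table))  # [('a', 'b'), ('a', 'c'), ('b', 'c')]
```

Running it twice on the same table yields no pairs the second time, since every row's resume index already equals the table length; that is the resumability the `farthest_search_index` bookkeeping buys.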
