diff --git a/pyproject.toml b/pyproject.toml index b131574..44b4b3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,8 +32,13 @@ dependencies = [ # Below is for vpdqpy "pillow", "pyav<12", + "hvdaccelerators @ git+https://github.com/hydrusvideodeduplicator/hvdaccelerators.git", ] +# pypi doesn't allow this, so need to remove and upload hvdaccelerators to pypi before release. +[tool.hatch.metadata] +allow-direct-references = true + [project.urls] Documentation = "https://github.com/hydrusvideodeduplicator/hydrus-video-deduplicator#readme" Issues = "https://github.com/hydrusvideodeduplicator/hydrus-video-deduplicator/issues" diff --git a/src/hydrusvideodeduplicator/dedup.py b/src/hydrusvideodeduplicator/dedup.py index 1572410..cc61227 100644 --- a/src/hydrusvideodeduplicator/dedup.py +++ b/src/hydrusvideodeduplicator/dedup.py @@ -5,7 +5,6 @@ from dataclasses import dataclass from typing import TYPE_CHECKING -from joblib import Parallel, delayed from rich import print from tqdm import tqdm @@ -14,6 +13,8 @@ FileHash = str +import gc + import hydrusvideodeduplicator.hydrus_api as hydrus_api from .client import HVDClient @@ -280,32 +281,31 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> Perceptual colour="BLUE", ) as pbar: filehasher = FileHasher(self.client) - with Parallel(n_jobs=self.job_count, return_as="generator_unordered") as parallel: - # Note: joblib actually copies the entire filehasher into a new process, including the client. - result_generator = parallel( - delayed(filehasher.fetch_and_phash_file)(video_hash) for video_hash in video_hashes - ) - for result in result_generator: - if isinstance(result, FailedPerceptuallyHashedFile): - # We only want to add the failure to the page if the file was the actual cause of failure. 
- if isinstance(result.exc, HydrusApiException): - stats.failed_from_api_errors_count += 1 - print("[red] Hydrus API error during perceptual hashing:") - print(f"{result.exc}") - else: - stats.failed_from_phash_count += 1 - print("[red] Failed to perceptually hash a file.") - print(f"Failed file SHA256 hash: {result.file_hash}") - print(f"{result.exc}") - if self.page_logger: - self.page_logger.add_failed_video(result.file_hash) + for video_hash in video_hashes: + result = filehasher.fetch_and_phash_file(video_hash) + if isinstance(result, FailedPerceptuallyHashedFile): + # We only want to add the failure to the page if the file was the actual cause of failure. + if isinstance(result.exc, HydrusApiException): + stats.failed_from_api_errors_count += 1 + print("[red] Hydrus API error during perceptual hashing:") + print(f"{result.exc}") else: - stats.success_hash_count += 1 - self.db.add_to_phashed_files_queue(result.file_hash, result.perceptual_hash) - - pbar.update(1) + stats.failed_from_phash_count += 1 + print("[red] Failed to perceptually hash a file.") + print(f"Failed file SHA256 hash: {result.file_hash}") + print(f"{result.exc}") + if self.page_logger: + self.page_logger.add_failed_video(result.file_hash) + else: + stats.success_hash_count += 1 + self.db.add_to_phashed_files_queue(result.file_hash, result.perceptual_hash) + + # Collect garbage now to avoid huge memory usage from the video files and frames. 
+ gc.collect() + pbar.update(1) except KeyboardInterrupt: raise CancelledPerceptualHashException(stats) + gc.collect() return stats def mark_videos_as_duplicates(self, video1_hash: str, video2_hash: str): diff --git a/src/hydrusvideodeduplicator/vpdqpy/vpdqpy.py b/src/hydrusvideodeduplicator/vpdqpy/vpdqpy.py index 51506f5..a723f83 100644 --- a/src/hydrusvideodeduplicator/vpdqpy/vpdqpy.py +++ b/src/hydrusvideodeduplicator/vpdqpy/vpdqpy.py @@ -8,9 +8,9 @@ from typing import TYPE_CHECKING import av +from hvdaccelerators import stuff from PIL import Image -from ..pdqhashing.hasher.pdq_hasher import PDQHasher from ..pdqhashing.pdq_types.hash256 import Hash256 if TYPE_CHECKING: @@ -161,14 +161,33 @@ def computeHash( if video is None: raise ValueError - pdq = PDQHasher() + # pdq = PDQHasher() features: VpdqHash = [] + hasher = None for second, frame in enumerate(Vpdq.frame_extract_pyav(video)): - pdq_hash_and_quality = pdq.fromBufferedImage(frame.to_image()) - pdq_frame = VpdqFeature(pdq_hash_and_quality.getHash(), pdq_hash_and_quality.getQuality(), second) - features.append(pdq_frame) - + # TODO: This uses SO MUCH memory if hashing gets behind decoding since there will be + # lots of raw frames in the queue which are HUGE. Add a max size for the queue. + # ... or I have a memory leak :( + im = frame.to_image() + im.thumbnail((512, 512)) + if not hasher: + # TODO: Fix this to get the average fps from frame_extract_pyav or a new method. + # Although this doesn't appear to actually do anything. Exact hashing tests pass... 
+ average_fps = 1 + hasher = stuff.Hasher(average_fps, im.width, im.height) + rgb_image = im.convert("RGB") + # result = stuff.hash_frame(rgb_image.tobytes(), im.width, im.height) + hasher.hash_frame(rgb_image.tobytes()) + # (pdq_hash, pdq_quality) = result + # pdq_hash = str(pdq_hash, encoding="utf-8") + # pdq_frame = VpdqFeature(Hash256.fromHexString(pdq_hash), pdq_quality, second) + # features.append(pdq_frame) + features = hasher.finish() + features = [ + VpdqFeature(Hash256.fromHexString(feature.get_hash()), feature.get_quality(), feature.get_frame_number()) + for feature in features + ] deduped_features = Vpdq.dedupe_features(features) return deduped_features