Add multithreaded single file hashing with accelerators
ianwal committed Jul 27, 2024
1 parent 91e9b3f commit cf36d54
Showing 4 changed files with 50 additions and 31 deletions.
5 changes: 5 additions & 0 deletions pyproject.toml
@@ -32,8 +32,13 @@ dependencies = [
     # Below is for vpdqpy
     "pillow",
     "pyav<12",
+    "hvdaccelerators @ git+https://github.com/hydrusvideodeduplicator/hvdaccelerators.git",
 ]
 
+# PyPI doesn't allow direct references, so this must be removed and hvdaccelerators uploaded to PyPI before release.
+[tool.hatch.metadata]
+allow-direct-references = true
+
 [project.urls]
 Documentation = "https://github.com/hydrusvideodeduplicator/hydrus-video-deduplicator#readme"
 Issues = "https://github.com/hydrusvideodeduplicator/hydrus-video-deduplicator/issues"
2 changes: 1 addition & 1 deletion src/hydrusvideodeduplicator/__about__.py
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: MIT
-__version__ = "0.6.9"
+__version__ = "0.7.0"
43 changes: 19 additions & 24 deletions src/hydrusvideodeduplicator/dedup.py
@@ -5,7 +5,6 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from joblib import Parallel, delayed
 from rich import print
 from tqdm import tqdm

@@ -280,30 +279,26 @@ def add_perceptual_hashes_to_db(self, video_hashes: Sequence[str]) -> Perceptual
                 colour="BLUE",
             ) as pbar:
                 filehasher = FileHasher(self.client)
-                with Parallel(n_jobs=self.job_count, return_as="generator_unordered") as parallel:
-                    # Note: joblib actually copies the entire filehasher into a new process, including the client.
-                    result_generator = parallel(
-                        delayed(filehasher.fetch_and_phash_file)(video_hash) for video_hash in video_hashes
-                    )
-                    for result in result_generator:
-                        if isinstance(result, FailedPerceptuallyHashedFile):
-                            # We only want to add the failure to the page if the file was the actual cause of failure.
-                            if isinstance(result.exc, HydrusApiException):
-                                stats.failed_from_api_errors_count += 1
-                                print("[red] Hydrus API error during perceptual hashing:")
-                                print(f"{result.exc}")
-                            else:
-                                stats.failed_from_phash_count += 1
-                                print("[red] Failed to perceptually hash a file.")
-                                print(f"Failed file SHA256 hash: {result.file_hash}")
-                                print(f"{result.exc}")
-                            if self.page_logger:
-                                self.page_logger.add_failed_video(result.file_hash)
-                        else:
-                            stats.success_hash_count += 1
-                            self.db.add_to_phashed_files_queue(result.file_hash, result.perceptual_hash)
+                for video_hash in video_hashes:
+                    result = filehasher.fetch_and_phash_file(video_hash)
+                    if isinstance(result, FailedPerceptuallyHashedFile):
+                        # We only want to add the failure to the page if the file was the actual cause of failure.
+                        if isinstance(result.exc, HydrusApiException):
+                            stats.failed_from_api_errors_count += 1
+                            print("[red] Hydrus API error during perceptual hashing:")
+                            print(f"{result.exc}")
+                        else:
+                            stats.failed_from_phash_count += 1
+                            print("[red] Failed to perceptually hash a file.")
+                            print(f"Failed file SHA256 hash: {result.file_hash}")
+                            print(f"{result.exc}")
+                        if self.page_logger:
+                            self.page_logger.add_failed_video(result.file_hash)
+                    else:
+                        stats.success_hash_count += 1
+                        self.db.add_to_phashed_files_queue(result.file_hash, result.perceptual_hash)
 
-                        pbar.update(1)
+                    pbar.update(1)
         except KeyboardInterrupt:
             raise CancelledPerceptualHashException(stats)
         return stats
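This change drops joblib's process-level parallelism, which (per the removed comment) pickled the entire FileHasher, client included, into every worker, in favor of a plain sequential loop over files; the multithreading named in the commit title presumably lives inside the native hvdaccelerators hasher instead. A minimal sketch of that shape, assuming a hypothetical ThreadedHasher in place of the real API:

# Minimal sketch, not the project's API: the outer loop stays sequential
# while each file's hashing is parallelized internally across threads.
from concurrent.futures import ThreadPoolExecutor
from hashlib import sha256

class ThreadedHasher:
    """Hypothetical stand-in for a natively multithreaded per-file hasher."""

    def __init__(self, num_threads: int = 4) -> None:
        self._pool = ThreadPoolExecutor(max_workers=num_threads)

    def hash_file(self, data: bytes, chunk_size: int = 1 << 20) -> list[str]:
        # Hash fixed-size chunks on worker threads; map() keeps input order.
        chunks = [data[i : i + chunk_size] for i in range(0, len(data), chunk_size)]
        return list(self._pool.map(lambda c: sha256(c).hexdigest(), chunks))

hasher = ThreadedHasher()
for blob in (b"file one", b"file two"):  # sequential outer loop, as in dedup.py above
    print(hasher.hash_file(blob))

One upside of this layout is that nothing needs to be pickled across process boundaries: the client and database handles stay in the single Python process.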
31 changes: 25 additions & 6 deletions src/hydrusvideodeduplicator/vpdqpy/vpdqpy.py
@@ -8,9 +8,9 @@
 from typing import TYPE_CHECKING
 
 import av
+from hvdaccelerators import stuff
 from PIL import Image
 
-from ..pdqhashing.hasher.pdq_hasher import PDQHasher
 from ..pdqhashing.pdq_types.hash256 import Hash256
 
 if TYPE_CHECKING:
@@ -161,14 +161,33 @@ def computeHash(
         if video is None:
             raise ValueError
 
-        pdq = PDQHasher()
+        # pdq = PDQHasher()
         features: VpdqHash = []
 
+        hasher = None
         for second, frame in enumerate(Vpdq.frame_extract_pyav(video)):
-            pdq_hash_and_quality = pdq.fromBufferedImage(frame.to_image())
-            pdq_frame = VpdqFeature(pdq_hash_and_quality.getHash(), pdq_hash_and_quality.getQuality(), second)
-            features.append(pdq_frame)
-
+            # TODO: This uses SO MUCH memory if hashing gets behind decoding, since there will be
+            # lots of raw frames in the queue, which are HUGE. Add a max size for the queue.
+            # ... or I have a memory leak :(
+            im = frame.to_image()
+            im.thumbnail((512, 512))
+            if not hasher:
+                # TODO: Fix this to get the average fps from frame_extract_pyav or a new method.
+                # Although this doesn't appear to actually do anything. Exact hashing tests pass...
+                average_fps = 1
+                hasher = stuff.Hasher(average_fps, im.width, im.height)
+            rgb_image = im.convert("RGB")
+            # result = stuff.hash_frame(rgb_image.tobytes(), im.width, im.height)
+            hasher.hash_frame(rgb_image.tobytes())
+            # (pdq_hash, pdq_quality) = result
+            # pdq_hash = str(pdq_hash, encoding="utf-8")
+            # pdq_frame = VpdqFeature(Hash256.fromHexString(pdq_hash), pdq_quality, second)
+            # features.append(pdq_frame)
+        features = hasher.finish()
+        features = [
+            VpdqFeature(Hash256.fromHexString(feature.get_hash()), feature.get_quality(), feature.get_frame_number())
+            for feature in features
+        ]
         deduped_features = Vpdq.dedupe_features(features)
         return deduped_features
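The loop above replaces the pure-Python PDQHasher with the native stuff.Hasher, feeding it downscaled raw RGB bytes per frame. A standalone sketch of just the Pillow preprocessing step (preprocess_frame is illustrative, not part of the codebase):

# Pillow-only sketch of the per-frame preprocessing performed above.
from PIL import Image

def preprocess_frame(im: Image.Image) -> bytes:
    # thumbnail() downscales in place and preserves aspect ratio, so the
    # result fits within 512x512 but is usually not square.
    im.thumbnail((512, 512))
    # Raw 8-bit RGB: exactly width * height * 3 bytes, the layout that
    # hash_frame() receives via rgb_image.tobytes() above.
    return im.convert("RGB").tobytes()

frame = Image.new("RGB", (1920, 1080))  # stand-in for frame.to_image()
data = preprocess_frame(frame)
assert len(data) == 512 * 288 * 3  # 1920x1080 scaled down to 512x288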

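The memory TODO above could be addressed with a bounded queue between decoding and hashing, so the decoder blocks instead of piling up raw frames. A hedged sketch of that fix; the names and the threading split are assumptions, not the project's design:

# Sketch: cap the number of raw frames held in memory at once.
import queue
import threading

frames: queue.Queue = queue.Queue(maxsize=8)  # decoder blocks when 8 frames are pending

def decode() -> None:
    for _ in range(100):              # stand-in for the PyAV frame loop
        frames.put(b"\x00" * (512 * 288 * 3))
    frames.put(None)                  # sentinel: decoding finished

def consume() -> None:
    while (frame := frames.get()) is not None:
        pass                          # hasher.hash_frame(frame) would go here

decoder = threading.Thread(target=decode)
decoder.start()
consume()
decoder.join()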
