Skip to content

Commit

Permalink
pdq utils
Browse files Browse the repository at this point in the history
  • Loading branch information
16BitNarwhal committed Dec 13, 2024
1 parent 96a608b commit 4ced269
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 51 deletions.
6 changes: 4 additions & 2 deletions pdq/cpp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ LIBHDRS=\
io/pdqio.h \
index/mih.h \
hashing/torben.h \
./CImg.h
./CImg.h \
common/pdqutils.h

LIBSRCS=\
common/pdqhashtypes.cpp \
Expand All @@ -41,7 +42,8 @@ LIBSRCS=\
hashing/pdqhashing.cpp \
downscaling/downscaling.cpp \
io/pdqio.cpp \
hashing/torben.cpp
hashing/torben.cpp \
common/pdqutils.cpp

MAINS=\
pdq-photo-hasher \
Expand Down
55 changes: 6 additions & 49 deletions pdq/cpp/bin/benchmark-query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <string.h>
#include <pdq/cpp/index/mih.h>
#include <pdq/cpp/io/hashio.h>
#include <pdq/cpp/common/pdqutils.h>

#include <algorithm>
#include <chrono>
Expand Down Expand Up @@ -36,13 +37,6 @@ static void queryMIH(
const std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>>&
index);

// Helper declarations
static facebook::pdq::hashing::Hash256 generateRandomHash(std::mt19937& gen);
static facebook::pdq::hashing::Hash256 addNoise(
const facebook::pdq::hashing::Hash256& original,
int numBitsToFlip,
std::mt19937& gen);

// ----------------------------------------------------------------
int main(int argc, char** argv) {
if (argc > 1 && (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help"))) {
Expand Down Expand Up @@ -145,22 +139,22 @@ static void query(char* argv0, int argc, char** argv) {
// Generate random hashes for queries
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> queries;
for (size_t i = 0; i < querySize; i++) {
auto hash = generateRandomHash(gen);
auto hash = facebook::pdq::hashing::generateRandomHash(gen);
queries.push_back({hash, "query_" + std::to_string(i)});
}

// Generate random hashes for index
std::vector<std::pair<facebook::pdq::hashing::Hash256, std::string>> index;
for (size_t i = 0; i < indexSize - querySize; i++) {
auto hash = generateRandomHash(gen);
auto hash = facebook::pdq::hashing::generateRandomHash(gen);
index.push_back({hash, "index_" + std::to_string(i)});
}

// Add noise to queries then insert into index
std::uniform_int_distribution<int> noiseDist(1, maxDistance);
for (const auto& query : queries) {
int bitsToFlip = noiseDist(gen);
auto noisyHash = addNoise(query.first, bitsToFlip, gen);
auto noisyHash = facebook::pdq::hashing::addNoise(query.first, bitsToFlip, gen);
index.push_back({noisyHash, "index_noisy_" + query.second});
}
std::shuffle(index.begin(), index.end(), gen);
Expand All @@ -184,10 +178,6 @@ static void query(char* argv0, int argc, char** argv) {
maxDistance, verbose, seed, indexSize, querySize, queries, index);
} else if (method == "mih") {
queryMIH(maxDistance, verbose, seed, indexSize, querySize, queries, index);
} else {
fprintf(stderr, "Unknown method: %s\n", method.c_str());
usage(argv0, 1);
return;
}
}

Expand Down Expand Up @@ -223,7 +213,7 @@ static void queryLinear(
elapsedSeconds = t2 - t1;
double seconds = elapsedSeconds.count();

printf("METHOD: Linear query\n");
printf("METHOD: Linear query\n"); // TODO: don't make people rewrite this a bunch of times
printf("QUERY COUNT: %d\n", (int)queries.size());
printf("INDEX COUNT: %d\n", (int)index.size());
printf("TOTAL MATCH COUNT: %d\n", (int)matches.size());
Expand All @@ -234,7 +224,7 @@ static void queryLinear(
printf("\n");
}

static void queryMIH(
static void queryMIH( // TODO: pull timing out of func / query? pull index out somehow?
const int maxDistance,
const bool verbose,
const unsigned int seed,
Expand Down Expand Up @@ -288,36 +278,3 @@ static void queryMIH(
querySize > 0 ? seconds / querySize : 0);
printf("\n");
}

//////////////////////////
//// Helper Functions ////
//////////////////////////

// Build a Hash256 whose words are all drawn from the supplied generator.
//
// Every one of the HASH256_NUM_WORDS words is sampled independently and
// uniformly from [0, UINT16_MAX], so the output is fully determined by the
// state of `gen` on entry (and `gen` is advanced as a side effect).
static facebook::pdq::hashing::Hash256 generateRandomHash(std::mt19937& gen) {
  using facebook::pdq::hashing::HASH256_NUM_WORDS;

  std::uniform_int_distribution<uint16_t> wordDist(0, UINT16_MAX);

  facebook::pdq::hashing::Hash256 result;
  for (int word = 0; word < HASH256_NUM_WORDS; ++word) {
    result.w[word] = wordDist(gen);
  }
  return result;
}

// Add noise to a hash by flipping randomly chosen bits.
//
// Returns a copy of `original` in which `numBitsToFlip` distinct bit
// positions, selected by shuffling all 256 positions with `gen`, are inverted.
//
// `numBitsToFlip` is clamped to [0, 256]: a hash only has 256 bits, and the
// unclamped loop would otherwise index past the end of the shuffled position
// table when a caller passes a larger value (e.g. a user-supplied maximum
// distance above 256). Zero or negative values return an unmodified copy.
//
// `gen` is always advanced by one full shuffle, regardless of how many bits
// end up being flipped.
static facebook::pdq::hashing::Hash256 addNoise(
    const facebook::pdq::hashing::Hash256& original,
    int numBitsToFlip,
    std::mt19937& gen) {
  const int kNumBits = 256;
  const int kBitsPerWord = 16;

  // Bound the request so the loop below never reads outside bitIndices.
  const int flipCount = std::max(0, std::min(numBitsToFlip, kNumBits));

  facebook::pdq::hashing::Hash256 noisy = original;

  // A shuffled list of every bit position; taking a prefix of it yields
  // distinct positions, so no bit is flipped twice (which would cancel out).
  std::vector<int> bitIndices(kNumBits);
  for (int i = 0; i < kNumBits; i++) {
    bitIndices[i] = i;
  }
  std::shuffle(bitIndices.begin(), bitIndices.end(), gen);

  for (int i = 0; i < flipCount; i++) {
    const int bitIndex = bitIndices[i];
    const int wordIndex = bitIndex / kBitsPerWord;
    const int position = bitIndex % kBitsPerWord;
    noisy.w[wordIndex] ^= (1 << position);
  }
  return noisy;
}
41 changes: 41 additions & 0 deletions pdq/cpp/common/pdqutils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include <random>
#include <vector>
#include <pdq/cpp/common/pdqutils.h>
#include <algorithm>

namespace facebook {
namespace pdq {
namespace hashing {

// Produce a Hash256 filled with random words.
//
// Each of the HASH256_NUM_WORDS words is drawn independently from a uniform
// distribution over [0, UINT16_MAX] using `gen`, which is advanced as a side
// effect. The same generator state therefore always yields the same hash.
Hash256 generateRandomHash(std::mt19937& gen) {
  std::uniform_int_distribution<uint16_t> wordDist(0, UINT16_MAX);

  Hash256 randomHash;
  for (int wordIdx = 0; wordIdx < HASH256_NUM_WORDS; ++wordIdx) {
    randomHash.w[wordIdx] = wordDist(gen);
  }
  return randomHash;
}

// Add noise to a hash by flipping randomly chosen bits.
//
// Returns a copy of `original` in which `numBitsToFlip` distinct bit
// positions, selected by shuffling all 256 positions with `gen`, are inverted.
//
// `numBitsToFlip` is clamped to [0, 256]: a hash only has 256 bits, and the
// unclamped loop would otherwise index past the end of the shuffled position
// table when a caller passes a larger value. Zero or negative values return
// an unmodified copy.
//
// `gen` is always advanced by one full shuffle, regardless of how many bits
// end up being flipped.
Hash256 addNoise(
    const Hash256& original,
    int numBitsToFlip,
    std::mt19937& gen) {
  const int kNumBits = 256;
  const int kBitsPerWord = 16;

  // Bound the request so the loop below never reads outside bitIndices.
  const int flipCount = std::max(0, std::min(numBitsToFlip, kNumBits));

  Hash256 noisy = original;

  // A shuffled list of every bit position; taking a prefix of it yields
  // distinct positions, so no bit is flipped twice (which would cancel out).
  std::vector<int> bitIndices(kNumBits);
  for (int i = 0; i < kNumBits; i++) {
    bitIndices[i] = i;
  }
  std::shuffle(bitIndices.begin(), bitIndices.end(), gen);

  for (int i = 0; i < flipCount; i++) {
    const int bitIndex = bitIndices[i];
    const int wordIndex = bitIndex / kBitsPerWord;
    const int position = bitIndex % kBitsPerWord;
    noisy.w[wordIndex] ^= (1 << position);
  }
  return noisy;
}

} // namespace hashing
} // namespace pdq
} // namespace facebook
25 changes: 25 additions & 0 deletions pdq/cpp/common/pdqutils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef PDQ_UTILS_H
#define PDQ_UTILS_H

#include <pdq/cpp/common/pdqhashtypes.h>
#include <random>
#include <vector>

namespace facebook {
namespace pdq {
namespace hashing {

// Generate a random hash.
//
// Fills every word of the returned Hash256 with a value drawn uniformly from
// [0, UINT16_MAX] using `gen`. The generator is passed by reference and is
// advanced by the call, so seeding it makes the output reproducible.
Hash256 generateRandomHash(std::mt19937& gen);

// Add noise to a hash by flipping random bits.
//
// Returns a copy of `original` with `numBitsToFlip` distinct bit positions
// inverted; `original` itself is not modified. Positions are chosen by
// shuffling all 256 bit indices with `gen`, which is advanced by the call.
//
// `numBitsToFlip` is expected to lie in [0, 256] (the number of bits in the
// hash); callers should not pass a larger value.
Hash256 addNoise(
const Hash256& original,
int numBitsToFlip,
std::mt19937& gen);

} // namespace hashing
} // namespace pdq
} // namespace facebook

#endif

0 comments on commit 4ced269

Please sign in to comment.