Skip to content

Commit

Permalink
Prototyping and testing of Bloom filter changes proposed in issue 4120
Browse files Browse the repository at this point in the history
  • Loading branch information
pdillinger committed Jul 12, 2018
1 parent d7fcb4a commit 90d1ab9
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 20 deletions.
26 changes: 21 additions & 5 deletions util/bloom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

// Experiment toggles for the Bloom filter probing changes prototyped in this
// file. Each one is consulted by both the filter-building code and the
// matching filter-querying code, which apply the same transformation to the
// hash before and during probing.
//
// IMPL1_* toggles are used by the probing loops inside class
// BloomFilterPolicy.
//
// IMPL1_ENH: when nonzero, the probe index j is added to delta at the start of
// every probe iteration, so the step between successive probes grows instead
// of staying constant.
#define PETERD_FIX_IMPL1_ENH 1
// IMPL1_XOR: when nonzero, h is XOR-ed with 0x6740bca3 after delta has been
// derived from the original h and before the first probe.
#define PETERD_FIX_IMPL1_XOR 1

// IMPL2_* toggles are used by FullFilterBitsBuilder::AddHash and
// FullFilterBitsReader::HashMayMatch.
//
// IMPL2_ODD: this value is OR-ed into the initial delta; 1 sets the low bit
// (forcing delta to be odd), 0 leaves delta unchanged.
#define PETERD_FIX_IMPL2_ODD 0
// IMPL2_ENH: when nonzero, the probe index i is added to delta at the start of
// every probe iteration.
#define PETERD_FIX_IMPL2_ENH 1
// IMPL2_XOR: when nonzero, h is XOR-ed with 0x6740bca3 before the cache-line
// offset b is computed from it, and XOR-ed again with 0x41fc0926 afterwards,
// before the in-line probes.
#define PETERD_FIX_IMPL2_XOR 1

#include "rocksdb/filter_policy.h"

#include "rocksdb/slice.h"
Expand Down Expand Up @@ -124,10 +131,13 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data,
#endif
assert(num_lines > 0 && total_bits > 0);

const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
uint32_t delta = (h >> 17) | (h << 15) | PETERD_FIX_IMPL2_ODD; // Rotate right 17 bits
if (PETERD_FIX_IMPL2_XOR) { h ^= 0x6740bca3; }
uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8);
if (PETERD_FIX_IMPL2_XOR) { h ^= 0x41fc0926; }

for (uint32_t i = 0; i < num_probes_; ++i) {
if (PETERD_FIX_IMPL2_ENH) { delta += i; }
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
// to a simple AND operation by the compiler.
const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8));
Expand Down Expand Up @@ -226,12 +236,15 @@ bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash,
const char* data = filter.data();

uint32_t h = hash;
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
uint32_t delta = (h >> 17) | (h << 15) | PETERD_FIX_IMPL2_ODD; // Rotate right 17 bits
if (PETERD_FIX_IMPL2_XOR) { h ^= 0x6740bca3; }
uint32_t b = (h % num_lines) * (cache_line_size * 8);
if (PETERD_FIX_IMPL2_XOR) { h ^= 0x41fc0926; }
PREFETCH(&data[b / 8], 0 /* rw */, 1 /* locality */);
PREFETCH(&data[b / 8 + cache_line_size - 1], 0 /* rw */, 1 /* locality */);

for (uint32_t i = 0; i < num_probes; ++i) {
if (PETERD_FIX_IMPL2_ENH) { delta += i; }
// Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized
// to a simple AND operation by the compiler.
const uint32_t bitpos = b + (h % (cache_line_size * 8));
Expand Down Expand Up @@ -272,7 +285,6 @@ class BloomFilterPolicy : public FilterPolicy {

size_t bytes = (bits + 7) / 8;
bits = bytes * 8;

const size_t init_size = dst->size();
dst->resize(init_size + bytes, 0);
dst->push_back(static_cast<char>(num_probes_)); // Remember # of probes
Expand All @@ -281,8 +293,10 @@ class BloomFilterPolicy : public FilterPolicy {
// Use double-hashing to generate a sequence of hash values.
// See analysis in [Kirsch,Mitzenmacher 2006].
uint32_t h = hash_func_(keys[i]);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (PETERD_FIX_IMPL1_XOR) { h ^= 0x6740bca3; }
for (size_t j = 0; j < num_probes_; j++) {
if (PETERD_FIX_IMPL1_ENH) { delta += j; }
const uint32_t bitpos = h % bits;
array[bitpos/8] |= (1 << (bitpos % 8));
h += delta;
Expand All @@ -308,8 +322,10 @@ class BloomFilterPolicy : public FilterPolicy {
}

uint32_t h = hash_func_(key);
const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits
if (PETERD_FIX_IMPL1_XOR) { h ^= 0x6740bca3; }
for (size_t j = 0; j < k; j++) {
if (PETERD_FIX_IMPL1_ENH) { delta += j; }
const uint32_t bitpos = h % bits;
if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false;
h += delta;
Expand Down
67 changes: 52 additions & 15 deletions util/bloom_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ DEFINE_int32(bits_per_key, 10, "");

namespace rocksdb {

// Scale of the false-positive-rate tests in this file.
//
// BLOOM_TEST_THOROUGH defaults to 1 (large, slow run). Set it to 0, either
// here or with -DBLOOM_TEST_THOROUGH=0 on the compiler command line, to get
// the small, fast configuration. The value is tested with #if rather than
// #ifdef so that defining the macro as 0 really does disable thorough mode.
//
//   bloom_test_max_length: largest number of keys added to a single filter by
//                          the VaryingLengths / FullVaryingLengths loops.
//   bloom_test_fp_probes:  number of never-added keys queried by
//                          FalsePositiveRate() to estimate the FP rate.
#ifndef BLOOM_TEST_THOROUGH
#define BLOOM_TEST_THOROUGH 1
#endif
#if BLOOM_TEST_THOROUGH
static const int bloom_test_max_length = 10000000;
static const int bloom_test_fp_probes = 100000;
#else
static const int bloom_test_max_length = 10000;
static const int bloom_test_fp_probes = 10000;
#endif

static const int kVerbose = 1;

static Slice Key(int i, char* buffer) {
Expand All @@ -47,8 +56,14 @@ static int NextLength(int length) {
length += 10;
} else if (length < 1000) {
length += 100;
} else {
} else if (length < 10000) {
length += 1000;
} else if (length < 100000) {
length += 10000;
} else if (length < 1000000) {
length += 100000;
} else {
length += 1000000;
}
return length;
}
Expand Down Expand Up @@ -113,12 +128,12 @@ class BloomTest : public testing::Test {
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < bloom_test_fp_probes; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
return result / (double)bloom_test_fp_probes;
}
};

Expand All @@ -142,24 +157,34 @@ TEST_F(BloomTest, VaryingLengths) {
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
double rate_sum = 0.0;
int rate_sum_samples = 0;
double max_rate = 0.0;

for (int length = 1; length <= 10000; length = NextLength(length)) {
for (int length = 1; length <= bloom_test_max_length; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
// For more independent results for various lengths, use different
// input keys between them (length * 10 + ...)
Add(Key(length * 10 + i, buffer));
}
Build();

ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 40)) << length;

// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
ASSERT_TRUE(Matches(Key(length * 10 + i, buffer)))
<< "Length " << length << "; key " << i;
}

// Check false positive rate
double rate = FalsePositiveRate();
if (length >= 1000) {
rate_sum += rate;
rate_sum_samples++;
}
if (rate > max_rate) { max_rate = rate; }
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
Expand All @@ -169,8 +194,9 @@ TEST_F(BloomTest, VaryingLengths) {
else good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
fprintf(stderr, "Filters: %d good, %d mediocre (avg=%5.2f%% (l>=1000), max=%5.2f%%)\n",
good_filters, mediocre_filters,
rate_sum / rate_sum_samples * 100.0, max_rate * 100.0);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
Expand Down Expand Up @@ -231,12 +257,12 @@ class FullBloomTest : public testing::Test {
double FalsePositiveRate() {
char buffer[sizeof(int)];
int result = 0;
for (int i = 0; i < 10000; i++) {
for (int i = 0; i < bloom_test_fp_probes; i++) {
if (Matches(Key(i + 1000000000, buffer))) {
result++;
}
}
return result / 10000.0;
return result / (double)bloom_test_fp_probes;
}
};

Expand Down Expand Up @@ -274,24 +300,34 @@ TEST_F(FullBloomTest, FullVaryingLengths) {
// Count number of filters that significantly exceed the false positive rate
int mediocre_filters = 0;
int good_filters = 0;
double rate_sum = 0.0;
int rate_sum_samples = 0;
double max_rate = 0.0;

for (int length = 1; length <= 10000; length = NextLength(length)) {
for (int length = 1; length <= bloom_test_max_length; length = NextLength(length)) {
Reset();
for (int i = 0; i < length; i++) {
Add(Key(i, buffer));
// For more independent results for various lengths, use different
// input keys between them (length * 10 + ...)
Add(Key(length * 10 + i, buffer));
}
Build();

ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length;

// All added keys must match
for (int i = 0; i < length; i++) {
ASSERT_TRUE(Matches(Key(i, buffer)))
ASSERT_TRUE(Matches(Key(length * 10 + i, buffer)))
<< "Length " << length << "; key " << i;
}

// Check false positive rate
double rate = FalsePositiveRate();
if (length >= 1000) {
rate_sum += rate;
rate_sum_samples++;
}
if (rate > max_rate) { max_rate = rate; }
if (kVerbose >= 1) {
fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n",
rate*100.0, length, static_cast<int>(FilterSize()));
Expand All @@ -303,8 +339,9 @@ TEST_F(FullBloomTest, FullVaryingLengths) {
good_filters++;
}
if (kVerbose >= 1) {
fprintf(stderr, "Filters: %d good, %d mediocre\n",
good_filters, mediocre_filters);
fprintf(stderr, "Filters: %d good, %d mediocre (avg=%5.2f%% (l>=1000), max=%5.2f%%)\n",
good_filters, mediocre_filters,
rate_sum / rate_sum_samples * 100.0, max_rate * 100.0);
}
ASSERT_LE(mediocre_filters, good_filters/5);
}
Expand Down

0 comments on commit 90d1ab9

Please sign in to comment.