diff --git a/util/bloom.cc b/util/bloom.cc index 51de07953d1..5d78f389afe 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -7,6 +7,13 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#define PETERD_FIX_IMPL1_ENH 1 +#define PETERD_FIX_IMPL1_XOR 1 + +#define PETERD_FIX_IMPL2_ODD 0 +#define PETERD_FIX_IMPL2_ENH 1 +#define PETERD_FIX_IMPL2_XOR 1 + #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" @@ -124,10 +131,13 @@ inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, #endif assert(num_lines > 0 && total_bits > 0); - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t delta = (h >> 17) | (h << 15) | PETERD_FIX_IMPL2_ODD; // Rotate right 17 bits + if (PETERD_FIX_IMPL2_XOR) { h ^= 0x6740bca3; } uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + if (PETERD_FIX_IMPL2_XOR) { h ^= 0x41fc0926; } for (uint32_t i = 0; i < num_probes_; ++i) { + if (PETERD_FIX_IMPL2_ENH) { delta += i; } // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized // to a simple operation by compiler. const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); @@ -226,12 +236,15 @@ bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, const char* data = filter.data(); uint32_t h = hash; - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t delta = (h >> 17) | (h << 15) | PETERD_FIX_IMPL2_ODD; // Rotate right 17 bits + if (PETERD_FIX_IMPL2_XOR) { h ^= 0x6740bca3; } uint32_t b = (h % num_lines) * (cache_line_size * 8); + if (PETERD_FIX_IMPL2_XOR) { h ^= 0x41fc0926; } PREFETCH(&data[b / 8], 0 /* rw */, 1 /* locality */); PREFETCH(&data[b / 8 + cache_line_size - 1], 0 /* rw */, 1 /* locality */); for (uint32_t i = 0; i < num_probes; ++i) { + if (PETERD_FIX_IMPL2_ENH) { delta += i; } // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized // to a simple and operation by compiler. const uint32_t bitpos = b + (h % (cache_line_size * 8)); @@ -272,7 +285,6 @@ class BloomFilterPolicy : public FilterPolicy { size_t bytes = (bits + 7) / 8; bits = bytes * 8; - const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); dst->push_back(static_cast(num_probes_)); // Remember # of probes @@ -281,8 +293,10 @@ class BloomFilterPolicy : public FilterPolicy { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. uint32_t h = hash_func_(keys[i]); - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (PETERD_FIX_IMPL1_XOR) { h ^= 0x6740bca3; } for (size_t j = 0; j < num_probes_; j++) { + if (PETERD_FIX_IMPL1_ENH) { delta += j; } const uint32_t bitpos = h % bits; array[bitpos/8] |= (1 << (bitpos % 8)); h += delta; @@ -308,8 +322,10 @@ class BloomFilterPolicy : public FilterPolicy { } uint32_t h = hash_func_(key); - const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + if (PETERD_FIX_IMPL1_XOR) { h ^= 0x6740bca3; } for (size_t j = 0; j < k; j++) { + if (PETERD_FIX_IMPL1_ENH) { delta += j; } const uint32_t bitpos = h % bits; if ((array[bitpos/8] & (1 << (bitpos % 8))) == 0) return false; h += delta; diff --git a/util/bloom_test.cc b/util/bloom_test.cc index bbf1d3ae9a8..f52b51f2986 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -31,6 +31,15 @@ DEFINE_int32(bits_per_key, 10, ""); namespace rocksdb { +#define BLOOM_TEST_THOROUGH 1 +#ifdef BLOOM_TEST_THOROUGH +static const int bloom_test_max_length = 10000000; +static const int bloom_test_fp_probes = 100000; +#else +static const int bloom_test_max_length = 10000; +static const int bloom_test_fp_probes = 10000; +#endif + static const int kVerbose = 1; static Slice Key(int i, char* buffer) { @@ -47,8 +56,14 @@ static int NextLength(int length) { length += 10; } else if (length < 1000) { length += 100; - } else { + } else if (length < 10000) { length += 1000; + } else if (length < 100000) { + length += 10000; + } else if (length < 1000000) { + length += 100000; + } else { + length += 1000000; } return length; } @@ -113,12 +128,12 @@ class BloomTest : public testing::Test { double FalsePositiveRate() { char buffer[sizeof(int)]; int result = 0; - for (int i = 0; i < 10000; i++) { + for (int i = 0; i < bloom_test_fp_probes; i++) { if (Matches(Key(i + 1000000000, buffer))) { result++; } } - return result / 10000.0; + return result / (double)bloom_test_fp_probes; } }; @@ -142,11 +157,16 @@ TEST_F(BloomTest, VaryingLengths) { // Count number of filters that significantly exceed the false positive rate int mediocre_filters = 0; int good_filters = 0; + double rate_sum = 0.0; + int rate_sum_samples = 0; + double max_rate = 0.0; - for (int length = 1; length <= 10000; length = NextLength(length)) { + for (int length = 1; length <= bloom_test_max_length; length = NextLength(length)) { Reset(); for (int i = 0; i < length; i++) { - Add(Key(i, buffer)); + // For more independent results for various lengths, use different + // input keys between them (length * 10 + ...) + Add(Key(length * 10 + i, buffer)); } Build(); @@ -154,12 +174,17 @@ TEST_F(BloomTest, VaryingLengths) { // All added keys must match for (int i = 0; i < length; i++) { - ASSERT_TRUE(Matches(Key(i, buffer))) + ASSERT_TRUE(Matches(Key(length * 10 + i, buffer))) << "Length " << length << "; key " << i; } // Check false positive rate double rate = FalsePositiveRate(); + if (length >= 1000) { + rate_sum += rate; + rate_sum_samples++; + } + if (rate > max_rate) { max_rate = rate; } if (kVerbose >= 1) { fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", rate*100.0, length, static_cast(FilterSize())); @@ -169,8 +194,9 @@ TEST_F(BloomTest, VaryingLengths) { else good_filters++; } if (kVerbose >= 1) { - fprintf(stderr, "Filters: %d good, %d mediocre\n", - good_filters, mediocre_filters); + fprintf(stderr, "Filters: %d good, %d mediocre (avg=%5.2f%% (l>=1000), max=%5.2f%%)\n", + good_filters, mediocre_filters, + rate_sum / rate_sum_samples * 100.0, max_rate * 100.0); } ASSERT_LE(mediocre_filters, good_filters/5); } @@ -231,12 +257,12 @@ class FullBloomTest : public testing::Test { double FalsePositiveRate() { char buffer[sizeof(int)]; int result = 0; - for (int i = 0; i < 10000; i++) { + for (int i = 0; i < bloom_test_fp_probes; i++) { if (Matches(Key(i + 1000000000, buffer))) { result++; } } - return result / 10000.0; + return result / (double)bloom_test_fp_probes; } }; @@ -274,11 +300,16 @@ TEST_F(FullBloomTest, FullVaryingLengths) { // Count number of filters that significantly exceed the false positive rate int mediocre_filters = 0; int good_filters = 0; + double rate_sum = 0.0; + int rate_sum_samples = 0; + double max_rate = 0.0; - for (int length = 1; length <= 10000; length = NextLength(length)) { + for (int length = 1; length <= bloom_test_max_length; length = NextLength(length)) { Reset(); for (int i = 0; i < length; i++) { - Add(Key(i, buffer)); + // For more independent results for various lengths, use different + // input keys between them (length * 10 + ...) + Add(Key(length * 10 + i, buffer)); } Build(); @@ -286,12 +317,17 @@ TEST_F(FullBloomTest, FullVaryingLengths) { // All added keys must match for (int i = 0; i < length; i++) { - ASSERT_TRUE(Matches(Key(i, buffer))) + ASSERT_TRUE(Matches(Key(length * 10 + i, buffer))) << "Length " << length << "; key " << i; } // Check false positive rate double rate = FalsePositiveRate(); + if (length >= 1000) { + rate_sum += rate; + rate_sum_samples++; + } + if (rate > max_rate) { max_rate = rate; } if (kVerbose >= 1) { fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", rate*100.0, length, static_cast(FilterSize())); @@ -303,8 +339,9 @@ TEST_F(FullBloomTest, FullVaryingLengths) { good_filters++; } if (kVerbose >= 1) { - fprintf(stderr, "Filters: %d good, %d mediocre\n", - good_filters, mediocre_filters); + fprintf(stderr, "Filters: %d good, %d mediocre (avg=%5.2f%% (l>=1000), max=%5.2f%%)\n", + good_filters, mediocre_filters, + rate_sum / rate_sum_samples * 100.0, max_rate * 100.0); } ASSERT_LE(mediocre_filters, good_filters/5); }