Skip to content

Commit

Permalink
Add BloomFilter64 to support large Bloom filters
Browse files Browse the repository at this point in the history
Add BloomFilter64 that is like BloomFilter, but uses ulong to
represent m and count, and uses Buckets64 and Hash128.

Add Buckets64 that is like Buckets, but uses ulong to represent count
and uses multiple arrays to overcome the C# array size limit of
approximately 2^31 elements and the default .NET limit of 2^31 bytes
per object.

Add Hash128 that is like Hash, but provides 128 bits of hash material.
Also, add Hash128.ComputeHashAndSum() to improve performance by
avoiding the conversion of bytes to string, then string to bytes.

Add Utils.OptimalM64() that takes a ulong n and calculates a ulong m.

Add Utils.HashKernel128() and Utils.HashKernel128ReturnValue for
working with 128 bits of hash material.  In Utils.HashKernel128(),
avoid the use of Skip() and Take() to improve performance.

Add TestBloomFilter64.cs, TestBuckets64.cs, TestHash128.cs to test
the new classes.
  • Loading branch information
dferreyra committed May 11, 2018
1 parent edb3da6 commit 3e8bcee
Show file tree
Hide file tree
Showing 9 changed files with 1,058 additions and 0 deletions.
199 changes: 199 additions & 0 deletions ProbabilisticDataStructures/BloomFilter64.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using ProbabilisticDataStructures;
using System.Security.Cryptography;

namespace ProbabilisticDataStructures
{
/// <summary>
/// BloomFilter64 implements a classic Bloom filter. A bloom filter has a non-zero
/// probability of false positives and a zero probability of false negatives.
/// </summary>
public class BloomFilter64 : IFilter
{
    /// <summary>
    /// Filter data: one single-bit bucket per filter position.
    /// </summary>
    internal Buckets64 Buckets { get; set; }
    /// <summary>
    /// Hash algorithm handed to Utils.HashKernel128 for every operation.
    /// </summary>
    private HashAlgorithm Hash { get; set; }
    /// <summary>
    /// Filter size (number of bits)
    /// </summary>
    private ulong m { get; set; }
    /// <summary>
    /// Number of hash functions
    /// </summary>
    private uint k { get; set; }
    /// <summary>
    /// Number of items added since construction or the last Reset
    /// </summary>
    private ulong count { get; set; }

    /// <summary>
    /// Creates a new Bloom filter optimized to store n items with a specified target
    /// false-positive rate.
    /// </summary>
    /// <param name="n">Number of items to store.</param>
    /// <param name="fpRate">Desired false positive rate.</param>
    public BloomFilter64(ulong n, double fpRate)
    {
        var m = Utils.OptimalM64(n, fpRate);
        var k = Utils.OptimalK(fpRate);
        // One bit per bucket: a classic Bloom filter only records presence.
        Buckets = new Buckets64(m, 1);
        Hash = Defaults.GetDefaultHashAlgorithm();
        this.m = m;
        this.k = k;
    }

    /// <summary>
    /// Returns the Bloom filter capacity, m.
    /// </summary>
    /// <returns>The Bloom filter capacity, m.</returns>
    public ulong Capacity()
    {
        return this.m;
    }

    /// <summary>
    /// Returns the number of hash functions.
    /// </summary>
    /// <returns>The number of hash functions.</returns>
    public uint K()
    {
        return this.k;
    }

    /// <summary>
    /// Returns the number of items added to the filter. Every call to Add or
    /// TestAndAdd is counted, including repeated additions of the same data.
    /// </summary>
    /// <returns>The number of items added.</returns>
    public ulong Count()
    {
        return this.count;
    }

    /// <summary>
    /// Returns the current estimated ratio of set bits, computed from the item
    /// count as 1 - e^(-count * k / m) without inspecting the buckets.
    /// </summary>
    /// <returns>The current estimated ratio of set bits.</returns>
    public double EstimatedFillRatio()
    {
        return 1 - Math.Exp((-(double)this.count * (double)this.k) / (double)this.m);
    }

    /// <summary>
    /// Returns the ratio of set bits, measured by visiting every bucket. The cost
    /// is linear in the filter size.
    /// </summary>
    /// <returns>The ratio of set bits.</returns>
    public double FillRatio()
    {
        ulong sum = 0;
        for (ulong i = 0; i < this.Buckets.count; i++)
        {
            sum += this.Buckets.Get(i);
        }
        return (double)sum / (double)this.m;
    }

    /// <summary>
    /// Will test for membership of the data and returns true if it is a member,
    /// false if not. This is a probabilistic test, meaning there is a non-zero
    /// probability of false positives but a zero probability of false negatives.
    /// </summary>
    /// <param name="data">The data to search for.</param>
    /// <returns>Whether or not the data is maybe contained in the filter.</returns>
    public bool Test(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // If any of the K bits are not set, then it's not a member.
        for (uint i = 0; i < this.k; i++)
        {
            if (this.Buckets.Get((lower + upper * i) % this.m) == 0)
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Will add the data to the Bloom filter. It returns the filter to allow
    /// for chaining.
    /// </summary>
    /// <param name="data">The data to add.</param>
    /// <returns>The filter.</returns>
    public IFilter Add(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;

        // Set the K bits.
        for (uint i = 0; i < this.k; i++)
        {
            this.Buckets.Set((lower + upper * i) % this.m, 1);
        }

        this.count++;
        return this;
    }

    /// <summary>
    /// Is equivalent to calling Test followed by Add. It returns true if the data is
    /// a member, false if not.
    /// </summary>
    /// <param name="data">The data to test for and add if it doesn't exist.</param>
    /// <returns>Whether or not the data was probably contained in the filter.</returns>
    public bool TestAndAdd(byte[] data)
    {
        var hashKernel = Utils.HashKernel128(data, this.Hash);
        var lower = hashKernel.LowerBaseHash;
        var upper = hashKernel.UpperBaseHash;
        var member = true;

        // If any of the K bits are not set, then it's not a member. Unlike Test,
        // the loop cannot stop early because every bit still has to be set.
        for (uint i = 0; i < this.k; i++)
        {
            var idx = (lower + upper * i) % this.m;
            if (this.Buckets.Get(idx) == 0)
            {
                member = false;
            }
            this.Buckets.Set(idx, 1);
        }

        this.count++;
        return member;
    }

    /// <summary>
    /// Restores the Bloom filter to its original state: all bits are cleared and
    /// the item count returns to zero. It returns the filter to allow for chaining.
    /// </summary>
    /// <returns>The reset bloom filter.</returns>
    public BloomFilter64 Reset()
    {
        this.Buckets.Reset();
        // The count must be cleared too; otherwise Count() and
        // EstimatedFillRatio() keep reporting the pre-reset contents.
        this.count = 0;
        return this;
    }

    /// <summary>
    /// Sets the hashing function used in the filter. Data added with a previous
    /// hash function is not re-hashed.
    /// </summary>
    /// <param name="h">The HashAlgorithm to use.</param>
    // TODO: Add SetHash to the IFilter interface?
    public void SetHash(HashAlgorithm h)
    {
        this.Hash = h;
    }
}
}
189 changes: 189 additions & 0 deletions ProbabilisticDataStructures/Buckets64.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace ProbabilisticDataStructures
{
/// <summary>
/// Buckets64 is a fast, space-efficient array of buckets where each bucket can store
/// up to a configured maximum value.
/// </summary>
public class Buckets64
{
    // The largest C# array to create; the largest power of 2 that C# can support.
    private const uint maxArraySize = 1U << 30;
    // Backing storage, split into chunks of at most maxArraySize bytes each.
    private byte[][] Data { get; set; }
    // Number of chunks in Data.
    private int arrayCount { get; set; }
    // Number of bits per bucket.
    private byte bucketSize { get; set; }
    // Largest value a bucket may hold; never exceeds byte.MaxValue.
    private byte _max;
    internal ulong count { get; set; }

    /// <summary>
    /// Creates a new Buckets64 with the provided number of buckets where each bucket
    /// is the specified number of bits.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    /// <exception cref="OverflowException">
    /// The total number of bits does not fit in a 64-bit unsigned integer.
    /// </exception>
    internal Buckets64(ulong count, byte bucketSize)
    {
        this.count = count;
        this.bucketSize = bucketSize;
        AllocateArray(count, bucketSize);
        this._max = ComputeMaxValue(bucketSize);
    }

    /// <summary>
    /// Computes the largest value a bucket of the given width can hold, capped at
    /// byte.MaxValue because bucket values are exchanged as bytes.
    /// </summary>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    /// <returns>2^bucketSize - 1, saturated at 255.</returns>
    private static byte ComputeMaxValue(byte bucketSize)
    {
        // Shifting an int uses only the low five bits of the shift count, so
        // (1 << 32) - 1 would evaluate to 0. Saturate before shifting instead.
        if (bucketSize >= 8)
        {
            return byte.MaxValue;
        }
        return (byte)((1 << bucketSize) - 1);
    }

    /// <summary>
    /// Allocates zero-filled backing storage for the requested number of buckets,
    /// split into chunks of at most maxArraySize bytes.
    /// </summary>
    /// <param name="count">Number of buckets.</param>
    /// <param name="bucketSize">Number of bits per bucket.</param>
    private void AllocateArray(ulong count, byte bucketSize)
    {
        // checked: a silently wrapped size would under-allocate the storage.
        ulong bytesToAllocate = checked(count * bucketSize + 7) / 8;

        // The chunk count follows from the number of bytes, not the number of
        // buckets, so buckets wider than 8 bits still get enough chunks. At least
        // one (possibly empty) chunk is always present.
        ulong chunksNeeded = (bytesToAllocate + maxArraySize - 1) / maxArraySize;
        this.arrayCount = (int)Math.Max(chunksNeeded, 1UL);

        this.Data = new byte[this.arrayCount][];
        for (int i = 0; i < this.arrayCount; i++)
        {
            ulong arraySize = Math.Min(bytesToAllocate, maxArraySize);
            this.Data[i] = new byte[arraySize];
            bytesToAllocate -= arraySize;
        }
    }

    /// <summary>
    /// Returns the maximum value that can be stored in a bucket.
    /// </summary>
    /// <returns>The bucket max value.</returns>
    internal byte MaxBucketValue()
    {
        return this._max;
    }

    /// <summary>
    /// Increment the value in the specified bucket by the provided delta. A bucket
    /// can be decremented by providing a negative delta.
    /// <para>
    /// The value is clamped to zero and the maximum bucket value. Returns itself
    /// to allow for chaining.
    /// </para>
    /// </summary>
    /// <param name="bucket">The bucket to increment.</param>
    /// <param name="delta">The amount to increment the bucket by.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Increment(ulong bucket, int delta)
    {
        // 64-bit offset so that buckets beyond the 32-bit range are addressed
        // correctly, the same way Get and Set address them.
        ulong offset = bucket * this.bucketSize;
        long val = (long)GetBits(offset, this.bucketSize) + delta;

        if (val > this._max)
            val = this._max;
        else if (val < 0)
            val = 0;

        SetBits(offset, this.bucketSize, (uint)val);
        return this;
    }

    /// <summary>
    /// Set the bucket value. The value is clamped to zero and the maximum bucket
    /// value. Returns itself to allow for chaining.
    /// </summary>
    /// <param name="bucket">The bucket to change the value of.</param>
    /// <param name="value">The value to set.</param>
    /// <returns>The modified bucket.</returns>
    internal Buckets64 Set(ulong bucket, byte value)
    {
        if (value > this._max)
            value = this._max;

        SetBits(bucket * this.bucketSize, this.bucketSize, value);
        return this;
    }

    /// <summary>
    /// Returns the value in the specified bucket.
    /// </summary>
    /// <param name="bucket">The bucket to get.</param>
    /// <returns>The specified bucket.</returns>
    internal uint Get(ulong bucket)
    {
        return GetBits(bucket * this.bucketSize, this.bucketSize);
    }

    /// <summary>
    /// Restores the Buckets64 to the original state by replacing the backing
    /// storage with freshly allocated, zero-filled arrays. Returns itself to allow
    /// for chaining.
    /// </summary>
    /// <returns>The Buckets64 object the reset operation was performed on.</returns>
    internal Buckets64 Reset()
    {
        AllocateArray(this.count, this.bucketSize);
        return this;
    }

    /// <summary>
    /// Returns the bits at the specified offset and length.
    /// </summary>
    /// <param name="offset">The bit position to start reading at.</param>
    /// <param name="length">The number of bits to read from the offset.</param>
    /// <returns>The bits at the specified offset and length.</returns>
    internal uint GetBits(ulong offset, int length)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        // A read that crosses a byte boundary is split: the bits left in the
        // current byte form the low part, the remainder is read recursively and
        // shifted above it.
        if ((byteOffset + length) > 8)
        {
            int rem = 8 - byteOffset;
            return GetBits(offset, rem)
                | (GetBits(offset + (ulong)rem, length - rem) << rem);
        }

        // Locate the chunk holding the byte and the byte's position inside it.
        var dataArray = this.Data[byteIndex / maxArraySize];
        var dataArrayByteIndex = byteIndex % maxArraySize;
        int bitMask = (1 << length) - 1;
        return (uint)((dataArray[dataArrayByteIndex] >> byteOffset) & bitMask);
    }

    /// <summary>
    /// Sets bits at the specified offset and length.
    /// </summary>
    /// <param name="offset">The bit position to start writing at.</param>
    /// <param name="length">The number of bits to write from the offset.</param>
    /// <param name="bits">The bits to write; only the low length bits are used.</param>
    internal void SetBits(ulong offset, int length, uint bits)
    {
        ulong byteIndex = offset / 8;
        int byteOffset = (int)(offset % 8);

        // A write that crosses a byte boundary is split: the low bits fill the
        // rest of the current byte, the remaining high bits go to the next one.
        if ((byteOffset + length) > 8)
        {
            int rem = 8 - byteOffset;
            SetBits(offset, rem, bits);
            SetBits(offset + (ulong)rem, length - rem, bits >> rem);
            return;
        }

        // Locate the chunk holding the byte and the byte's position inside it.
        var dataArray = this.Data[byteIndex / maxArraySize];
        var dataArrayByteIndex = byteIndex % maxArraySize;
        int bitMask = (1 << length) - 1;

        // Clear the target bit field, then merge in the new bits.
        int cleared = dataArray[dataArrayByteIndex] & ~(bitMask << byteOffset);
        int inserted = (int)(bits & (uint)bitMask) << byteOffset;
        dataArray[dataArrayByteIndex] = (byte)(cleared | inserted);
    }
}
}
Loading

0 comments on commit 3e8bcee

Please sign in to comment.