Skip to content

Commit

Permalink
Add HyperLogLog implementation (#1223)
Browse files Browse the repository at this point in the history
Implement a HyperLogLog sketch based on the `khmer` implementation but using the estimator from ["New cardinality estimation algorithms for HyperLogLog sketches"](http://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf) (also implemented in `dashing`).

This PR also moves `add_sequence` and `add_protein` to `SigsTrait`, closing #1057.

The encoding data and methods (`hp`, `dayhoff`, `aa` and `HashFunctions`) were in the MinHash source file; since they are more general-purpose, they were moved to a new module, `encodings`, which is then used by `SigsTrait`.

(these changes are both spun off from #1201)
  • Loading branch information
luizirber authored Oct 31, 2020
1 parent 1e94bde commit bdc19d5
Show file tree
Hide file tree
Showing 24 changed files with 1,642 additions and 685 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ doc: build .PHONY
cd doc && make html

include/sourmash.h: src/core/src/lib.rs \
src/core/src/ffi/hyperloglog.rs \
src/core/src/ffi/minhash.rs \
src/core/src/ffi/signature.rs \
src/core/src/ffi/nodegraph.rs \
Expand Down
37 changes: 37 additions & 0 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ enum SourmashErrorCode {
SOURMASH_ERROR_CODE_INVALID_HASH_FUNCTION = 1104,
SOURMASH_ERROR_CODE_READ_DATA = 1201,
SOURMASH_ERROR_CODE_STORAGE = 1202,
SOURMASH_ERROR_CODE_HLL_PRECISION_BOUNDS = 1301,
SOURMASH_ERROR_CODE_IO = 100001,
SOURMASH_ERROR_CODE_UTF8_ERROR = 100002,
SOURMASH_ERROR_CODE_PARSE_INT = 100003,
Expand All @@ -45,6 +46,8 @@ typedef uint32_t SourmashErrorCode;

typedef struct SourmashComputeParameters SourmashComputeParameters;

typedef struct SourmashHyperLogLog SourmashHyperLogLog;

typedef struct SourmashKmerMinHash SourmashKmerMinHash;

typedef struct SourmashNodegraph SourmashNodegraph;
Expand Down Expand Up @@ -115,6 +118,40 @@ bool computeparams_track_abundance(const SourmashComputeParameters *ptr);

uint64_t hash_murmur(const char *kmer, uint64_t seed);

/*
 * HyperLogLog sketch API.
 *
 * `SourmashHyperLogLog` is only forward-declared in this header, so callers
 * handle it through pointers. The behaviour of each function is defined on
 * the Rust side; the Makefile rule for this header lists
 * src/core/src/ffi/hyperloglog.rs among its sources.
 */
void hll_add_hash(SourmashHyperLogLog *ptr, uint64_t hash);

/* `insize` is the length of `sequence` (the Python wrapper passes a length
 * alongside the converted sequence buffer). */
void hll_add_sequence(SourmashHyperLogLog *ptr, const char *sequence, uintptr_t insize, bool force);

uintptr_t hll_cardinality(const SourmashHyperLogLog *ptr);

double hll_containment(const SourmashHyperLogLog *ptr, const SourmashHyperLogLog *optr);

/* Registered as the deallocation function of the Python `HLL` wrapper
 * (sourmash/hll.py, `__dealloc_func__`). */
void hll_free(SourmashHyperLogLog *ptr);

/* The Python wrapper calls this with a buffer and its length (`buf, len(buf)`). */
SourmashHyperLogLog *hll_from_buffer(const char *ptr, uintptr_t insize);

SourmashHyperLogLog *hll_from_path(const char *filename);

uintptr_t hll_intersection_size(const SourmashHyperLogLog *ptr, const SourmashHyperLogLog *optr);

uintptr_t hll_ksize(const SourmashHyperLogLog *ptr);

uintptr_t hll_matches(const SourmashHyperLogLog *ptr, const SourmashKmerMinHash *mh_ptr);

void hll_merge(SourmashHyperLogLog *ptr, const SourmashHyperLogLog *optr);

SourmashHyperLogLog *hll_new(void);

void hll_save(const SourmashHyperLogLog *ptr, const char *filename);

double hll_similarity(const SourmashHyperLogLog *ptr, const SourmashHyperLogLog *optr);

/* `size` is an out-parameter: the Python wrapper allocates a `uintptr_t`,
 * passes its address here, and afterwards reads it as the length of the
 * returned buffer.
 * NOTE(review): the wrapper releases the buffer with `nodegraph_buffer_free`;
 * confirm that is the intended deallocator for this buffer. */
const uint8_t *hll_to_buffer(const SourmashHyperLogLog *ptr, uintptr_t *size);

void hll_update_mh(SourmashHyperLogLog *ptr, const SourmashKmerMinHash *optr);

SourmashHyperLogLog *hll_with_error_rate(double error_rate, uintptr_t ksize);

void kmerminhash_add_from(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

void kmerminhash_add_hash(SourmashKmerMinHash *ptr, uint64_t h);
Expand Down
112 changes: 112 additions & 0 deletions sourmash/hll.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# -*- coding: UTF-8 -*-

import sys
from tempfile import NamedTemporaryFile

from ._lowlevel import ffi, lib
from .utils import RustObject, rustcall, decode_str
from .exceptions import SourmashError
from .minhash import to_bytes, MinHash


class HLL(RustObject):
    """Python wrapper around the HyperLogLog sketch exposed through ``lib``.

    Every operation is forwarded to an ``hll_*`` function of the FFI library;
    this class only validates argument types and marshals values.
    """

    __dealloc_func__ = lib.hll_free

    def __init__(self, error_rate, ksize):
        """Create a sketch for the given ``error_rate`` and k-mer size ``ksize``.

        The constructor is routed through ``rustcall`` exactly like ``load()``
        and ``from_buffer()`` so that a failure while building the sketch is
        handled the same way as for the other constructors instead of leaving
        an unchecked pointer in ``_objptr``.
        NOTE(review): presumably ``rustcall`` turns library-side errors (e.g.
        the HLL precision-bounds error code) into Python exceptions -- confirm
        in ``.utils``.
        """
        self._objptr = rustcall(lib.hll_with_error_rate, error_rate, ksize)

    def __len__(self):
        """Return the estimated cardinality (same value as ``cardinality()``)."""
        return self.cardinality()

    def cardinality(self):
        """Return the value of ``hll_cardinality`` for this sketch."""
        return self._methodcall(lib.hll_cardinality)

    @property
    def ksize(self):
        """The k-mer size reported by ``hll_ksize``."""
        return self._methodcall(lib.hll_ksize)

    def add_sequence(self, sequence, force=False):
        """Add a sequence into the sketch.

        ``force`` is passed through unchanged to ``hll_add_sequence``.
        """
        # Measure the converted buffer rather than the original object: for a
        # ``str`` holding non-ASCII characters the character count differs from
        # the byte count, and the library is given a byte pointer plus length.
        # (assumes ``to_bytes`` returns a ``bytes``-like object -- TODO confirm)
        data = to_bytes(sequence)
        self._methodcall(lib.hll_add_sequence, data, len(data), force)

    def add_kmer(self, kmer):
        """Add a kmer into the sketch.

        Raises:
            ValueError: if ``kmer`` is not exactly ``ksize`` long.
        """
        if len(kmer) != self.ksize:
            raise ValueError("kmer to add is not {} in length".format(self.ksize))
        self.add_sequence(kmer)

    def add(self, h):
        """Add ``h``: a ``str`` is treated as a k-mer, anything else as a hash."""
        if isinstance(h, str):
            return self.add_kmer(h)
        return self._methodcall(lib.hll_add_hash, h)

    def update(self, other):
        """Fold ``other`` (an ``HLL`` or a ``MinHash``) into this sketch.

        Raises:
            TypeError: for any other type.
        """
        if isinstance(other, HLL):
            return self._methodcall(lib.hll_merge, other._objptr)
        if isinstance(other, MinHash):
            return self._methodcall(lib.hll_update_mh, other._objptr)
        # FIXME: we could take sets here too (or anything that can be
        # converted to a list of ints...)
        raise TypeError("Must be a HyperLogLog or MinHash")

    def similarity(self, other):
        """Return ``hll_similarity`` between this sketch and another ``HLL``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if not isinstance(other, HLL):
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")
        return self._methodcall(lib.hll_similarity, other._objptr)

    def containment(self, other):
        """Return ``hll_containment`` between this sketch and another ``HLL``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if not isinstance(other, HLL):
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")
        return self._methodcall(lib.hll_containment, other._objptr)

    def intersection(self, other):
        """Return ``hll_intersection_size`` of this sketch and another ``HLL``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if not isinstance(other, HLL):
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")
        return self._methodcall(lib.hll_intersection_size, other._objptr)

    @staticmethod
    def load(filename):
        """Build an ``HLL`` from the file at ``filename`` via ``hll_from_path``."""
        hll_ptr = rustcall(lib.hll_from_path, to_bytes(filename))
        return HLL._from_objptr(hll_ptr)

    @staticmethod
    def from_buffer(buf):
        """Build an ``HLL`` from an in-memory buffer via ``hll_from_buffer``."""
        hll_ptr = rustcall(lib.hll_from_buffer, buf, len(buf))
        return HLL._from_objptr(hll_ptr)

    def save(self, filename):
        """Write the sketch to ``filename`` via ``hll_save``."""
        self._methodcall(lib.hll_save, to_bytes(filename))

    def to_bytes(self, compression=1):
        """Return the serialized sketch as a cffi buffer.

        ``compression`` is accepted for interface compatibility but is not
        used by this method.
        """
        size_ptr = ffi.new("uintptr_t *")
        rawbuf = self._methodcall(lib.hll_to_buffer, size_ptr)
        size = size_ptr[0]

        # Tie the lifetime of the returned memory to the Python object: the
        # destructor runs once the garbage collector drops ``rawbuf``.
        # NOTE(review): the buffer is released with ``nodegraph_buffer_free``;
        # confirm that is the intended deallocator for HLL buffers.
        rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, size), size)
        buf = ffi.buffer(rawbuf, size)

        return buf

    def count(self, h):
        """Alias for ``add(h)``; returns ``None``."""
        self.add(h)

    def get(self, h):
        """Not supported; always raises ``NotImplementedError``."""
        raise NotImplementedError("HLL doesn't support membership query")

    def matches(self, mh):
        """Return ``hll_matches`` for this sketch against the ``MinHash`` ``mh``.

        Raises:
            ValueError: if ``mh`` is not a ``MinHash``. (The sibling methods
                raise ``TypeError`` for a wrong type; ``ValueError`` is kept
                here so existing callers keep working.)
        """
        if not isinstance(mh, MinHash):
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise ValueError("mh must be a MinHash")

        return self._methodcall(lib.hll_matches, mh._objptr)
11 changes: 5 additions & 6 deletions src/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,27 @@ from-finch = ["finch"]
parallel = ["rayon"]

[dependencies]
az = "1.0.0"
backtrace = "=0.3.46" # later versions require rust 1.40
bytecount = "0.6.0"
byteorder = "1.3.4"
cfg-if = "1.0"
failure = "0.1.8" # can remove after .backtrace() is available in std::error::Error
finch = { version = "0.3.0", optional = true }
fixedbitset = "0.3.0"
getset = "0.1.1"
log = "0.4.8"
md5 = "0.7.0"
murmurhash3 = "0.0.5"
niffler = { version = "2.2.0", default-features = false, features = [ "gz" ] }
nohash-hasher = "0.2.0"
num-iter = "0.1.41"
once_cell = "1.3.1"
rayon = { version = "1.3.0", optional = true }
serde = { version = "1.0.110", features = ["derive"] }
serde_json = "1.0.53"
primal-check = "0.2.3"
thiserror = "1.0"
typed-builder = "0.7.0"
getset = "0.1.1"

[target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen]
version = "0.2.62"
Expand All @@ -58,15 +60,12 @@ wasm-opt = false # https://github.com/rustwasm/wasm-pack/issues/886
[dev-dependencies]
assert_matches = "1.3.0"
criterion = "0.3.2"
needletail = { version = "0.4.0", default-features = false }
predicates = "1.0.4"
proptest = { version = "0.9.6", default-features = false, features = ["std"]} # Upgrade to 0.10 requires rust 1.39
rand = "0.7.3"
tempfile = "3.1.0"

[dev-dependencies.needletail]
version = "0.4.0"
default-features = false

[[bench]]
name = "index"
harness = false
Expand Down
3 changes: 2 additions & 1 deletion src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ use wasm_bindgen::prelude::*;
use getset::{CopyGetters, Getters, Setters};
use typed_builder::TypedBuilder;

use crate::encodings::HashFunctions;
use crate::index::MHBT;
use crate::signature::Signature;
use crate::sketch::minhash::{max_hash_for_scaled, HashFunctions, KmerMinHashBTree};
use crate::sketch::minhash::{max_hash_for_scaled, KmerMinHashBTree};
use crate::sketch::Sketch;
use crate::Error;

Expand Down
Loading

0 comments on commit bdc19d5

Please sign in to comment.