-
Notifications
You must be signed in to change notification settings - Fork 80
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add HyperLogLog implementation (#1223)
Implement a HyperLogLog sketch based on the `khmer` implementation but using the estimator from ["New cardinality estimation algorithms for HyperLogLog sketches"](http://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf) (also implemented in `dashing`). This PR also moves `add_sequence` and `add_protein` to `SigsTrait`, closing #1057. The encoding data and methods (`hp`, `dayhoff`, `aa` and `HashFunctions`) were in the MinHash source file, and since they are more general-purpose they were moved to a new module `encodings`, which is then used by `SigsTrait`. (These changes are both spun off of #1201.)
- Loading branch information
Showing
24 changed files
with
1,642 additions
and
685 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# -*- coding: UTF-8 -*- | ||
|
||
import sys | ||
from tempfile import NamedTemporaryFile | ||
|
||
from ._lowlevel import ffi, lib | ||
from .utils import RustObject, rustcall, decode_str | ||
from .exceptions import SourmashError | ||
from .minhash import to_bytes, MinHash | ||
|
||
|
||
class HLL(RustObject):
    """HyperLogLog sketch backed by the native library exposed as ``lib``.

    Every operation is forwarded to one of the ``hll_*`` functions of the
    low-level bindings; this class only adapts arguments and performs the
    Python-side type checks.
    """

    # Native function registered as the deallocator for the wrapped pointer
    # (presumably called by RustObject when the wrapper is released --
    # confirm against ``utils.RustObject``).
    __dealloc_func__ = lib.hll_free

    def __init__(self, error_rate, ksize):
        """Create a new sketch.

        ``error_rate`` and ``ksize`` are passed unchanged to
        ``lib.hll_with_error_rate``; the returned pointer becomes the
        object wrapped by this instance.
        """
        self._objptr = lib.hll_with_error_rate(error_rate, ksize)

    def __len__(self):
        """Return ``self.cardinality()`` so that ``len(hll)`` works."""
        return self.cardinality()

    def cardinality(self):
        """Return the value reported by ``lib.hll_cardinality``."""
        return self._methodcall(lib.hll_cardinality)

    @property
    def ksize(self):
        """The k-mer size reported by ``lib.hll_ksize``."""
        return self._methodcall(lib.hll_ksize)

    def add_sequence(self, sequence, force=False):
        """Add a sequence into the sketch.

        ``sequence`` is converted with the module-level ``to_bytes`` helper
        and handed to ``lib.hll_add_sequence`` together with its size and
        the ``force`` flag.
        """
        # NOTE: ``to_bytes`` here is the helper imported at module level,
        # not the ``HLL.to_bytes`` method (class attributes are not in scope
        # inside method bodies).
        encoded = to_bytes(sequence)
        # The size must describe the buffer that is actually passed to the
        # native call. ``len(sequence)`` counts characters for ``str`` input
        # and can therefore differ from the number of encoded bytes.
        self._methodcall(lib.hll_add_sequence, encoded, len(encoded), force)

    def add_kmer(self, kmer):
        """Add a kmer into the sketch.

        Raises:
            ValueError: if ``len(kmer)`` differs from ``self.ksize``.
        """
        if len(kmer) != self.ksize:
            raise ValueError("kmer to add is not {} in length".format(self.ksize))
        self.add_sequence(kmer)

    def add(self, h):
        """Add ``h`` to the sketch.

        A ``str`` is treated as a k-mer and routed through ``add_kmer``;
        any other value is passed to ``lib.hll_add_hash``.
        """
        if isinstance(h, str):
            return self.add_kmer(h)
        return self._methodcall(lib.hll_add_hash, h)

    def update(self, other):
        """Fold ``other`` into this sketch.

        ``other`` may be another ``HLL`` (``lib.hll_merge``) or a
        ``MinHash`` (``lib.hll_update_mh``).

        Raises:
            TypeError: if ``other`` is neither an ``HLL`` nor a ``MinHash``.
        """
        if isinstance(other, HLL):
            return self._methodcall(lib.hll_merge, other._objptr)
        elif isinstance(other, MinHash):
            return self._methodcall(lib.hll_update_mh, other._objptr)
        else:
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("Must be a HyperLogLog or MinHash")

    def similarity(self, other):
        """Return the result of ``lib.hll_similarity`` against ``other``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if isinstance(other, HLL):
            return self._methodcall(lib.hll_similarity, other._objptr)
        else:
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")

    def containment(self, other):
        """Return the result of ``lib.hll_containment`` against ``other``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if isinstance(other, HLL):
            return self._methodcall(lib.hll_containment, other._objptr)
        else:
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")

    def intersection(self, other):
        """Return the result of ``lib.hll_intersection_size`` against ``other``.

        Raises:
            TypeError: if ``other`` is not an ``HLL``.
        """
        if isinstance(other, HLL):
            return self._methodcall(lib.hll_intersection_size, other._objptr)
        else:
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise TypeError("other must be a HyperLogLog")

    @staticmethod
    def load(filename):
        """Build an ``HLL`` from the file at ``filename``.

        The path is converted with ``to_bytes`` and read by
        ``lib.hll_from_path``.
        """
        hll_ptr = rustcall(lib.hll_from_path, to_bytes(filename))
        return HLL._from_objptr(hll_ptr)

    @staticmethod
    def from_buffer(buf):
        """Build an ``HLL`` from the serialized data in ``buf``.

        ``buf`` and ``len(buf)`` are passed to ``lib.hll_from_buffer``.
        """
        hll_ptr = rustcall(lib.hll_from_buffer, buf, len(buf))
        return HLL._from_objptr(hll_ptr)

    def save(self, filename):
        """Write the sketch to ``filename`` via ``lib.hll_save``."""
        self._methodcall(lib.hll_save, to_bytes(filename))

    def to_bytes(self, compression=1):
        """Serialize the sketch and return it as a buffer object.

        ``compression`` is accepted but not used by this implementation.
        The returned object is an ``ffi.buffer`` view over memory produced
        by ``lib.hll_to_buffer``.
        """
        # Out-parameter that receives the size of the serialized data.
        size_ptr = ffi.new("uintptr_t *")
        rawbuf = self._methodcall(lib.hll_to_buffer, size_ptr)
        nbytes = size_ptr[0]

        # Attach a finalizer so the native buffer is released with
        # ``lib.nodegraph_buffer_free`` once the cdata is collected.
        rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, nbytes), nbytes)
        buf = ffi.buffer(rawbuf, nbytes)

        return buf

    def count(self, h):
        """Add ``h`` to the sketch; same as ``add`` but returns nothing."""
        self.add(h)

    def get(self, h):
        """Unsupported: always raises ``NotImplementedError``."""
        raise NotImplementedError("HLL doesn't support membership query")

    def matches(self, mh):
        """Return the result of ``lib.hll_matches`` for the MinHash ``mh``.

        Raises:
            ValueError: if ``mh`` is not a ``MinHash``.
        """
        if not isinstance(mh, MinHash):
            # FIXME: we could take sets here too (or anything that can be
            # converted to a list of ints...)
            raise ValueError("mh must be a MinHash")

        return self._methodcall(lib.hll_matches, mh._objptr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.