diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 5ef8304fd9..eb8a55a94c 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -38,12 +38,12 @@ import sourmash from abc import abstractmethod, ABC from collections import namedtuple, Counter -from collections import defaultdict -from ..search import make_jaccard_search_query, make_gather_query -from ..manifest import CollectionManifest -from ..logging import debug_literal -from ..signature import load_signatures, save_signatures +from sourmash.search import (make_jaccard_search_query, make_gather_query, + calc_threshold_from_bp) +from sourmash.manifest import CollectionManifest +from sourmash.logging import debug_literal +from sourmash.signature import load_signatures, save_signatures # generic return tuple for Index.search and Index.gather IndexSearchResult = namedtuple('Result', 'score, signature, location') @@ -277,8 +277,13 @@ def gather(self, query, threshold_bp=None, **kwargs): return results[:1] - def peek(self, query_mh, threshold_bp=0): - "Mimic CounterGather.peek() on top of Index. Yes, this is backwards." + def peek(self, query_mh, *, threshold_bp=0): + """Mimic CounterGather.peek() on top of Index. + + This is implemented for situations where we don't want to use + 'prefetch' functionality. It is a light wrapper around the + 'gather'/search-by-containment method. + """ from sourmash import SourmashSignature # build a signature to use with self.gather... @@ -323,7 +328,7 @@ def counter_gather(self, query, threshold_bp, **kwargs): # find all matches and construct a CounterGather object. counter = CounterGather(prefetch_query.minhash) for result in self.prefetch(prefetch_query, threshold_bp, **kwargs): - counter.add(result.signature, result.location) + counter.add(result.signature, location=result.location) # tada! 
return counter @@ -701,13 +706,24 @@ def select(self, **kwargs): class CounterGather: - """ - Track and summarize matches for efficient 'gather' protocol. This - could be used downstream of prefetch (for example). + """This is an ancillary class that is used to implement "fast + gather", post-prefetch. It tracks and summarizes matches for + efficient min-set-cov/'gather'. + + The class constructor takes a query MinHash that must be scaled, and + then takes signatures that have overlaps with the query (via 'add'). + + After all overlapping signatures have been loaded, the 'peek' + method is then used at each stage of the 'gather' procedure to + find the best match, and the 'consume' method is used to remove + a match from this counter. - The public interface is `peek(...)` and `consume(...)` only. + This particular implementation maintains a collections.Counter that + is used to quickly find the best match when 'peek' is called, but + other implementations are possible ;). """ def __init__(self, query_mh): + "Constructor - takes a query FracMinHash." if not query_mh.scaled: raise ValueError('gather requires scaled signatures') @@ -715,17 +731,17 @@ def __init__(self, query_mh): self.orig_query_mh = query_mh.copy().flatten() self.scaled = query_mh.scaled - # track matching signatures & their locations + # use these to track loaded matches & their locations self.siglist = [] self.locations = [] - # ...and overlaps with query + # ...and also track overlaps with the progressive query self.counter = Counter() - # cannot add matches once query has started. + # fence to make sure we do not add matches once query has started. self.query_started = 0 - def add(self, ss, location=None, require_overlap=True): + def add(self, ss, *, location=None, require_overlap=True): "Add this signature in as a potential match." 
if self.query_started: raise ValueError("cannot add more signatures to counter after peek/consume") @@ -748,26 +764,11 @@ def downsample(self, scaled): "Track highest scaled across all possible matches." if scaled > self.scaled: self.scaled = scaled + return self.scaled - def calc_threshold(self, threshold_bp, scaled, query_size): - # CTB: this code doesn't need to be in this class. - threshold = 0.0 - n_threshold_hashes = 0 - - if threshold_bp: - # if we have a threshold_bp of N, then that amounts to N/scaled - # hashes: - n_threshold_hashes = float(threshold_bp) / scaled - - # that then requires the following containment: - threshold = n_threshold_hashes / query_size - - return threshold, n_threshold_hashes - - def peek(self, cur_query_mh, threshold_bp=0): + def peek(self, cur_query_mh, *, threshold_bp=0): "Get next 'gather' result for this database, w/o changing counters." self.query_started = 1 - scaled = cur_query_mh.scaled # empty? nothing to search. counter = self.counter @@ -777,25 +778,25 @@ def peek(self, cur_query_mh, threshold_bp=0): siglist = self.siglist assert siglist - self.downsample(scaled) - scaled = self.scaled + scaled = self.downsample(cur_query_mh.scaled) cur_query_mh = cur_query_mh.downsample(scaled=scaled) if not cur_query_mh: # empty query? quit. return [] + # CTB: could probably remove this check unless debug requested. if cur_query_mh.contained_by(self.orig_query_mh, downsample=True) < 1: raise ValueError("current query not a subset of original query") # are we setting a threshold? - threshold, n_threshold_hashes = self.calc_threshold(threshold_bp, - scaled, - len(cur_query_mh)) + threshold, n_threshold_hashes = calc_threshold_from_bp(threshold_bp, + scaled, + len(cur_query_mh)) # is it too high to ever match? if so, exit. if threshold > 1.0: return [] - # Find the best match - + # Find the best match using the internal Counter. 
most_common = counter.most_common() dataset_id, match_size = most_common[0] @@ -803,12 +804,13 @@ def peek(self, cur_query_mh, threshold_bp=0): if match_size < n_threshold_hashes: return [] - ## at this point, we must have a legitimate match above threshold! + ## at this point, we have a legitimate match above threshold! # pull match and location. match = siglist[dataset_id] # calculate containment + # CTB: this check is probably redundant with intersect_mh calc, below. cont = cur_query_mh.contained_by(match.minhash, downsample=True) assert cont assert cont >= threshold @@ -822,7 +824,7 @@ def peek(self, cur_query_mh, threshold_bp=0): return (IndexSearchResult(cont, match, location), intersect_mh) def consume(self, intersect_mh): - "Remove the given hashes from this counter." + "Maintain the internal counter by removing the given hashes." self.query_started = 1 if not intersect_mh: diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 5a86fa8d85..3e03951978 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -11,6 +11,25 @@ from .sketchcomparison import FracMinHashComparison, NumMinHashComparison +def calc_threshold_from_bp(threshold_bp, scaled, query_size): + """ + Convert threshold_bp (threshold in estimated bp) to + fraction of query & minimum number of hashes needed. 
+ """ + threshold = 0.0 + n_threshold_hashes = 0 + + if threshold_bp: + # if we have a threshold_bp of N, then that amounts to N/scaled + # hashes: + n_threshold_hashes = float(threshold_bp) / scaled + + # that then requires the following containment: + threshold = n_threshold_hashes / query_size + + return threshold, n_threshold_hashes + + class SearchType(Enum): JACCARD = 1 CONTAINMENT = 2 @@ -621,7 +640,7 @@ def _find_best(counters, query, threshold_bp): # find the best score across multiple counters, without consuming for counter in counters: - result = counter.peek(query.minhash, threshold_bp) + result = counter.peek(query.minhash, threshold_bp=threshold_bp) if result: (sr, intersect_mh) = result diff --git a/tests/test_index.py b/tests/test_index.py index bd216b8f32..e36275b092 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -8,7 +8,6 @@ import shutil import sourmash -from sourmash import index from sourmash import load_one_signature, SourmashSignature from sourmash.index import (LinearIndex, ZipFileLinearIndex, make_jaccard_search_query, CounterGather, @@ -1542,546 +1541,10 @@ def is_found(ss, xx): assert not is_found(ss2, results) assert is_found(ss63, results) -### -### CounterGather tests -### - -def _consume_all(query_mh, counter, threshold_bp=0): - results = [] - query_mh = query_mh.to_mutable() - - last_intersect_size = None - while 1: - result = counter.peek(query_mh, threshold_bp) - if not result: - break - - sr, intersect_mh = result - print(sr.signature.name, len(intersect_mh)) - if last_intersect_size: - assert len(intersect_mh) <= last_intersect_size - - last_intersect_size = len(intersect_mh) - - counter.consume(intersect_mh) - query_mh.remove_many(intersect_mh.hashes) - - results.append((sr, len(intersect_mh))) - - return results - - -def test_counter_gather_1(): - # check a contrived set of non-overlapping gather results, - # generated via CounterGather - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - 
query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear() - match_mh_2.add_many(range(10, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear() - match_mh_3.add_many(range(15, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - results = _consume_all(query_ss.minhash, counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_b(): - # check a contrived set of somewhat-overlapping gather results, - # generated via CounterGather. Here the overlaps are structured - # so that the gather results are the same as those in - # test_counter_gather_1(), even though the overlaps themselves are - # larger. 
- query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear() - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear() - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - results = _consume_all(query_ss.minhash, counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_c_with_threshold(): - # check a contrived set of somewhat-overlapping gather results, - # generated via CounterGather. Here the overlaps are structured - # so that the gather results are the same as those in - # test_counter_gather_1(), even though the overlaps themselves are - # larger. - # use a threshold, here. 
- - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear() - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear() - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - results = _consume_all(query_ss.minhash, counter, - threshold_bp=3) - - expected = (['match1', 10], - ['match2', 5]) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_d_diff_scaled(): - # test as above, but with different scaled. 
- query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - results = _consume_all(query_ss.minhash, counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_d_diff_scaled_query(): - # test as above, but with different scaled for QUERY. 
- query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - - match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # downsample query now - - query_ss = SourmashSignature(query_mh.downsample(scaled=100), name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - results = _consume_all(query_ss.minhash, counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_e_abund_query(): - # test as above, but abund query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear().flatten() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear().flatten() - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear().flatten() - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - 
counter.add(match_ss_3) - - # must flatten before peek! - results = _consume_all(query_ss.minhash.flatten(), counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_1_f_abund_match(): - # test as above, but abund query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh.flatten(), name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - match_mh_2 = query_mh.copy_and_clear() - match_mh_2.add_many(range(7, 15)) - match_ss_2 = SourmashSignature(match_mh_2, name='match2') - - match_mh_3 = query_mh.copy_and_clear() - match_mh_3.add_many(range(13, 17)) - match_ss_3 = SourmashSignature(match_mh_3, name='match3') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - counter.add(match_ss_2) - counter.add(match_ss_3) - - # must flatten before peek! 
- results = _consume_all(query_ss.minhash.flatten(), counter) - - expected = (['match1', 10], - ['match2', 5], - ['match3', 2],) - assert len(results) == len(expected), results - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_2(): - # check basic set of gather results on semi-real data, - # generated via CounterGather - testdata_combined = utils.get_test_data('gather/combined.sig') - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) - - query_ss = sourmash.load_one_signature(testdata_combined, ksize=21) - subject_sigs = [ (sourmash.load_one_signature(t, ksize=21), t) - for t in testdata_sigs ] - - # load up the counter - counter = CounterGather(query_ss.minhash) - for ss, loc in subject_sigs: - counter.add(ss, loc) - - results = _consume_all(query_ss.minhash, counter) - - expected = (['NC_003198.1', 487], - ['NC_000853.1', 192], - ['NC_011978.1', 169], - ['NC_002163.1', 157], - ['NC_003197.2', 152], - ['NC_009486.1', 92], - ['NC_006905.1', 76], - ['NC_011080.1', 59], - ['NC_011274.1', 42], - ['NC_006511.1', 31], - ['NC_011294.1', 7], - ['NC_004631.1', 2]) - assert len(results) == len(expected) - - for (sr, size), (exp_name, exp_size) in zip(results, expected): - sr_name = sr.signature.name.split()[0] - print(sr_name, size) - - assert sr_name == exp_name - assert size == exp_size - - -def test_counter_gather_exact_match(): - # query == match - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - results = _consume_all(query_ss.minhash, counter) - assert len(results) == 1 - (sr, intersect_mh) = results[0] - - assert sr.score == 1.0 - assert sr.signature == 
query_ss - assert sr.location == 'somewhere over the rainbow' - - -def test_counter_gather_add_after_peek(): - # cannot add after peek or consume - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - counter.peek(query_ss.minhash) - - with pytest.raises(ValueError): - counter.add(query_ss, "try again") - - -def test_counter_gather_add_after_consume(): - # cannot add after peek or consume - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - counter.consume(query_ss.minhash) - - with pytest.raises(ValueError): - counter.add(query_ss, "try again") - - -def test_counter_gather_consume_empty_intersect(): - # check that consume works fine when there is an empty signature. 
- query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - # nothing really happens here :laugh:, just making sure there's no error - counter.consume(query_ss.minhash.copy_and_clear()) - - -def test_counter_gather_empty_initial_query(): - # check empty initial query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1, require_overlap=False) - - assert counter.peek(query_ss.minhash) == [] - - -def test_counter_gather_num_query(): - # check num query - query_mh = sourmash.MinHash(n=500, ksize=31) - query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') - - with pytest.raises(ValueError): - counter = CounterGather(query_ss.minhash) - - -def test_counter_gather_empty_cur_query(): - # test empty cur query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - cur_query_mh = query_ss.minhash.copy_and_clear() - results = _consume_all(cur_query_mh, counter) - assert results == [] - - -def test_counter_gather_add_num_matchy(): - # test add num query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh = sourmash.MinHash(n=500, ksize=31) - match_mh.add_many(range(0, 20)) - match_ss = SourmashSignature(match_mh, name='query') - - # 
load up the counter - counter = CounterGather(query_ss.minhash) - with pytest.raises(ValueError): - counter.add(match_ss, 'somewhere over the rainbow') - - -def test_counter_gather_bad_cur_query(): - # test cur query that is not subset of original query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(query_ss, 'somewhere over the rainbow') - - cur_query_mh = query_ss.minhash.copy_and_clear() - cur_query_mh.add_many(range(20, 30)) - with pytest.raises(ValueError): - counter.peek(cur_query_mh) - - -def test_counter_gather_add_no_overlap(): - # check adding match with no overlap w/query - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 10)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(10, 20)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - # load up the counter - counter = CounterGather(query_ss.minhash) - with pytest.raises(ValueError): - counter.add(match_ss_1) - - assert counter.peek(query_ss.minhash) == [] - - -def test_counter_gather_big_threshold(): - # check 'peek' with a huge threshold - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_mh.add_many(range(0, 20)) - query_ss = SourmashSignature(query_mh, name='query') - - match_mh_1 = query_mh.copy_and_clear() - match_mh_1.add_many(range(0, 10)) - match_ss_1 = SourmashSignature(match_mh_1, name='match1') - - # load up the counter - counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1) - - # impossible threshold: - threshold_bp=30*query_ss.minhash.scaled - results = counter.peek(query_ss.minhash, threshold_bp=threshold_bp) - assert results == [] - - -def test_counter_gather_empty_counter(): - # check empty counter - query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) - query_ss = 
SourmashSignature(query_mh, name='query') - - # empty counter! - counter = CounterGather(query_ss.minhash) - - assert counter.peek(query_ss.minhash) == [] - - -def test_counter_gather_3_test_consume(): - # open-box testing of consume(...) +def test_counter_gather_test_consume(): + # open-box testing of CounterGather.consume(...) + # (see test_index_protocol.py for generic CounterGather tests.) query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) query_mh.add_many(range(0, 20)) query_ss = SourmashSignature(query_mh, name='query') @@ -2100,9 +1563,9 @@ def test_counter_gather_3_test_consume(): # load up the counter counter = CounterGather(query_ss.minhash) - counter.add(match_ss_1, 'loc a') - counter.add(match_ss_2, 'loc b') - counter.add(match_ss_3, 'loc c') + counter.add(match_ss_1, location='loc a') + counter.add(match_ss_2, location='loc b') + counter.add(match_ss_3, location='loc c') ### ok, dig into actual counts... import pprint diff --git a/tests/test_index_protocol.py b/tests/test_index_protocol.py index 15fd70aad0..19f27788c8 100644 --- a/tests/test_index_protocol.py +++ b/tests/test_index_protocol.py @@ -4,12 +4,14 @@ """ import pytest +import glob import sourmash from sourmash import SourmashSignature from sourmash.index import (LinearIndex, ZipFileLinearIndex, LazyLinearIndex, MultiIndex, StandaloneManifestIndex) +from sourmash.index import CounterGather from sourmash.index.sqlite_index import SqliteIndex from sourmash.index.revindex import RevIndex from sourmash.sbt import SBT, GraphFactory @@ -128,14 +130,6 @@ def build_lca_index_save_load(runtmp): return sourmash.load_file_as_index(outfile) -def build_lca_index_save_load(runtmp): - db = build_lca_index(runtmp) - outfile = runtmp.output('db.lca.json') - db.save(outfile) - - return sourmash.load_file_as_index(outfile) - - def build_sqlite_index(runtmp): filename = runtmp.output('idx.sqldb') db = SqliteIndex.create(filename) @@ -463,3 +457,667 @@ def test_gather_threshold_5(index_obj): containment, 
match_sig, name = results[0] assert containment == 1.0 assert match_sig.minhash == ss2.minhash + + +### +### CounterGather tests +### + + +def create_basic_counter_gather(runtmp): + "Construct a CounterGather class." + return CounterGather + + +class CounterGather_LinearIndex: + """ + Provides an (inefficient) CounterGather-style class, for + protocol testing purposes. + """ + def __init__(self, orig_query_mh): + "Constructor - take a FracMinHash that is the original query." + if orig_query_mh.scaled == 0: + raise ValueError + + # Index object used to actually track matches. + self.idx = LinearIndex() + self.orig_query_mh = orig_query_mh.copy().flatten() + self.query_started = 0 + self.scaled = orig_query_mh.scaled + self.locations = {} + + def add(self, ss, *, location=None, require_overlap=True): + "Insert potential match." + if self.query_started: + raise ValueError("cannot add more signatures to counter after peek/consume") + + # skip duplicates + md5 = ss.md5sum() + if md5 in self.locations: + return + + # confirm that this match has an overlap... + add_mh = ss.minhash.flatten() + overlap = self.orig_query_mh.count_common(add_mh, downsample=True) + + # ...figure out what scaled we are operating at now... + if overlap: + self.downsample(add_mh.scaled) + elif require_overlap: + raise ValueError("no overlap between query and signature!?") + + # ...and add to the Index, while also tracking location! + self.idx.insert(ss) + self.locations[md5] = location + + def downsample(self, scaled): + "Track highest scaled across all possible matches." + if scaled > self.scaled: + self.scaled = scaled + return self.scaled + + def peek(self, cur_query_mh, *, threshold_bp=0): + """ + Find best match to current query within this CounterGather object. + """ + self.query_started = 1 + cur_query_mh = cur_query_mh.flatten() + scaled = self.downsample(cur_query_mh.scaled) + cur_query_mh = cur_query_mh.downsample(scaled=scaled) + + # no match? exit. 
+ if not self.orig_query_mh or not cur_query_mh: + return [] + + # verify current query is a subset of the original. + if cur_query_mh.contained_by(self.orig_query_mh, downsample=True) < 1: + raise ValueError("current query not a subset of original query") + + # did we get a match? + res = self.idx.peek(cur_query_mh, threshold_bp=threshold_bp) + if not res: + return [] + sr, intersect_mh = res + + from sourmash.index import IndexSearchResult + match = sr.signature + md5 = match.md5sum() + location = self.locations[md5] + new_sr = IndexSearchResult(sr.score, match, location) + return new_sr, intersect_mh + + def consume(self, *args, **kwargs): + self.query_started = 1 + return self.idx.consume(*args, **kwargs) + + +@pytest.fixture(params=[CounterGather, + CounterGather_LinearIndex, + ] +) +def counter_gather_constructor(request): + build_fn = request.param + + # build on demand + return build_fn + + +def _consume_all(query_mh, counter, threshold_bp=0): + results = [] + query_mh = query_mh.to_mutable() + + last_intersect_size = None + while 1: + result = counter.peek(query_mh, threshold_bp=threshold_bp) + if not result: + break + + sr, intersect_mh = result + print(sr.signature.name, len(intersect_mh)) + if last_intersect_size: + assert len(intersect_mh) <= last_intersect_size + + last_intersect_size = len(intersect_mh) + + counter.consume(intersect_mh) + query_mh.remove_many(intersect_mh.hashes) + + results.append((sr, len(intersect_mh))) + + return results + + +def test_counter_gather_1(counter_gather_constructor): + # check a contrived set of non-overlapping gather results, + # generated via CounterGather + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear() + match_mh_2.add_many(range(10, 15)) + 
match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear() + match_mh_3.add_many(range(15, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + results = _consume_all(query_ss.minhash, counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_b(counter_gather_constructor): + # check a contrived set of somewhat-overlapping gather results, + # generated via CounterGather. Here the overlaps are structured + # so that the gather results are the same as those in + # test_counter_gather_1(), even though the overlaps themselves are + # larger. 
+ query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear() + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear() + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + results = _consume_all(query_ss.minhash, counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_c_with_threshold(counter_gather_constructor): + # check a contrived set of somewhat-overlapping gather results, + # generated via CounterGather. Here the overlaps are structured + # so that the gather results are the same as those in + # test_counter_gather_1(), even though the overlaps themselves are + # larger. + # use a threshold, here. 
+ + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear() + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear() + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + results = _consume_all(query_ss.minhash, counter, + threshold_bp=3) + + expected = (['match1', 10], + ['match2', 5]) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_d_diff_scaled(counter_gather_constructor): + # test as above, but with different scaled. 
+ query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + results = _consume_all(query_ss.minhash, counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_d_diff_scaled_query(counter_gather_constructor): + # test as above, but with different scaled for QUERY. 
+ query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + + match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10) + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20) + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30) + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # downsample query now - + query_ss = SourmashSignature(query_mh.downsample(scaled=100), name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + results = _consume_all(query_ss.minhash, counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_e_abund_query(counter_gather_constructor): + # test as above, but abund query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear().flatten() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear().flatten() + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear().flatten() + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + 
counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + # must flatten before peek! + results = _consume_all(query_ss.minhash.flatten(), counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_1_f_abund_match(counter_gather_constructor): + # test as above, but abund query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh.flatten(), name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + match_mh_2 = query_mh.copy_and_clear() + match_mh_2.add_many(range(7, 15)) + match_ss_2 = SourmashSignature(match_mh_2, name='match2') + + match_mh_3 = query_mh.copy_and_clear() + match_mh_3.add_many(range(13, 17)) + match_ss_3 = SourmashSignature(match_mh_3, name='match3') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + counter.add(match_ss_2) + counter.add(match_ss_3) + + # must flatten before peek! 
+ results = _consume_all(query_ss.minhash.flatten(), counter) + + expected = (['match1', 10], + ['match2', 5], + ['match3', 2],) + assert len(results) == len(expected), results + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_2(counter_gather_constructor): + # check basic set of gather results on semi-real data, + # generated via CounterGather + testdata_combined = utils.get_test_data('gather/combined.sig') + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_ss = sourmash.load_one_signature(testdata_combined, ksize=21) + subject_sigs = [ (sourmash.load_one_signature(t, ksize=21), t) + for t in testdata_sigs ] + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + for ss, loc in subject_sigs: + counter.add(ss, location=loc) + + results = _consume_all(query_ss.minhash, counter) + + expected = (['NC_003198.1', 487], + ['NC_000853.1', 192], + ['NC_011978.1', 169], + ['NC_002163.1', 157], + ['NC_003197.2', 152], + ['NC_009486.1', 92], + ['NC_006905.1', 76], + ['NC_011080.1', 59], + ['NC_011274.1', 42], + ['NC_006511.1', 31], + ['NC_011294.1', 7], + ['NC_004631.1', 2]) + assert len(results) == len(expected) + + for (sr, size), (exp_name, exp_size) in zip(results, expected): + sr_name = sr.signature.name.split()[0] + print(sr_name, size) + + assert sr_name == exp_name + assert size == exp_size + + +def test_counter_gather_exact_match(counter_gather_constructor): + # query == match + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # load up the counter; provide a location override, too. 
+ counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + results = _consume_all(query_ss.minhash, counter) + assert len(results) == 1 + (sr, intersect_mh) = results[0] + + assert sr.score == 1.0 + assert sr.signature == query_ss + assert sr.location == 'somewhere over the rainbow' + + +def test_counter_gather_multiple_identical_matches(counter_gather_constructor): + # test multiple identical matches being inserted, with only one return + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # create counter... + counter = counter_gather_constructor(query_ss.minhash) + + # now add multiple identical matches. + match_mh = query_mh.copy_and_clear() + match_mh.add_many(range(5, 15)) + + for name in 'match1', 'match2', 'match3': + match_ss = SourmashSignature(match_mh, name=name) + counter.add(match_ss, location=name) + + results = _consume_all(query_ss.minhash, counter) + assert len(results) == 1 + + sr, overlap_count = results[0] + assert sr.score == 0.5 + assert overlap_count == 10 + + # any one of the three is valid + assert sr.location in ('match1', 'match2', 'match3') + + +def test_counter_gather_add_after_peek(counter_gather_constructor): + # cannot add after peek or consume + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + counter.peek(query_ss.minhash) + + with pytest.raises(ValueError): + counter.add(query_ss, location="try again") + + +def test_counter_gather_add_after_consume(counter_gather_constructor): + # cannot add after peek or consume + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = 
SourmashSignature(query_mh, name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + counter.consume(query_ss.minhash) + + with pytest.raises(ValueError): + counter.add(query_ss, location="try again") + + +def test_counter_gather_consume_empty_intersect(counter_gather_constructor): + # check that consume works fine when there is an empty signature. + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + # nothing really happens here :laugh:, just making sure there's no error + counter.consume(query_ss.minhash.copy_and_clear()) + + +def test_counter_gather_empty_initial_query(counter_gather_constructor): + # check empty initial query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1, require_overlap=False) + + assert counter.peek(query_ss.minhash) == [] + + +def test_counter_gather_num_query(counter_gather_constructor): + # check num query + query_mh = sourmash.MinHash(n=500, ksize=31) + query_mh.add_many(range(0, 10)) + query_ss = SourmashSignature(query_mh, name='query') + + with pytest.raises(ValueError): + counter_gather_constructor(query_ss.minhash) + + +def test_counter_gather_empty_cur_query(counter_gather_constructor): + # test empty cur query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # load up the counter + 
counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + cur_query_mh = query_ss.minhash.copy_and_clear() + results = _consume_all(cur_query_mh, counter) + assert results == [] + + +def test_counter_gather_add_num_matchy(counter_gather_constructor): + # test add num query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh = sourmash.MinHash(n=500, ksize=31) + match_mh.add_many(range(0, 20)) + match_ss = SourmashSignature(match_mh, name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + with pytest.raises(ValueError): + counter.add(match_ss, location='somewhere over the rainbow') + + +def test_counter_gather_bad_cur_query(counter_gather_constructor): + # test cur query that is not subset of original query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(query_ss, location='somewhere over the rainbow') + + cur_query_mh = query_ss.minhash.copy_and_clear() + cur_query_mh.add_many(range(20, 30)) + with pytest.raises(ValueError): + counter.peek(cur_query_mh) + + +def test_counter_gather_add_no_overlap(counter_gather_constructor): + # check adding match with no overlap w/query + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 10)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(10, 20)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + with pytest.raises(ValueError): + counter.add(match_ss_1) + + assert counter.peek(query_ss.minhash) == [] + + +def 
test_counter_gather_big_threshold(counter_gather_constructor): + # check 'peek' with a huge threshold + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_mh.add_many(range(0, 20)) + query_ss = SourmashSignature(query_mh, name='query') + + match_mh_1 = query_mh.copy_and_clear() + match_mh_1.add_many(range(0, 10)) + match_ss_1 = SourmashSignature(match_mh_1, name='match1') + + # load up the counter + counter = counter_gather_constructor(query_ss.minhash) + counter.add(match_ss_1) + + # impossible threshold: + threshold_bp=30*query_ss.minhash.scaled + results = counter.peek(query_ss.minhash, threshold_bp=threshold_bp) + assert results == [] + + +def test_counter_gather_empty_counter(counter_gather_constructor): + # check empty counter + query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1) + query_ss = SourmashSignature(query_mh, name='query') + + # empty counter! + counter = counter_gather_constructor(query_ss.minhash) + + assert counter.peek(query_ss.minhash) == []