From 6f1caf54673eb3c932146e80d36d4442abc7d97e Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Wed, 12 May 2021 15:06:31 -0700
Subject: [PATCH] Refactor the `gather_databases` function to use `MinHash` API over Python set API. (#1512)

* make 'find' a generator
* remove comment
* begin refactoring 'categorize'
* have the 'find' function for SBTs return signatures
* fix majority of tests
* comment & then fix test
* torture the tests into working
* split find and _find_nodes to take different kinds of functions
* redo 'find' on index
* refactor lca_db to use new find
* refactor SBT to use new find
* comment/cleanup
* refactor out common code
* fix up gather
* use 'passes' properly
* attempted cleanup
* minor fixes
* get a start on correct downsampling
* adjust tree downsampling for regular minhashes, too
* remove now-unused search functions in sbtmh
* refactor categorize to use new find
* cleanup and removal
* remove redundant code in lca_db
* remove redundant code in SBT
* add notes
* remove more unused code
* refactor most of the test_sbt tests
* fix one minor issue
* fix jaccard calculation in sbt
* check for compatibility of search fn and query signature
* switch tests over to jaccard similarity, not containment
* fix test
* remove test for unimplemented LCA_Database.find method
* document threshold change; update test
* refuse to run abund signatures
* flatten sigs internally for gather
* reinflate abundances for saving
* fix problem where sbt indices could be created with abund signatures
* more
* split flat and abund search
* make ignore_abundance work again for categorize
* turn off best-only, since it triggers on self-hits.
* add test: 'sourmash index' flattens sigs
* add note about something to test
* fix typo; still broken tho
* location is now a property
* move search code into search.py
* remove redundant scaled checking code
* best-only now works properly for two tests
* 'fix' tests by removing v1 and v2 SBT compatibility
* simplify (?) downsampling code
* require keyword args in MinHash.downsample(...)
* fix bug with downsample
* require keyword args in MinHash.downsample(...)
* fix test to use proper downsampling, reverse order to match scaled
* add test for revealed bug
* remove unnecessary comment
* flatten subject MinHash, too
* add testme comment
* clean up sbt find
* clean up lca find
* add IndexSearchResult namedtuple for search and gather results
* add more tests for Index classes
* add tests for subj & query num downsampling
* tests for Index.search_abund
* refactor a bit
* refactor make_jaccard_search_query; start tests
* even more tests
* test collect, best_only
* more search tests
* remove unnec space
* add minor comment
* deal with status == None on SystemExit
* upgrade and simplify categorize
* restore test
* merge
* fix abundance search in SBT for categorize
* code cleanup and refactoring; check for proper error messages
* add explicit test for incompatible num
* refactor MinHash.downsample
* deal with status == None on SystemExit
* fix test
* fix comment misspelling
* properly pass kwargs; fix search_sbt_index
* add simple tests for SBT load and search API
* allow arbitrary kwargs for LCA_Database.find
* add testing of passthru-kwargs
* re-enable test
* add notes to update docstrings
* docstring updates
* fix test
* fix location reporting in prefetch
* fix prefetch location by fixing MultiIndex
* temporary prefetch_gather intervention
* 'gather' only returns best match
* turn prefetch on by default, for now
* better tests for gather --save-unassigned
* remove unused print
* remove unnecessary check-me comment
* clear out docstring
* SBT search doesn't work on v1 and v2 SBTs b/c no min_n_below
* start adding tests
* test some basic prefetch stuff
* update index for prefetch
* add fairly thorough tests
* fix my dumb mistake with gather
* simplify, refactor, fix
* fix remaining tests
* propagate ValueErrors better
* fix tests
* flatten prefetch queries
* fix for genome-grist alpha test
* fix threshold bugarooni
* fix gather/prefetch interactions
* fix sourmash prefetch return value
* minor fixes
* pay proper attention to threshold
* cleanup and refactoring
* remove unnecessary 'scaled'
* minor cleanup
* added LazyLinearIndex and prefetch --linear
* fix abundance problem
* save matches to a directory
* test for saving matches to a directory
* add a flexible progressive signature output class
* add tests for .sig.gz and .zip outputs
* update save_signatures code; add tests; use in gather and search too
* update comment
* cleanup and refactor of SaveSignaturesToLocation code
* docstrings & cleanup
* add 'run' and 'runtmp' test fixtures
* remove unnecessary track_abundance fixture call
* restore original;
* linear and prefetch fixtures + runtmp
* fix use of runtmp
* copy over SaveSignaturesToLocation code from other branch
* docs for sourmash prefetch
* more doc
* minor edits
* Re-implement the actual gather protocol with a cleaner interface. (#1489)
* initial refactor of CounterGather stuff
* refactor into peek and consume
* move next method over to query specific class
* replace gather implementation with new CounterGather
* many more tests for CounterGather
* remove scaled arg from peek
* open-box test for counter internal data structures
* add num query & subj tests
* add repr; add tests; support stdout
* refactor signature saving to use new sourmash_args collection saving
* specify utf-8 encoding for output
* add flexible output to compute/sketch
* add test to trigger rust panic
* test search --save-matches
* add --save-prefetch to sourmash gather
* remove --no-prefetch option :)
* added --save-prefetch functionality
* add back a mostly-functioning --no-prefetch argument :)
* add --no-prefetch back in
* check for JSON in first byte of LCA DB file
* start adding linear tests
* use fixtures to test prefetch and linear more thoroughly
* comments, etc
* upgrade docs for --linear and --prefetch
* 'fix' issue and test
* fix a last test ;)
* Update doc/command-line.md Co-authored-by: Tessa Pierce Ward
* Update src/sourmash/cli/sig/rename.py Co-authored-by: Tessa Pierce Ward
* Update tests/test_sourmash_args.py Co-authored-by: Tessa Pierce Ward
* Update tests/test_sourmash_args.py Co-authored-by: Tessa Pierce Ward
* Update tests/test_sourmash_args.py Co-authored-by: Tessa Pierce Ward
* Update tests/test_sourmash_args.py Co-authored-by: Tessa Pierce Ward
* Update tests/test_sourmash_args.py Co-authored-by: Tessa Pierce Ward
* Update doc/command-line.md Co-authored-by: Tessa Pierce Ward
* write tests for LazyLinearIndex
* add some basic prefetch tests
* properly test linear!
* add more tests for LazyLinearIndex
* test zipfile bool
* remove unnecessary try/except; comment
* fix signatures() call
* fix --prefetch snafu; doc
* do not overwrite signature even if duplicate md5sum (#1497)
* try adding loc to return values from Index.find
* made use of new IndexSearchResult.find throughout
* adjust note
* provide signatures_with_location on all Index objects
* cleanup and fix
* Update doc/command-line.md Co-authored-by: Tessa Pierce Ward
* Update doc/command-line.md Co-authored-by: Tessa Pierce Ward
* fix bug around --save-prefetch with multiple databases
* comment/doc minor updates
* move away from Python sets to MinHash objects
* return intersect_mh from _find_best
* put _subtract_and_downsample inline
* clean up and remove old code
* remove max_hash
* more cleanup

Co-authored-by: Luiz Irber
Co-authored-by: Tessa Pierce Ward
---
 src/sourmash/commands.py |  8 ++---
 src/sourmash/search.py   | 74 +++++++++++++++------------------------
 2 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index c5ea6cb4aa..34eefaf8b8 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -680,12 +680,11 @@ def gather(args):
     weighted_missed = 1
     is_abundance = query.minhash.track_abundance and not args.ignore_abundance
     orig_query_mh = query.minhash
-    new_max_hash = query.minhash._max_hash
     next_query = query
 
     gather_iter = gather_databases(query, counters,
                                    args.threshold_bp, args.ignore_abundance)
-    for result, weighted_missed, new_max_hash, next_query in gather_iter:
+    for result, weighted_missed, next_query in gather_iter:
         if not len(found):                # first result? print header.
             if is_abundance:
                 print_results("")
@@ -842,7 +841,7 @@ def multigather(args):
         found = []
         weighted_missed = 1
         is_abundance = query.minhash.track_abundance and not args.ignore_abundance
-        for result, weighted_missed, new_max_hash, next_query in gather_databases(query, counters, args.threshold_bp, args.ignore_abundance):
+        for result, weighted_missed, next_query in gather_databases(query, counters, args.threshold_bp, args.ignore_abundance):
             if not len(found):                # first result? print header.
                 if is_abundance:
                     print_results("")
@@ -919,7 +918,8 @@ def multigather(args):
             else:
                 notify('saving unassigned hashes to "{}"', output_unassigned)
 
-                e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
+                e = MinHash(ksize=query.minhash.ksize, n=0,
+                            scaled=next_query.minhash.scaled)
                 e.add_many(next_query.minhash.hashes)
                 # CTB: note, multigather does not save abundances
                 sig.save_signatures([ sig.SourmashSignature(e) ], fp)
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 8b1093719c..ad8e5a2a54 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -245,15 +245,6 @@ def search_databases_with_abund_query(query, databases, **kwargs):
                           'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp')
 
 
-# build a new query object, subtracting found mins and downsampling
-def _subtract_and_downsample(to_remove, old_query, scaled=None):
-    mh = old_query.minhash
-    mh = mh.downsample(scaled=scaled)
-    mh.remove_many(to_remove)
-
-    return SourmashSignature(mh)
-
-
 def _find_best(counters, query, threshold_bp):
     """
     Search for the best containment, return precisely one match.
@@ -279,16 +270,8 @@ def _find_best(counters, query, threshold_bp):
             counter.consume(best_intersect_mh)
 
         # and done!
-        return best_result
-    return None
-
-
-def _filter_max_hash(values, max_hash):
-    results = set()
-    for v in values:
-        if v < max_hash:
-            results.add(v)
-    return results
+        return best_result, best_intersect_mh
+    return None, None
 
 
 def gather_databases(query, counters, threshold_bp, ignore_abundance):
@@ -299,21 +282,21 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
     # track original query information for later usage.
     track_abundance = query.minhash.track_abundance and not ignore_abundance
     orig_query_mh = query.minhash
-    orig_query_hashes = set(orig_query_mh.hashes)
 
     # do we pay attention to abundances?
-    orig_query_abunds = { k: 1 for k in orig_query_hashes }
+    orig_query_abunds = { k: 1 for k in orig_query_mh.hashes }
    if track_abundance:
         import numpy as np
         orig_query_abunds = orig_query_mh.hashes
 
+    orig_query_mh = orig_query_mh.flatten()
     query.minhash = query.minhash.flatten()
 
     cmp_scaled = query.minhash.scaled    # initialize with resolution of query
 
     result_n = 0
     while query.minhash:
         # find the best match!
-        best_result = _find_best(counters, query, threshold_bp)
+        best_result, intersect_mh = _find_best(counters, query, threshold_bp)
         if not best_result:          # no matches at all for this cutoff!
             notify(f'found less than {format_bp(threshold_bp)} in common. => exiting')
@@ -322,10 +305,6 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
         best_match = best_result.signature
         filename = best_result.location
 
-        # subtract found hashes from search hashes, construct new search
-        query_hashes = set(query.minhash.hashes)
-        found_hashes = set(best_match.minhash.hashes)
-
         # Is the best match computed with scaled? Die if not.
         match_scaled = best_match.minhash.scaled
         assert match_scaled
 
@@ -336,39 +315,37 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
         # eliminate hashes under this new resolution.
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
-        new_max_hash = _get_max_hash_for_scaled(cmp_scaled)
-        query_hashes = _filter_max_hash(query_hashes, new_max_hash)
-        found_hashes = _filter_max_hash(found_hashes, new_max_hash)
-        orig_query_hashes = _filter_max_hash(orig_query_hashes, new_max_hash)
-        sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_hashes))
+        query_mh = query.minhash.downsample(scaled=cmp_scaled)
+        found_mh = best_match.minhash.downsample(scaled=cmp_scaled)
+        orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
+        sum_abunds = sum(( orig_query_abunds[k] for k in orig_query_mh.hashes ))
 
         # calculate intersection with query hashes:
-        intersect_hashes = query_hashes.intersection(found_hashes)
-        unique_intersect_bp = cmp_scaled * len(intersect_hashes)
-        intersect_orig_hashes = orig_query_hashes.intersection(found_hashes)
-        intersect_bp = cmp_scaled * len(intersect_orig_hashes)
+        unique_intersect_bp = cmp_scaled * len(intersect_mh)
+        intersect_orig_mh = orig_query_mh.intersection(found_mh)
+        intersect_bp = cmp_scaled * len(intersect_orig_mh)
 
         # calculate fractions wrt first denominator - genome size
-        assert intersect_hashes.issubset(found_hashes)
-        f_match = len(intersect_hashes) / len(found_hashes)
-        f_orig_query = len(intersect_orig_hashes) / len(orig_query_hashes)
+        assert intersect_mh.contained_by(found_mh) == 1.0
+        f_match = len(intersect_mh) / len(found_mh)
+        f_orig_query = len(intersect_orig_mh) / len(orig_query_mh)
 
         # calculate fractions wrt second denominator - metagenome size
-        assert intersect_hashes.issubset(orig_query_hashes)
-        f_unique_to_query = len(intersect_hashes) / len(orig_query_hashes)
+        assert intersect_mh.contained_by(orig_query_mh) == 1.0
+        f_unique_to_query = len(intersect_mh) / len(orig_query_mh)
 
         # calculate fraction of subject match with orig query
         f_match_orig = best_match.minhash.contained_by(orig_query_mh,
                                                        downsample=True)
 
         # calculate scores weighted by abundances
-        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_hashes))
+        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes ))
         f_unique_weighted /= sum_abunds
 
         # calculate stats on abundances, if desired.
         average_abund, median_abund, std_abund = None, None, None
         if track_abundance:
-            intersect_abunds = (orig_query_abunds[k] for k in intersect_hashes)
+            intersect_abunds = (orig_query_abunds[k] for k in intersect_mh.hashes )
             intersect_abunds = list(intersect_abunds)
 
             average_abund = np.mean(intersect_abunds)
@@ -376,11 +353,14 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
             std_abund = np.std(intersect_abunds)
 
         # construct a new query, subtracting hashes found in previous one.
-        query = _subtract_and_downsample(found_hashes, query, cmp_scaled)
-        remaining_bp = cmp_scaled * len(query.minhash.hashes)
+        new_query_mh = query.minhash.downsample(scaled=cmp_scaled)
+        new_query_mh.remove_many(set(found_mh.hashes))
+        new_query = SourmashSignature(new_query_mh)
+
+        remaining_bp = cmp_scaled * len(new_query_mh)
 
         # compute weighted_missed:
-        query_hashes -= set(found_hashes)
+        query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
         weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) \
              / sum_abunds
 
@@ -403,7 +383,9 @@ def gather_databases(query, counters, threshold_bp, ignore_abundance):
                               remaining_bp=remaining_bp)
         result_n += 1
 
-        yield result, weighted_missed, new_max_hash, query
+        yield result, weighted_missed, new_query
+
+        query = new_query
 
 

###
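Note (reviewer aside, not part of the patch): the refactor above replaces `set(mh.hashes)` arithmetic with direct `MinHash` operations, and `gather_databases()` now yields 3-tuples `(result, weighted_missed, next_query)` instead of 4-tuples carrying `new_max_hash`. Below is a minimal, self-contained sketch of the `MinHash` calls the new loop relies on (`downsample`, `intersection`, `contained_by`, `remove_many`); the hash values and the `ksize`/`scaled` parameters are made up for illustration and are not taken from the patch.

```python
# Hypothetical example of the MinHash-based operations used by the
# refactored gather_databases() loop; hash values are arbitrary.
from sourmash import MinHash, SourmashSignature

# a "metagenome" query and one matching "genome", as scaled sketches
query_mh = MinHash(n=0, ksize=31, scaled=10)
query_mh.add_many([10, 20, 30, 40, 50, 60])

match_mh = MinHash(n=0, ksize=31, scaled=10)
match_mh.add_many([20, 40, 60, 80])

# compare at a common resolution, like cmp_scaled in the patch
cmp_scaled = max(query_mh.scaled, match_mh.scaled)
query_dn = query_mh.downsample(scaled=cmp_scaled)
found_dn = match_mh.downsample(scaled=cmp_scaled)

# intersection() + len() replace set intersection and len(set)
intersect_mh = query_dn.intersection(found_dn)
unique_intersect_bp = cmp_scaled * len(intersect_mh)

# contained_by() replaces the old set.issubset() assertions
assert intersect_mh.contained_by(found_dn) == 1.0
f_match = len(intersect_mh) / len(found_dn)

# remove_many() subtracts the found hashes to build the next query,
# mirroring new_query_mh / new_query in the loop (on newer sourmash
# releases the downsampled object may be frozen and need copying first)
query_dn.remove_many(set(found_dn.hashes))
next_query = SourmashSignature(query_dn)

print(f"{unique_intersect_bp} bp unique overlap, f_match={f_match:.2f}, "
      f"{len(query_dn)} query hashes remaining")
```

The same change shows up on the consumer side in commands.py: both `gather` and `multigather` now unpack three values per iteration and take the final scaled value from `next_query.minhash.scaled` rather than tracking `new_max_hash` separately.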