From 940866df85eb9a64cf235486c34461594cbd8d29 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Wed, 27 Apr 2022 11:14:42 -0700
Subject: [PATCH 1/9] init with tax code from #1788

---
 src/sourmash/tax/__main__.py             |  30 +++--
 src/sourmash/tax/tax_utils.py            | 150 ++++++++++-------------
 tests/test-data/tax/test1.gather_ani.csv |   5 +
 tests/test_tax.py                        | 118 +++++++++++++-----
 tests/test_tax_utils.py                  | 106 +++++++++-------
 5 files changed, 242 insertions(+), 167 deletions(-)
 create mode 100644 tests/test-data/tax/test1.gather_ani.csv

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 01fc6bbe1c..1cee174eb1 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -184,7 +184,8 @@ def genome(args):
             best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed,
                                                      keep_full_identifiers=args.keep_full_identifiers,
                                                      keep_identifier_versions = args.keep_identifier_versions,
-                                                     best_only=True, seen_perfect=seen_perfect)
+                                                     best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True)
+
         except ValueError as exc:
             error(f"ERROR: {str(exc)}")
             sys.exit(-1)
@@ -194,19 +195,22 @@ def genome(args):
             status = 'nomatch'
             if sg.query_name in matched_queries:
                 continue
-            if sg.fraction <= args.containment_threshold:
+            if args.ani_threshold and sg.query_ani_at_rank < args.ani_threshold:
+                status="below_threshold"
+                notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet query ANI/AAI threshold {args.ani_threshold}")
+            elif sg.fraction <= args.containment_threshold: # should this just be less than?
                 status="below_threshold"
                 notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}")
             else:
                 status="match"
-            classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank)
+            classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank)
             classifications[args.rank].append(classif)
             matched_queries.add(sg.query_name)
             if "krona" in args.output_format:
                 lin_list = display_lineage(sg.lineage).split(';')
                 krona_results.append((sg.fraction, *lin_list))
     else:
-        # classify to the match that passes the containment threshold.
+        # classify to the rank/match that passes the containment threshold.
         # To do - do we want to store anything for this match if nothing >= containment threshold?
         for rank in tax_utils.ascending_taxlist(include_strain=False):
             # gets best_at_rank for all queries in this gather_csv
@@ -214,7 +218,7 @@ def genome(args):
                 best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
                                                          keep_full_identifiers=args.keep_full_identifiers,
                                                          keep_identifier_versions = args.keep_identifier_versions,
-                                                         best_only=True, seen_perfect=seen_perfect)
+                                                         best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True)
             except ValueError as exc:
                 error(f"ERROR: {str(exc)}")
                 sys.exit(-1)
@@ -223,18 +227,26 @@ def genome(args):
                 status = 'nomatch'
                 if sg.query_name in matched_queries:
                     continue
-                if sg.fraction >= args.containment_threshold:
+                if args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold:
+                    status="match"
+                elif sg.fraction >= args.containment_threshold:
                     status = "match"
-                    classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank)
+                if status == "match":
+                    classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank,
+                                                    fraction=sg.fraction, lineage=sg.lineage,
+                                                    query_md5=sg.query_md5, query_filename=sg.query_filename,
+                                                    f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank,
+                                                    query_ani_at_rank= sg.query_ani_at_rank)
                     classifications[sg.rank].append(classif)
                     matched_queries.add(sg.query_name)
                     continue
-                if rank == "superkingdom" and status == "nomatch":
+                elif rank == "superkingdom" and status == "nomatch":
                     status="below_threshold"
                     classif = ClassificationResult(query_name=sg.query_name, status=status,
                                                    rank="", fraction=0, lineage="",
                                                    query_md5=sg.query_md5, query_filename=sg.query_filename,
-                                                   f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank)
+                                                   f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, 
+                                                   query_ani_at_rank=sg.query_ani_at_rank)
                     classifications[sg.rank].append(classif)
 
     if not any([classifications, krona_results]):
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index d5a7161afc..0c69eeaf55 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -5,12 +5,7 @@
 import csv
 from collections import namedtuple, defaultdict
 from collections import abc
-
-from sourmash import sqlite_utils
-from sourmash.exceptions import IndexNotSupported
-
-import sqlite3
-
+from sourmash.distance_utils import containment_to_distance
 
 __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
            'load_gather_results', 'check_and_load_gather_csvs',
@@ -24,9 +19,9 @@
 from sourmash.logging import notify
 from sourmash.sourmash_args import load_pathlist_from_file
 
-QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp")
-SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank")
-ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank")
+QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes")
+SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank")
+ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank")
 
 # Essential Gather column names that must be in gather_csv to allow `tax` summarization
 EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename')
@@ -188,7 +183,7 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [],
 def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                         keep_full_identifiers=False,
                         keep_identifier_versions=False, best_only=False,
-                        seen_perfect=set()):
+                        seen_perfect=set(), estimate_query_ani=False):
     """
     Summarize gather results at specified taxonomic rank
     """
@@ -198,7 +193,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
     sum_uniq_to_query = defaultdict(lambda: defaultdict(float))
     sum_uniq_bp = defaultdict(lambda: defaultdict(float))
     query_info = {}
-
+    ksize,scaled,query_nhashes=None,None,None
     for row in gather_results:
         # get essential gather info
         query_name = row['query_name']
@@ -207,13 +202,25 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
         unique_intersect_bp = int(row['unique_intersect_bp'])
         query_md5 = row['query_md5']
         query_filename = row['query_filename']
-        # get query_bp
-        if query_name not in query_info.keys():
-            query_bp = unique_intersect_bp + int(row['remaining_bp'])
+        if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug?
+            if "query_nhashes" in row.keys():
+                query_nhashes = int(row["query_nhashes"])
+            if "query_bp" in row.keys():
+                query_bp = int(row["query_bp"])
+            else:
+                query_bp = unique_intersect_bp + int(row['remaining_bp'])
         # store query info
-        query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp)
-        match_ident = row['name']
+        query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes)
+
+        if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file?
+            if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR
+                ksize = int(row['ksize'])
+                scaled = int(row['scaled'])
+            else:
+                estimate_query_ani=False
+                notify("WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...")
 
+        match_ident = row['name']
 
         # 100% match? are we looking at something in the database?
         if f_unique_to_query >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank
@@ -225,27 +232,29 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
 
         # get lineage for match
         lineage = find_match_lineage(match_ident, tax_assign,
-                                     skip_idents=skip_idents,
-                             keep_full_identifiers=keep_full_identifiers,
-                             keep_identifier_versions=keep_identifier_versions)
+                                    skip_idents=skip_idents,
+                                    keep_full_identifiers=keep_full_identifiers,
+                                    keep_identifier_versions=keep_identifier_versions)
         # ident was in skip_idents
         if not lineage:
             continue
 
         # summarize at rank!
         lineage = pop_to_rank(lineage, rank)
-        assert lineage[-1].rank == rank, (rank, lineage[-1])
+        assert lineage[-1].rank == rank, lineage[-1]
         # record info
         sum_uniq_to_query[query_name][lineage] += f_unique_to_query
         sum_uniq_weighted[query_name][lineage] += f_uniq_weighted
         sum_uniq_bp[query_name][lineage] += unique_intersect_bp
 
+
     # sort and store each as SummarizedGatherResult
     sum_uniq_to_query_sorted = []
     for query_name, lineage_weights in sum_uniq_to_query.items():
         qInfo = query_info[query_name]
         sumgather_items = list(lineage_weights.items())
         sumgather_items.sort(key = lambda x: -x[1])
+        query_ani = None
         if best_only:
             lineage, fraction = sumgather_items[0]
             if fraction > 1:
@@ -254,13 +263,19 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 continue
             f_weighted_at_rank = sum_uniq_weighted[query_name][lineage]
             bp_intersect_at_rank = sum_uniq_bp[query_name][lineage]
-            sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank)
+            if estimate_query_ani:
+                query_ani = containment_to_distance(fraction, ksize, scaled,
+                                                    n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp,
+                                                    return_identity=True)[0]
+            sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5,
+                                          qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
             sum_uniq_to_query_sorted.append(sres)
         else:
             total_f_weighted= 0.0
             total_f_classified = 0.0
             total_bp_classified = 0
             for lineage, fraction in sumgather_items:
+                query_ani=None
                 if fraction > 1:
                     raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
                 elif fraction == 0:
@@ -270,16 +285,23 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 total_f_weighted += f_weighted_at_rank
                 bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage])
                 total_bp_classified += bp_intersect_at_rank
-                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank)
+                if estimate_query_ani:
+                    query_ani = containment_to_distance(fraction, ksize, scaled,
+                                                        n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp,
+                                                        return_identity=True)[0]
+                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5,
+                                              query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
                 sum_uniq_to_query_sorted.append(sres)
 
             # record unclassified
             lineage = ()
+            query_ani=None
             fraction = 1.0 - total_f_classified
             if fraction > 0:
                 f_weighted_at_rank = 1.0 - total_f_weighted
                 bp_intersect_at_rank = qInfo.query_bp - total_bp_classified
-                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank)
+                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5,
+                                              query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
                 sum_uniq_to_query_sorted.append(sres)
 
     return sum_uniq_to_query_sorted, seen_perfect
@@ -628,27 +650,15 @@ def load(cls, filename, *, delimiter=',', force=False,
 
 class LineageDB_Sqlite(abc.Mapping):
     """
-    A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table.
+    A LineageDB based on a sqlite3 database with a 'taxonomy' table.
     """
     # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'.
     columns = ('superkingdom', 'phylum', 'order_', 'class', 'family',
                'genus', 'species', 'strain')
-    table_name = 'sourmash_taxonomy'
 
-    def __init__(self, conn, *, table_name=None):
+    def __init__(self, conn):
         self.conn = conn
 
-        # provide for legacy support for pre-sourmash_internal days...
-        if table_name is not None:
-            self.table_name = table_name
-
-        # check that the right table is there.
-        c = conn.cursor()
-        try:
-            c.execute(f'SELECT * FROM {self.table_name} LIMIT 1')
-        except (sqlite3.DatabaseError, sqlite3.OperationalError):
-            raise ValueError("not a taxonomy database")
-            
         # check: can we do a 'select' on the right table?
         self.__len__()
         c = conn.cursor()
@@ -656,7 +666,7 @@ def __init__(self, conn, *, table_name=None):
         # get available ranks...
         ranks = set()
         for column, rank in zip(self.columns, taxlist(include_strain=True)):
-            query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""'
+            query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
             c.execute(query)
             cnt, = c.fetchone()
             if cnt:
@@ -667,35 +677,14 @@ def __init__(self, conn, *, table_name=None):
 
     @classmethod
     def load(cls, location):
-        "load taxonomy information from an existing sqlite3 database"
-        conn = sqlite_utils.open_sqlite_db(location)
-        if not conn:
-            raise ValueError("not a sqlite taxonomy database")
-
-        table_name = None
-        c = conn.cursor()
+        "load taxonomy information from a sqlite3 database"
+        import sqlite3
         try:
-            info = sqlite_utils.get_sourmash_internal(c)
-        except sqlite3.OperationalError:
-            info = {}
-
-        if 'SqliteLineage' in info:
-            if info['SqliteLineage'] != '1.0':
-                raise IndexNotSupported
-
-            table_name = 'sourmash_taxonomy'
-        else:
-            # legacy support for old taxonomy DB, pre sourmash_internal.
-            try:
-                c.execute('SELECT * FROM taxonomy LIMIT 1')
-                table_name = 'taxonomy'
-            except sqlite3.OperationalError:
-                pass
-
-        if table_name is None:
-            raise ValueError("not a sqlite taxonomy database")
-
-        return cls(conn, table_name=table_name)
+            conn = sqlite3.connect(location)
+            db = cls(conn)
+        except sqlite3.DatabaseError:
+            raise ValueError("not a sqlite database")
+        return db
 
     def _make_tup(self, row):
         "build a tuple of LineagePairs for this sqlite row"
@@ -705,7 +694,7 @@ def _make_tup(self, row):
     def __getitem__(self, ident):
         "Retrieve lineage for identifer"
         c = self.cursor
-        c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,))
+        c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
 
         # retrieve names list...
         names = c.fetchone()
@@ -726,7 +715,7 @@ def __bool__(self):
     def __len__(self):
         "Return number of rows"
         c = self.conn.cursor()
-        c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}')
+        c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
         nrows, = c.fetchone()
         return nrows
 
@@ -734,7 +723,7 @@ def __iter__(self):
         "Return all identifiers"
         # create new cursor so as to allow other operations
         c = self.conn.cursor()
-        c.execute(f'SELECT DISTINCT ident FROM {self.table_name}')
+        c.execute('SELECT DISTINCT ident FROM taxonomy')
 
         for ident, in c:
             yield ident
@@ -743,12 +732,11 @@ def items(self):
         "return all items in the sqlite database"
         c = self.conn.cursor()
 
-        c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}')
+        c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy')
 
         for ident, *names in c:
             yield ident, self._make_tup(names)
 
-
 class MultiLineageDB(abc.Mapping):
     "A wrapper for (dynamically) combining multiple lineage databases."
 
@@ -845,23 +833,15 @@ def save(self, filename_or_fp, file_format):
                 if is_filename:
                     fp.close()
 
-    def _save_sqlite(self, filename, *, conn=None):
-        from sourmash import sqlite_utils
-
-        if conn is None:
-            db = sqlite3.connect(filename)
-        else:
-            assert not filename
-            db = conn
+    def _save_sqlite(self, filename):
+        import sqlite3
+        db = sqlite3.connect(filename)
 
         cursor = db.cursor()
         try:
-            sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0')
-
-            # CTB: could add 'IF NOT EXIST' here; would need tests, too.
             cursor.execute("""
 
-        CREATE TABLE sourmash_taxonomy (
+        CREATE TABLE taxonomy (
             ident TEXT NOT NULL,
             superkingdom TEXT,
             phylum TEXT,
@@ -879,7 +859,7 @@ class TEXT,
             raise ValueError(f"taxonomy table already exists in '{filename}'")
 
         # follow up and create index
-        cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);")
+        cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
         for ident, tax in self.items():
             x = [ident, *[ t.name for t in tax ]]
 
@@ -888,7 +868,7 @@ class TEXT,
             while len(x) < 9:
                 x.append('')
 
-            cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
+            cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
 
         db.commit()
 
diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv
new file mode 100644
index 0000000000..48a09eb199
--- /dev/null
+++ b/tests/test-data/tax/test1.gather_ani.csv
@@ -0,0 +1,5 @@
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 1faf6ce19a..3d19cfd740 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -9,9 +9,6 @@
 from sourmash.tax import tax_utils
 from sourmash_tst_utils import SourmashCommandFailed
 
-from sourmash import sqlite_utils
-from sourmash.exceptions import IndexNotSupported
-
 ## command line tests
 def test_run_sourmash_tax():
     status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True)
@@ -705,6 +702,18 @@ def test_genome_rank_stdout_0_db(runtmp):
     assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
     assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0'  in c.last_result.out
 
+    # too stringent of containment threshold:
+    c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv',
+                   tax, '--rank', 'species', '--containment-threshold', '1.0')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "WARNING: classifying query test1 at desired rank species does not meet containment threshold 1.0" in c.last_result.err
+    assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0," in c.last_result.out
+
 
 def test_genome_rank_csv_0(runtmp):
     # test basic genome - output csv
@@ -1366,6 +1375,81 @@ def test_genome_over100percent_error(runtmp):
     assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err
 
 
+def test_genome_ani_threshold_input_errors(runtmp):
+    c = runtmp
+    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    below_threshold = "-1"
+
+    with pytest.raises(SourmashCommandFailed) as exc:
+        c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax,
+                       '--ani-threshold', below_threshold)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+    assert "ERROR: Argument must be >0 and <1" in str(exc.value)
+
+    above_threshold = "1.1"
+    with pytest.raises(SourmashCommandFailed) as exc:
+        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', above_threshold)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+    assert "ERROR: Argument must be >0 and <1" in str(exc.value)
+
+    not_a_float = "str"
+
+    with pytest.raises(SourmashCommandFailed) as exc:
+        c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', not_a_float)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+    assert "ERROR: Must be a floating point number" in str(exc.value)
+
+
+def test_genome_ani_threshold(runtmp):
+    c = runtmp
+    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', "0.95")
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err
+    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
+    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out 
+
+    # more lax threshold
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', "0.9")
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0'  in c.last_result.out
+
+    # too stringent of threshold (using rank)
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', "1.0", '--rank', 'species')
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+    assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err
+    assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out
+
+
 def test_annotate_0(runtmp):
     # test annotate
     c = runtmp
@@ -1822,31 +1906,3 @@ def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp):
                                         keep_identifier_versions=False)
     assert set(db1) == set(db2)
     assert set(db1) == set(db3)
-
-
-def test_tax_prepare_sqlite_lineage_version(runtmp):
-    # test bad sourmash_internals version for SqliteLineage
-    taxcsv = utils.get_test_data('tax/test.taxonomy.csv')
-    taxout = runtmp.output('out.db')
-
-    runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv,
-                        '-o', taxout, '-F', 'sql')
-    assert os.path.exists(taxout)
-
-    # set bad version
-    conn = sqlite_utils.open_sqlite_db(taxout)
-    c = conn.cursor()
-    c.execute("UPDATE sourmash_internal SET value='0.9' WHERE key='SqliteLineage'")
-
-    conn.commit()
-    conn.close()
-
-    with pytest.raises(IndexNotSupported):
-        db = tax_utils.MultiLineageDB.load([taxout])
-
-def test_tax_prepare_sqlite_no_lineage():
-    # no lineage table at all
-    sqldb = utils.get_test_data('sqlite/index.sqldb')
-
-    with pytest.raises(ValueError):
-        db = tax_utils.MultiLineageDB.load([sqldb])
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 449e26f972..17e457b2b3 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1,6 +1,7 @@
 """
 Tests for functions in taxonomy submodule.
 """
+
 import pytest
 from os.path import basename
 
@@ -22,9 +23,11 @@
 from sourmash.lca.lca_utils import LineagePair
 
 # utility functions for testing
-def make_mini_gather_results(g_infolist):
+def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False):
     # make mini gather_results
     min_header = ["query_name", "name", "match_ident", "f_unique_to_query", "query_md5", "query_filename", "f_unique_weighted", "unique_intersect_bp", "remaining_bp"]
+    if include_ksize_and_scaled:
+        min_header.extend(['ksize', 'scaled'])
     gather_results = []
     for g_info in g_infolist:
         inf = dict(zip(min_header, g_info))
@@ -351,30 +354,36 @@ def test_summarize_gather_at_0():
 def test_summarize_gather_at_1():
     """test two matches, diff f_unique_to_query"""
     # make mini gather_results
-    gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40']
-    gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90']
-    g_res = make_mini_gather_results([gA,gB])
+    ksize=31
+    scaled=10
+    gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled]
+    gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90', ksize, scaled]
+    g_res = make_mini_gather_results([gA,gB], include_ksize_and_scaled=True)
 
     # make mini taxonomy
     gA_tax = ("gA", "a;b;c")
     gB_tax = ("gB", "a;b;d")
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True)
 
     # superkingdom
     assert len(sk_sum) == 2
-    print("superkingdom summarized gather: ", sk_sum[0])
+    print("\nsuperkingdom summarized gather 0: ", sk_sum[0])
     assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
+    print("superkingdom summarized gather 1: ", sk_sum[1])
     assert sk_sum[1].lineage == ()
     assert round(sk_sum[1].fraction, 1) == 0.3
     assert sk_sum[1].bp_match_at_rank == 30
+    assert sk_sum[0].query_ani_at_rank == 0.9885602934376099
+    assert sk_sum[1].query_ani_at_rank == None
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
-    print("phylum summarized gather: ", phy_sum[0])
+    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False)
+    print("phylum summarized gather 0: ", phy_sum[0])
+    print("phylum summarized gather 1: ", phy_sum[1])
     assert len(phy_sum) == 2
     assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
@@ -383,8 +392,10 @@ def test_summarize_gather_at_1():
     assert phy_sum[1].lineage == ()
     assert round(phy_sum[1].fraction, 1) == 0.3
     assert phy_sum[1].bp_match_at_rank == 30
+    assert phy_sum[0].query_ani_at_rank == None
+    assert phy_sum[1].query_ani_at_rank == None
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+    cl_sum, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True)
     assert len(cl_sum) == 3
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -393,6 +404,7 @@ def test_summarize_gather_at_1():
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].f_weighted_at_rank == 0.5
     assert cl_sum[0].bp_match_at_rank == 60
+    assert cl_sum[0].query_ani_at_rank == 0.9836567776983505
 
     assert cl_sum[1].rank == 'class'
     assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -401,8 +413,10 @@ def test_summarize_gather_at_1():
     assert cl_sum[1].fraction == 0.1
     assert cl_sum[1].f_weighted_at_rank == 0.1
     assert cl_sum[1].bp_match_at_rank == 10
+    assert cl_sum[1].query_ani_at_rank == 0.9284145445194744
     assert cl_sum[2].lineage == ()
     assert round(cl_sum[2].fraction, 1) == 0.3
+    assert cl_sum[2].query_ani_at_rank == None
 
 
 def test_summarize_gather_at_perfect_match():
@@ -532,32 +546,38 @@ def test_summarize_gather_at_missing_fail():
 def test_summarize_gather_at_best_only_0():
     """test two matches, diff f_unique_to_query"""
     # make mini gather_results
-    gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40']
-    gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90']
-    g_res = make_mini_gather_results([gA,gB])
+    ksize =31
+    scaled=10
+    gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled]
+    gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90', ksize, scaled]
+    g_res = make_mini_gather_results([gA,gB],include_ksize_and_scaled=True)
 
     # make mini taxonomy
     gA_tax = ("gA", "a;b;c")
     gB_tax = ("gB", "a;b;d")
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True)
+    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True)
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
     assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
+    print("superk ANI:",sk_sum[0].query_ani_at_rank)
+    assert sk_sum[0].query_ani_at_rank == 0.9885602934376099
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True)
+    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True)
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 1
     assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
     assert phy_sum[0].bp_match_at_rank == 70
+    print("phy ANI:",phy_sum[0].query_ani_at_rank)
+    assert phy_sum[0].query_ani_at_rank == 0.9885602934376099
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
+    cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -565,6 +585,8 @@ def test_summarize_gather_at_best_only_0():
                                  LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].bp_match_at_rank == 60
+    print("cl ANI:",cl_sum[0].query_ani_at_rank)
+    assert cl_sum[0].query_ani_at_rank == 0.9836567776983505
 
 
 def test_summarize_gather_at_best_only_equal_choose_first():
@@ -597,12 +619,14 @@ def test_write_summary_csv(runtmp):
     sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),))],
+                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          query_ani_at_rank=None)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=1.0, bp_match_at_rank=100,
                                                      lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')))]}
+                                                              LineagePair(rank='phylum', name='b')),
+                                                     query_ani_at_rank=None)]}
 
     outs= runtmp.output("outsum.csv")
     with open(outs, 'w') as out_fp:
@@ -610,9 +634,9 @@ def test_write_summary_csv(runtmp):
 
     sr = [x.rstrip().split(',') for x in open(outs, 'r')]
     print("gather_summary_results_from_file: \n", sr)
-    assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0]
-    assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1]
-    assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[2]
+    assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0]
+    assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1]
+    assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100',''] == sr[2]
 
 
 def test_write_classification(runtmp):
@@ -620,7 +644,8 @@ def test_write_classification(runtmp):
     classif = ClassificationResult('queryA', 'match', 'phylum', 1.0,
                                     (LineagePair(rank='superkingdom', name='a'),
                                      LineagePair(rank='phylum', name='b')),
-                                     'queryA_md5', 'queryA.sig', 1.0, 100)
+                                     'queryA_md5', 'queryA.sig', 1.0, 100,
+                                     query_ani_at_rank=None)
 
     classification = {'phylum': [classif]}
 
@@ -630,8 +655,8 @@ def test_write_classification(runtmp):
 
     sr = [x.rstrip().split(',') for x in open(outs, 'r')]
     print("gather_classification_results_from_file: \n", sr)
-    assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0]
-    assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1]
+    assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0]
+    assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1]
 
 
 def test_make_krona_header_0():
@@ -816,21 +841,25 @@ def test_combine_sumgather_csvs_by_lineage(runtmp):
     sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),))],
+                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          query_ani_at_rank=None)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
                                                      lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')))]}
+                                                              LineagePair(rank='phylum', name='b')),
+                                                     query_ani_at_rank=None)]}
     sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),))],
+                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          query_ani_at_rank=None)],
                   'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                      lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='c')))]}
+                                                              LineagePair(rank='phylum', name='c')),
+                                                     query_ani_at_rank=None)]}
 
     # write summarized gather results csvs
     sg1= runtmp.output("sample1.csv")
@@ -903,21 +932,25 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),))],
+                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          query_ani_at_rank=None)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
                                                      lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')))]}
+                                                              LineagePair(rank='phylum', name='b')),
+                                                     query_ani_at_rank=None)]}
     sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),))],
+                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          query_ani_at_rank=None)],
                   'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                      lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='c')))]}
+                                                              LineagePair(rank='phylum', name='c')),
+                                                     query_ani_at_rank=None)]}
 
     # write summarized gather results csvs
     sg1= runtmp.output("sample1.csv")
@@ -967,17 +1000,6 @@ def test_tax_multi_load_files(runtmp):
         MultiLineageDB.load([runtmp.output('no-such-file')])
 
 
-def test_tax_sql_load_new_file(runtmp):
-    # test loading a newer-format sql file with sourmash_internals table
-    taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db')
-
-    db = MultiLineageDB.load([taxonomy_db])
-    print(list(db.keys()))
-    assert len(db) == 6
-    assert 'strain' not in db.available_ranks
-    assert db['GCF_001881345'][0].rank == 'superkingdom'
-
-
 def test_tax_multi_load_files_shadowed(runtmp):
     # test loading various good and bad files
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')

From 4e2d43bb1f3e480a44704e8c04c8f96bd0fa2cdf Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Wed, 27 Apr 2022 11:46:01 -0700
Subject: [PATCH 2/9] tax threshold arg from #1788

---
 src/sourmash/cli/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py
index 1725518747..2063fd09de 100644
--- a/src/sourmash/cli/utils.py
+++ b/src/sourmash/cli/utils.py
@@ -67,10 +67,14 @@ def range_limited_float_type(arg):
     return f
 
 
-def add_tax_threshold_arg(parser, default=0.1):
+def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None):
     parser.add_argument(
-        '--containment-threshold', default=default, type=range_limited_float_type,
-        help=f'minimum containment threshold for classification; default={default}'
+        '--containment-threshold', default=containment_default, type=range_limited_float_type,
+        help=f'minimum containment threshold for classification; default={containment_default}',
+    )
+    parser.add_argument(
+        '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type,
+        help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}',
     )
 
 

From e29b5ddec53df1a0c919cf79e967df07a3bd0a25 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Wed, 27 Apr 2022 14:58:59 -0700
Subject: [PATCH 3/9] add back sql changes from latest

---
 src/sourmash/tax/tax_utils.py | 112 ++++++++++++++++++++++++----------
 tests/test_tax.py             |  31 ++++++++++
 tests/test_tax_utils.py       |  11 ++++
 3 files changed, 123 insertions(+), 31 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 0c69eeaf55..6ee0787f3b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -5,8 +5,14 @@
 import csv
 from collections import namedtuple, defaultdict
 from collections import abc
+
+from sourmash import sqlite_utils
+from sourmash.exceptions import IndexNotSupported
 from sourmash.distance_utils import containment_to_distance
 
+import sqlite3
+
+
 __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
            'load_gather_results', 'check_and_load_gather_csvs',
            'find_match_lineage', 'summarize_gather_at',
@@ -183,7 +189,8 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [],
 def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                         keep_full_identifiers=False,
                         keep_identifier_versions=False, best_only=False,
-                        seen_perfect=set(), estimate_query_ani=False):
+                        seen_perfect=set(),
+                        estimate_query_ani=False):
     """
     Summarize gather results at specified taxonomic rank
     """
@@ -194,6 +201,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
     sum_uniq_bp = defaultdict(lambda: defaultdict(float))
     query_info = {}
     ksize,scaled,query_nhashes=None,None,None
+
     for row in gather_results:
         # get essential gather info
         query_name = row['query_name']
@@ -202,6 +210,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
         unique_intersect_bp = int(row['unique_intersect_bp'])
         query_md5 = row['query_md5']
         query_filename = row['query_filename']
+        # get query_bp
         if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug?
             if "query_nhashes" in row.keys():
                 query_nhashes = int(row["query_nhashes"])
@@ -209,17 +218,18 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 query_bp = int(row["query_bp"])
             else:
                 query_bp = unique_intersect_bp + int(row['remaining_bp'])
+        
         # store query info
         query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes)
-
+        
         if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file?
-            if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR
+            if "ksize" in row.keys():
                 ksize = int(row['ksize'])
                 scaled = int(row['scaled'])
             else:
                 estimate_query_ani=False
-                notify("WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...")
-
+                notify("WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI...")
+        
         match_ident = row['name']
 
         # 100% match? are we looking at something in the database?
@@ -241,13 +251,13 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
 
         # summarize at rank!
         lineage = pop_to_rank(lineage, rank)
+        #assert lineage[-1].rank == rank, (rank, lineage[-1])
         assert lineage[-1].rank == rank, lineage[-1]
         # record info
         sum_uniq_to_query[query_name][lineage] += f_unique_to_query
         sum_uniq_weighted[query_name][lineage] += f_uniq_weighted
         sum_uniq_bp[query_name][lineage] += unique_intersect_bp
 
-
     # sort and store each as SummarizedGatherResult
     sum_uniq_to_query_sorted = []
     for query_name, lineage_weights in sum_uniq_to_query.items():
@@ -265,8 +275,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
             bp_intersect_at_rank = sum_uniq_bp[query_name][lineage]
             if estimate_query_ani:
                 query_ani = containment_to_distance(fraction, ksize, scaled,
-                                                    n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp,
-                                                    return_identity=True)[0]
+                                                    n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani
             sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5,
                                           qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
             sum_uniq_to_query_sorted.append(sres)
@@ -287,15 +296,14 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 total_bp_classified += bp_intersect_at_rank
                 if estimate_query_ani:
                     query_ani = containment_to_distance(fraction, ksize, scaled,
-                                                        n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp,
-                                                        return_identity=True)[0]
+                                                        n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani
                 sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5,
                                               query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
                 sum_uniq_to_query_sorted.append(sres)
 
             # record unclassified
             lineage = ()
-            query_ani=None
+            query_ani = None
             fraction = 1.0 - total_f_classified
             if fraction > 0:
                 f_weighted_at_rank = 1.0 - total_f_weighted
@@ -650,15 +658,27 @@ def load(cls, filename, *, delimiter=',', force=False,
 
 class LineageDB_Sqlite(abc.Mapping):
     """
-    A LineageDB based on a sqlite3 database with a 'taxonomy' table.
+    A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table.
     """
     # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'.
     columns = ('superkingdom', 'phylum', 'order_', 'class', 'family',
                'genus', 'species', 'strain')
+    table_name = 'sourmash_taxonomy'
 
-    def __init__(self, conn):
+    def __init__(self, conn, *, table_name=None):
         self.conn = conn
 
+        # provide for legacy support for pre-sourmash_internal days...
+        if table_name is not None:
+            self.table_name = table_name
+
+        # check that the right table is there.
+        c = conn.cursor()
+        try:
+            c.execute(f'SELECT * FROM {self.table_name} LIMIT 1')
+        except (sqlite3.DatabaseError, sqlite3.OperationalError):
+            raise ValueError("not a taxonomy database")
+            
         # check: can we do a 'select' on the right table?
         self.__len__()
         c = conn.cursor()
@@ -666,7 +686,7 @@ def __init__(self, conn):
         # get available ranks...
         ranks = set()
         for column, rank in zip(self.columns, taxlist(include_strain=True)):
-            query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
+            query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""'
             c.execute(query)
             cnt, = c.fetchone()
             if cnt:
@@ -677,14 +697,35 @@ def __init__(self, conn):
 
     @classmethod
     def load(cls, location):
-        "load taxonomy information from a sqlite3 database"
-        import sqlite3
+        "load taxonomy information from an existing sqlite3 database"
+        conn = sqlite_utils.open_sqlite_db(location)
+        if not conn:
+            raise ValueError("not a sqlite taxonomy database")
+
+        table_name = None
+        c = conn.cursor()
         try:
-            conn = sqlite3.connect(location)
-            db = cls(conn)
-        except sqlite3.DatabaseError:
-            raise ValueError("not a sqlite database")
-        return db
+            info = sqlite_utils.get_sourmash_internal(c)
+        except sqlite3.OperationalError:
+            info = {}
+
+        if 'SqliteLineage' in info:
+            if info['SqliteLineage'] != '1.0':
+                raise IndexNotSupported
+
+            table_name = 'sourmash_taxonomy'
+        else:
+            # legacy support for old taxonomy DB, pre sourmash_internal.
+            try:
+                c.execute('SELECT * FROM taxonomy LIMIT 1')
+                table_name = 'taxonomy'
+            except sqlite3.OperationalError:
+                pass
+
+        if table_name is None:
+            raise ValueError("not a sqlite taxonomy database")
+
+        return cls(conn, table_name=table_name)
 
     def _make_tup(self, row):
         "build a tuple of LineagePairs for this sqlite row"
@@ -694,7 +735,7 @@ def _make_tup(self, row):
     def __getitem__(self, ident):
         "Retrieve lineage for identifer"
         c = self.cursor
-        c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
+        c.execute(f'SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name} WHERE ident=?', (ident,))
 
         # retrieve names list...
         names = c.fetchone()
@@ -715,7 +756,7 @@ def __bool__(self):
     def __len__(self):
         "Return number of rows"
         c = self.conn.cursor()
-        c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
+        c.execute(f'SELECT COUNT(DISTINCT ident) FROM {self.table_name}')
         nrows, = c.fetchone()
         return nrows
 
@@ -723,7 +764,7 @@ def __iter__(self):
         "Return all identifiers"
         # create new cursor so as to allow other operations
         c = self.conn.cursor()
-        c.execute('SELECT DISTINCT ident FROM taxonomy')
+        c.execute(f'SELECT DISTINCT ident FROM {self.table_name}')
 
         for ident, in c:
             yield ident
@@ -732,11 +773,12 @@ def items(self):
         "return all items in the sqlite database"
         c = self.conn.cursor()
 
-        c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy')
+        c.execute(f'SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM {self.table_name}')
 
         for ident, *names in c:
             yield ident, self._make_tup(names)
 
+
 class MultiLineageDB(abc.Mapping):
     "A wrapper for (dynamically) combining multiple lineage databases."
 
@@ -833,15 +875,23 @@ def save(self, filename_or_fp, file_format):
                 if is_filename:
                     fp.close()
 
-    def _save_sqlite(self, filename):
-        import sqlite3
-        db = sqlite3.connect(filename)
+    def _save_sqlite(self, filename, *, conn=None):
+        from sourmash import sqlite_utils
+
+        if conn is None:
+            db = sqlite3.connect(filename)
+        else:
+            assert not filename
+            db = conn
 
         cursor = db.cursor()
         try:
+            sqlite_utils.add_sourmash_internal(cursor, 'SqliteLineage', '1.0')
+
+            # CTB: could add 'IF NOT EXIST' here; would need tests, too.
             cursor.execute("""
 
-        CREATE TABLE taxonomy (
+        CREATE TABLE sourmash_taxonomy (
             ident TEXT NOT NULL,
             superkingdom TEXT,
             phylum TEXT,
@@ -859,7 +909,7 @@ class TEXT,
             raise ValueError(f"taxonomy table already exists in '{filename}'")
 
         # follow up and create index
-        cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
+        cursor.execute("CREATE UNIQUE INDEX sourmash_taxonomy_ident ON sourmash_taxonomy(ident);")
         for ident, tax in self.items():
             x = [ident, *[ t.name for t in tax ]]
 
@@ -868,7 +918,7 @@ class TEXT,
             while len(x) < 9:
                 x.append('')
 
-            cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
+            cursor.execute('INSERT INTO sourmash_taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
 
         db.commit()
 
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 3d19cfd740..8cf2218ef7 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -9,6 +9,9 @@
 from sourmash.tax import tax_utils
 from sourmash_tst_utils import SourmashCommandFailed
 
+from sourmash import sqlite_utils
+from sourmash.exceptions import IndexNotSupported
+
 ## command line tests
 def test_run_sourmash_tax():
     status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True)
@@ -1906,3 +1909,31 @@ def test_tax_prepare_3_db_to_csv_empty_ranks_3(runtmp):
                                         keep_identifier_versions=False)
     assert set(db1) == set(db2)
     assert set(db1) == set(db3)
+
+
+def test_tax_prepare_sqlite_lineage_version(runtmp):
+    # test bad sourmash_internals version for SqliteLineage
+    taxcsv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxout = runtmp.output('out.db')
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', taxcsv,
+                        '-o', taxout, '-F', 'sql')
+    assert os.path.exists(taxout)
+
+    # set bad version
+    conn = sqlite_utils.open_sqlite_db(taxout)
+    c = conn.cursor()
+    c.execute("UPDATE sourmash_internal SET value='0.9' WHERE key='SqliteLineage'")
+
+    conn.commit()
+    conn.close()
+
+    with pytest.raises(IndexNotSupported):
+        db = tax_utils.MultiLineageDB.load([taxout])
+
+def test_tax_prepare_sqlite_no_lineage():
+    # no lineage table at all
+    sqldb = utils.get_test_data('sqlite/index.sqldb')
+
+    with pytest.raises(ValueError):
+        db = tax_utils.MultiLineageDB.load([sqldb])
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 17e457b2b3..e0c9001bfd 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1000,6 +1000,17 @@ def test_tax_multi_load_files(runtmp):
         MultiLineageDB.load([runtmp.output('no-such-file')])
 
 
+def test_tax_sql_load_new_file(runtmp):
+    # test loading a newer-format sql file with sourmash_internals table
+    taxonomy_db = utils.get_test_data('sqlite/test.taxonomy.db')
+
+    db = MultiLineageDB.load([taxonomy_db])
+    print(list(db.keys()))
+    assert len(db) == 6
+    assert 'strain' not in db.available_ranks
+    assert db['GCF_001881345'][0].rank == 'superkingdom'
+
+
 def test_tax_multi_load_files_shadowed(runtmp):
     # test loading various good and bad files
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')

From 428cefb1ab03f608b73f213e7933a5e171b464ea Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Wed, 27 Apr 2022 15:25:16 -0700
Subject: [PATCH 4/9] make sure we can still use tax without ANI col

---
 src/sourmash/tax/__main__.py  |  2 +-
 src/sourmash/tax/tax_utils.py |  1 -
 tests/test_tax.py             | 20 +++++++++++++++++++-
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 1cee174eb1..9ba282eac6 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -227,7 +227,7 @@ def genome(args):
                 status = 'nomatch'
                 if sg.query_name in matched_queries:
                     continue
-                if args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold:
+                if sg.query_ani_at_rank is not None and args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold:
                     status="match"
                 elif sg.fraction >= args.containment_threshold:
                     status = "match"
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 6ee0787f3b..d79a727405 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -251,7 +251,6 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
 
         # summarize at rank!
         lineage = pop_to_rank(lineage, rank)
-        #assert lineage[-1].rank == rank, (rank, lineage[-1])
         assert lineage[-1].rank == rank, lineage[-1]
         # record info
         sum_uniq_to_query[query_name][lineage] += f_unique_to_query
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 8cf2218ef7..3dcb2dbd1a 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1428,7 +1428,7 @@ def test_genome_ani_threshold(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err
+    assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err
     assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
     assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out 
 
@@ -1453,6 +1453,24 @@ def test_genome_ani_threshold(runtmp):
     assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out
 
 
+def test_genome_ani_oldgather(runtmp):
+    c = runtmp
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
+                       '--ani-threshold', "0.95")
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." in c.last_result.err
+    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
+    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out
+
+
 def test_annotate_0(runtmp):
     # test annotate
     c = runtmp

From cd4fbfcf7021bb5ff5d911d738d4e0d5dea8a40d Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Wed, 27 Apr 2022 15:46:56 -0700
Subject: [PATCH 5/9] dont warn about ignoring ANI more than once

---
 src/sourmash/tax/__main__.py  | 13 +++++-----
 src/sourmash/tax/tax_utils.py |  2 +-
 tests/test_tax.py             | 11 ++++++++
 tests/test_tax_utils.py       | 48 +++++++++++++++++------------------
 4 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 9ba282eac6..6bdf1b119f 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -99,7 +99,7 @@ def metagenome(args):
     seen_perfect = set()
     for rank in sourmash.lca.taxlist(include_strain=False):
         try:
-            summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
+            summarized_gather[rank], seen_perfect, _ = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
                                                                 keep_full_identifiers=args.keep_full_identifiers,
                                                                 keep_identifier_versions = args.keep_identifier_versions,
                                                                 seen_perfect = seen_perfect)
@@ -179,9 +179,10 @@ def genome(args):
         sys.exit(-1)
 
     # if --rank is specified, classify to that rank
+    estimate_query_ani = True
     if args.rank:
         try:
-            best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed,
+            best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed,
                                                      keep_full_identifiers=args.keep_full_identifiers,
                                                      keep_identifier_versions = args.keep_identifier_versions,
                                                      best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True)
@@ -215,10 +216,10 @@ def genome(args):
         for rank in tax_utils.ascending_taxlist(include_strain=False):
             # gets best_at_rank for all queries in this gather_csv
             try:
-                best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
-                                                         keep_full_identifiers=args.keep_full_identifiers,
-                                                         keep_identifier_versions = args.keep_identifier_versions,
-                                                         best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True)
+                best_at_rank, seen_perfect, estimate_query_ani = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
+                                                                                                keep_full_identifiers=args.keep_full_identifiers,
+                                                                                                keep_identifier_versions = args.keep_identifier_versions,
+                                                                                                best_only=True, seen_perfect=seen_perfect, estimate_query_ani=estimate_query_ani)
             except ValueError as exc:
                 error(f"ERROR: {str(exc)}")
                 sys.exit(-1)
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index d79a727405..e6aaff9e4b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -311,7 +311,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                                               query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani)
                 sum_uniq_to_query_sorted.append(sres)
 
-    return sum_uniq_to_query_sorted, seen_perfect
+    return sum_uniq_to_query_sorted, seen_perfect, estimate_query_ani
 
 
 def find_missing_identities(gather_results, tax_assign):
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 3dcb2dbd1a..f900720798 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1454,10 +1454,21 @@ def test_genome_ani_threshold(runtmp):
 
 
 def test_genome_ani_oldgather(runtmp):
+    # Ignore ANI if we don't have the information we need to estimate it
     c = runtmp
     g_csv = utils.get_test_data('tax/test1.gather.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
 
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
+    assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,' in c.last_result.out
+
     c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
                        '--ani-threshold', "0.95")
 
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index e0c9001bfd..febae40076 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -302,7 +302,7 @@ def test_summarize_gather_at_0():
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
 
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
 
     # superkingdom
     assert len(sk_sum) == 1
@@ -317,7 +317,7 @@ def test_summarize_gather_at_0():
     assert sk_sum[0].bp_match_at_rank == 100
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res)
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 1
     assert phy_sum[0].query_name == "queryA"
@@ -329,7 +329,7 @@ def test_summarize_gather_at_0():
     assert phy_sum[0].f_weighted_at_rank == 1.0
     assert phy_sum[0].bp_match_at_rank == 100
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res)
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].query_name == "queryA"
@@ -365,7 +365,7 @@ def test_summarize_gather_at_1():
     gB_tax = ("gB", "a;b;d")
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True)
 
     # superkingdom
     assert len(sk_sum) == 2
@@ -381,7 +381,7 @@ def test_summarize_gather_at_1():
     assert sk_sum[1].query_ani_at_rank == None
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False)
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False)
     print("phylum summarized gather 0: ", phy_sum[0])
     print("phylum summarized gather 1: ", phy_sum[1])
     assert len(phy_sum) == 2
@@ -395,7 +395,7 @@ def test_summarize_gather_at_1():
     assert phy_sum[0].query_ani_at_rank == None
     assert phy_sum[1].query_ani_at_rank == None
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True)
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True)
     assert len(cl_sum) == 3
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -432,7 +432,7 @@ def test_summarize_gather_at_perfect_match():
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
 
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
@@ -454,16 +454,16 @@ def test_summarize_gather_at_over100percent_f_unique_to_query():
 
     # run summarize_gather_at and check results!
     with pytest.raises(ValueError) as exc:
-        sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+        sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
     assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc)
 
     # phylum
     with pytest.raises(ValueError) as exc:
-        phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+        phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res)
     assert "The tax summary of query 'queryA' is 1.1, which is > 100% of the query!!" in str(exc)
 
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res)
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res)
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -491,7 +491,7 @@ def test_summarize_gather_at_missing_ignore():
     taxD = make_mini_taxonomy([gA_tax])
 
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB'])
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB'])
     # superkingdom
     assert len(sk_sum) == 2
     print("superkingdom summarized gather: ", sk_sum[0])
@@ -503,7 +503,7 @@ def test_summarize_gather_at_missing_ignore():
     assert sk_sum[1].bp_match_at_rank == 50
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB'])
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB'])
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 2
     assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
@@ -513,7 +513,7 @@ def test_summarize_gather_at_missing_ignore():
     assert phy_sum[1].fraction == 0.5
     assert phy_sum[1].bp_match_at_rank == 50
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB'])
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB'])
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -539,7 +539,7 @@ def test_summarize_gather_at_missing_fail():
 
     # run summarize_gather_at and check results!
     with pytest.raises(ValueError) as exc:
-        sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+        sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
     assert "ident gB is not in the taxonomy database." in str(exc.value)
 
 
@@ -557,7 +557,7 @@ def test_summarize_gather_at_best_only_0():
     gB_tax = ("gB", "a;b;d")
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
     # run summarize_gather_at and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True)
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
@@ -568,7 +568,7 @@ def test_summarize_gather_at_best_only_0():
     assert sk_sum[0].query_ani_at_rank == 0.9885602934376099
 
     # phylum
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True)
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True)
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 1
     assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
@@ -577,7 +577,7 @@ def test_summarize_gather_at_best_only_0():
     print("phy ANI:",phy_sum[0].query_ani_at_rank)
     assert phy_sum[0].query_ani_at_rank == 0.9885602934376099
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True)
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -603,7 +603,7 @@ def test_summarize_gather_at_best_only_equal_choose_first():
 
     # run summarize_gather_at and check results!
     # class
-    cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
+    cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
     assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
@@ -697,7 +697,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
 
     # aggregate by lineage at rank
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
     print("superkingdom summarized gather results:", sk_sum)
     assert len(sk_sum) ==4
     assert sk_sum[0].query_name == "queryA"
@@ -726,7 +726,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     assert num_queries == 2
     assert query_names == ['queryA', 'queryB']
 
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res)
     print("phylum summary:", phy_sum, ']\n')
     phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True)
     print("phylum lineage summary:", phy_lin_sum, '\n')
@@ -750,13 +750,13 @@ def test_format_for_krona_0():
     taxD = make_mini_taxonomy([gA_tax,gB_tax])
 
     # check krona format and check results!
-    sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
+    sk_sum, _, _ = summarize_gather_at("superkingdom", taxD, g_res)
     print("superkingdom summarized gather results:", sk_sum)
     krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum})
     print("krona_res: ", krona_res)
     assert krona_res == [(1.0, 'a')]
 
-    phy_sum, _ = summarize_gather_at("phylum", taxD, g_res)
+    phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res)
     krona_res = format_for_krona("phylum", {"phylum": phy_sum})
     print("krona_res: ", krona_res)
     assert krona_res == [(1.0, 'a', 'b')]
@@ -778,7 +778,7 @@ def test_format_for_krona_1():
     sum_res = {}
     #for rank in lca_utils.taxlist(include_strain=False):
     for rank in ['superkingdom', 'phylum', 'class']:
-        sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res)
+        sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res)
     print('summarized gather: ', sum_res)
     # check krona format
     sk_krona = format_for_krona("superkingdom", sum_res)
@@ -808,7 +808,7 @@ def test_format_for_krona_best_only():
     sum_res = {}
     #for rank in lca_utils.taxlist(include_strain=False):
     for rank in ['superkingdom', 'phylum', 'class']:
-        sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True)
+        sum_res[rank], _, _ = summarize_gather_at(rank, taxD, g_res, best_only=True)
     print('summarized gather: ', sum_res)
     # check krona format
     sk_krona = format_for_krona("superkingdom", sum_res)

From d27be4aeb1c57ad9077faa3ae52983a1632a79ad Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Fri, 29 Apr 2022 17:41:34 -0700
Subject: [PATCH 6/9] check query stats

---
 tests/test_search.py   | 2 ++
 tests/test_sourmash.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/tests/test_search.py b/tests/test_search.py
index 0d765e0f96..0efa267a20 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -482,6 +482,8 @@ def test_GatherResult():
     assert res.match_md5 == ss4763.md5sum()
     assert res.query_n_hashes == len(ss47.minhash)
     assert res.match_n_hashes == len(ss4763.minhash)
+    assert res.query_bp == ss47.minhash.covered_bp
+    assert res.match_bp == ss4763.minhash.covered_bp
     assert res.md5 == ss4763.md5sum()
     assert res.name == ss4763.name
     assert res.match_filename == ss4763.filename
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 4dae2e4858..94cd6baaf9 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5660,6 +5660,8 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
         assert row['query_name'] == 'tr1 4'
         assert row['query_md5'] == 'c9d5a795'
         assert row['query_bp'] == '910'
+        assert row['query_n_hashes'] == '91'
+        assert row['query_abundance'] == 'False'
         assert row['query_containment_ani']== '1.0'
         assert row['query_containment_ani_low']== ''
         assert row['query_containment_ani_high']== ''

From fb3729ff18419d9cec2e99640273c46d7a41801e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Sat, 23 Jul 2022 14:55:40 -0400
Subject: [PATCH 7/9] fix gather test

---
 tests/test_search.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_search.py b/tests/test_search.py
index 7c9e43f1f4..1863c03bb2 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -481,8 +481,8 @@ def test_GatherResult():
     assert res.match_md5 == ss4763.md5sum()
     assert res.query_n_hashes == len(ss47.minhash)
     assert res.match_n_hashes == len(ss4763.minhash)
-    assert res.query_bp == ss47.minhash.covered_bp
-    assert res.match_bp == ss4763.minhash.covered_bp
+    assert res.query_bp == ss47.minhash.unique_dataset_hashes
+    assert res.match_bp == ss4763.minhash.unique_dataset_hashes
     assert res.md5 == ss4763.md5sum()
     assert res.name == ss4763.name
     assert res.match_filename == ss4763.filename

From f78eef3c249e9393ee5cefbfbe8a1f3f61ea7ab1 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Sat, 23 Jul 2022 20:12:10 -0400
Subject: [PATCH 8/9] add sentence on ani thresh

---
 doc/command-line.md | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index 99619da023..37438af488 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -468,10 +468,9 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic
  taxonomic rank. For example, if the gather results for a metagenome
  include results for 30 different strains of a given species, we can sum
  the fraction uniquely matched to each strain to obtain the fraction
- uniquely matched to this species. Note that this summarization can
- also take into account abundance weighting; see
- [classifying signatures](classifying-signatures.md) for more
- information.
+ uniquely matched to this species. Alternatively, taxonomic summarization
+ can take into account abundance weighting; see
+ [classifying signatures](classifying-signatures.md) for more information.
 
 As with all reference-based analysis, results can be affected by the
  completeness of the reference database. However, summarizing taxonomic
@@ -589,12 +588,20 @@ To produce multiple output types from the same command, add the types into the
 ### `sourmash tax genome` - classify a genome using `gather` results
 
 `sourmash tax genome` reports likely classification for each query,
- based on `gather` matches. By default, classification requires at least 10% of
- the query to be matched. Thus, if 10% of the query was matched to a species, the
+ based on `gather` matches. Since `gather` matches are non-overlapping
+
+ By default, classification requires at least 10% of the query to be matched.
+ Thus, if 10% of the query was matched to a species, the
  species-level classification can be reported. However, if 7% of the query was
  matched to one species, and an additional 5% matched to a different species in
  the same genus, the genus-level classification will be reported.
 
+`sourmash tax genome` can use an ANI threshold (`--ani-threshold`) instead of a
+ containment threshold. This works the same way as the containment threshold
+ (and indeed, is using the same underlying information). Note that for DNA k-mers,
+ k=21 ANI is most similar to alignment-based ANI values, and ANI values should only
+ be compared if they were generated using the same ksize.
+
 Optionally, `genome` can instead report classifications at a desired `rank`,
  regardless of match threshold (`--rank` argument, e.g. `--rank species`).
 

From 6ee64463f239c54332653f646b907830e0caeade Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Sun, 24 Jul 2022 09:08:46 -0400
Subject: [PATCH 9/9] apply sugg from code review

---
 doc/command-line.md           | 12 +++++-------
 src/sourmash/tax/tax_utils.py |  4 ++--
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index 37438af488..3748f0f5c9 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -588,13 +588,11 @@ To produce multiple output types from the same command, add the types into the
 ### `sourmash tax genome` - classify a genome using `gather` results
 
 `sourmash tax genome` reports likely classification for each query,
- based on `gather` matches. Since `gather` matches are non-overlapping
-
- By default, classification requires at least 10% of the query to be matched.
- Thus, if 10% of the query was matched to a species, the
- species-level classification can be reported. However, if 7% of the query was
- matched to one species, and an additional 5% matched to a different species in
- the same genus, the genus-level classification will be reported.
+ based on `gather` matches. By default, classification requires at least 10%
+ of the query to be matched. Thus, if 10% of the query was matched to a species,
+ the species-level classification can be reported. However, if 7% of the query
+ was matched to one species, and an additional 5% matched to a different species
+ in the same genus, the genus-level classification will be reported.
 
 `sourmash tax genome` can use an ANI threshold (`--ani-threshold`) instead of a
  containment threshold. This works the same way as the containment threshold
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index e6aaff9e4b..473a7fcce3 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -200,7 +200,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
     sum_uniq_to_query = defaultdict(lambda: defaultdict(float))
     sum_uniq_bp = defaultdict(lambda: defaultdict(float))
     query_info = {}
-    ksize,scaled,query_nhashes=None,None,None
+    ksize, scaled, query_nhashes=None, None, None
 
     for row in gather_results:
         # get essential gather info
@@ -283,7 +283,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
             total_f_classified = 0.0
             total_bp_classified = 0
             for lineage, fraction in sumgather_items:
-                query_ani=None
+                query_ani = None
                 if fraction > 1:
                     raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
                 elif fraction == 0: