From b1ed90062d8bd69d7d27803620e698c16a050aa6 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 15:43:17 -0800
Subject: [PATCH 01/21] init new LineagePair and LineageInfo classes

---
 src/sourmash/lca/lca_utils.py |   4 +
 src/sourmash/tax/tax_utils.py | 292 +++++++++++++++++++++++++++++++++-
 tests/test_tax_utils.py       | 128 ++++++++-------
 3 files changed, 361 insertions(+), 63 deletions(-)

diff --git a/src/sourmash/lca/lca_utils.py b/src/sourmash/lca/lca_utils.py
index b81f765bb9..8138e38c2f 100644
--- a/src/sourmash/lca/lca_utils.py
+++ b/src/sourmash/lca/lca_utils.py
@@ -131,6 +131,10 @@ def build_tree(assignments, initial=None):
     for assignment in assignments:
         node = tree
 
+        # when we switch LineagePair over, will need ot add this.
+        #if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINSLineageInfo)):
+        #    assignment = assignment.filled_lineage
+
         for lineage_tup in assignment:
             if lineage_tup.name:
                 child = node.get(lineage_tup, {})
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index bba7637833..2b47e672de 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -5,6 +5,9 @@
 import csv
 from collections import namedtuple, defaultdict
 from collections import abc
+from itertools import zip_longest
+from typing import NamedTuple
+from dataclasses import dataclass, field, replace
 import gzip
 
 from sourmash import sqlite_utils, sourmash_args
@@ -36,7 +39,288 @@
 
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
-from sourmash.lca.lca_utils import (LineagePair, taxlist, display_lineage, pop_to_rank)
+from sourmash.lca.lca_utils import (taxlist, display_lineage, pop_to_rank)
+
+class LineagePair(NamedTuple):
+    rank: str
+    name: str = ""
+    taxid: int = None
+
+@dataclass(frozen=True, order=True)
+class BaseLineageInfo:
+    """BaseLineageInfo Class, defining a set of functions that can be used to handle
+       hierarchical set of LineagePairs. """
+    # need to set compare=False for any mutable type to keep this class hashable
+    ranks: tuple() # require ranks
+    lineage: tuple = field(default=()) #tuple of LineagePairs
+    lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names
+    lineage_dict: dict = field(default=None, compare=False) # dict of rank: name
+
+    def __post_init__(self):
+        "Initialize according to passed values"
+        # ranks must be tuple for hashability
+        if isinstance(self.ranks, list):
+            object.__setattr__(self, "ranks", tuple(self.ranks))
+        if self.lineage:
+            self._init_from_lineage_tuples()
+        elif self.lineage_str is not None:
+            self._init_from_lineage_str()
+        elif self.lineage_dict is not None:
+            self._init_from_lineage_dict()
+        else:
+            self._init_empty()
+
+    def __eq__(self, other):
+        if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False
+            return False
+        return all([self.ranks == other.ranks and self.lineage==other.lineage])
+
+    @property
+    def taxlist(self):
+        return self.ranks
+
+    @property
+    def ascending_taxlist(self):
+        return self.ranks[::-1]
+
+    @property
+    def lowest_rank(self):
+        return self.filled_ranks[-1]
+
+    def rank_index(self, rank):
+        return self.ranks.index(rank)
+
+    @property
+    def filled_lineage(self):
+        """Return lineage down to lowest non-empty rank. Preserves missing ranks above."""
+        # Would we prefer this to be the default returned by lineage??
+        if not self.filled_ranks:
+            return ()
+        lowest_filled_rank_idx = self.rank_index(self.filled_ranks[-1])
+        return self.lineage[:lowest_filled_rank_idx+1]
+
+    @property
+    def lowest_lineage_name(self, null_as_unclassified = False):
+        """Return the name of the lowest filled lineage"""
+        if not self.filled_ranks:
+            #if null_as_unclassified: # todo: enable me
+            #    return "unclassified"
+            #else:
+                return ""
+        return self.filled_lineage[-1].name
+
+    @property
+    def lowest_lineage_taxid(self):
+        """Return the taxid of the lowest filled lineage"""
+        if not self.filled_ranks:
+            return ""
+        return self.filled_lineage[-1].taxid
+
+    def _init_empty(self):
+        'initialize empty genome lineage'
+        new_lineage = []
+        for rank in self.ranks:
+            new_lineage.append(LineagePair(rank=rank))
+        # set lineage and filled_ranks (because frozen, need to do it this way)
+        object.__setattr__(self, "lineage", new_lineage)
+        object.__setattr__(self, "filled_ranks", ())
+
+    def _init_from_lineage_tuples(self):
+        'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary'
+        new_lineage = []
+        # check this is a list or tuple of lineage tuples:
+        for lin_tup in self.lineage:
+            if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)):
+                raise ValueError(f"{lin_tup} is not LineagePair or LineagePair")
+        for rank in self.ranks:
+            new_lineage.append(LineagePair(rank=rank))
+        # now add input tuples in correct spots. This corrects for order and allows empty values.
+            for lin_tup in self.lineage:
+                # find index for this rank
+                if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?)
+                    try:
+                        rank_idx = self.rank_index(lin_tup.rank)
+                    except ValueError as e:
+                        raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e
+                    # make sure we're adding LineagePairs, not lineagePairs for consistency
+                    if not isinstance(lin_tup, LineagePair):
+                        new_lineage[rank_idx] =  LineagePair(rank=lin_tup.rank, name=lin_tup.name)
+                    else:
+                        new_lineage[rank_idx] =  lin_tup
+        # build list of filled ranks
+        filled_ranks = [a.rank for a in new_lineage if a.name]
+        # set lineage and filled_ranks
+        object.__setattr__(self, "lineage", tuple(new_lineage))
+        object.__setattr__(self, "filled_ranks", filled_ranks)
+
+    def _init_from_lineage_dict(self):
+        'initialize from lineage dict, e.g. from gather csv, allowing empty ranks and reordering if necessary'
+        if not isinstance(self.lineage_dict, (dict)):
+            raise ValueError(f"{self.lineage_dict} is not dictionary")
+        # first, initialize_empty
+        new_lineage = []
+        # build empty lineage
+        for rank in self.ranks:
+            new_lineage.append(LineagePair(rank=rank))
+        # now add input information in correct spots. This corrects for order and allows empty values.
+        for rank, info in self.lineage_dict.items():
+            try:
+                rank_idx = self.rank_index(rank)
+            except ValueError as e:
+                raise ValueError(f"Rank '{rank}' not present in {', '.join(self.ranks)}") from e
+
+            name, taxid = None, None
+            if isinstance(info, dict):
+                if 'name' in info.keys():
+                    name = info['name']
+                if 'taxid' in info.keys():
+                    taxid = info['taxid']
+            elif isinstance(info, str):
+                name = info
+            new_lineage[rank_idx] =  LineagePair(rank=rank, name=name, taxid=taxid)
+        # build list of filled ranks
+        filled_ranks = [a.rank for a in new_lineage if a.name]
+        # set lineage and filled_ranks
+        object.__setattr__(self, "lineage", tuple(new_lineage))
+        object.__setattr__(self, "filled_ranks", filled_ranks)
+
+    def _init_from_lineage_str(self):
+        """
+        Turn a ; or ,-separated set of lineages into a list of LineagePair objs.
+        """
+        new_lineage = self.lineage_str.split(';')
+        if len(new_lineage) == 1:
+            new_lineage = self.lineage_str.split(',')
+        new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ]
+        # build list of filled ranks
+        filled_ranks = [a.rank for a in new_lineage if a.name]
+        object.__setattr__(self, "lineage", tuple(new_lineage))
+        object.__setattr__(self, "filled_ranks", filled_ranks)
+
+    def zip_lineage(self, truncate_empty=False):
+        """
+        Return lineage names as a list
+        """
+        if truncate_empty:
+            zipped = [a.name for a in self.filled_lineage]
+        else:
+            zipped = [a.name for a in self.lineage]
+        # replace None with empty string ("")
+        if None in zipped:
+            zipped = ['' if x is None else x for x in zipped]
+
+        return zipped
+
+    def zip_taxid(self, truncate_empty=False):
+        """
+        Return taxids as a list
+        """
+        if truncate_empty:
+            zipped = [a.taxid for a in self.filled_lineage]
+        else:
+            zipped = [a.taxid for a in self.lineage]
+        # replace None with empty string (""); cast taxids to str
+        zipped = ['' if x is None else str(x) for x in zipped]
+
+        return zipped
+
+    def display_lineage(self, truncate_empty=True, null_as_unclassified=False):
+        "Return lineage names as ';'-separated list"
+        lin = ";".join(self.zip_lineage(truncate_empty=truncate_empty))
+        if null_as_unclassified:
+            if lin == "":
+                return "unclassified"
+        else:
+            return lin
+
+    def display_taxid(self, truncate_empty=True):
+        "Return lineage taxids as ';'-separated list"
+        return ";".join(self.zip_taxid(truncate_empty=truncate_empty))
+
+    def is_lineage_match(self, other, rank):
+        """
+        check to see if two lineages are a match down to given rank.
+        """
+        if not other.ranks == self.ranks: # check same ranks
+            raise ValueError("Cannot compare lineages from taxonomies with different ranks.")
+        if rank not in self.ranks: # rank is available
+            raise ValueError(f"Desired Rank {rank} not available for this lineage")
+        # always return false if rank is not filled in either of the two lineages
+        if rank in self.filled_ranks and rank in other.filled_ranks:
+            rank_idx = self.rank_index(rank)
+            a_lin = self.lineage[:rank_idx+1]
+            b_lin = other.lineage[:rank_idx+1]
+            if a_lin == b_lin:
+                return 1
+        return 0
+
+    def pop_to_rank(self, rank):
+        "Return new LineageInfo with ranks only filled to desired rank"
+        if rank not in self.ranks:
+            raise ValueError(f"Desired Rank '{rank}' not available for this lineage")
+        # are we already above rank?
+        if rank not in self.filled_ranks:
+            return replace(self)
+        # if not, make filled_lineage at this rank + use to generate new LineageInfo
+        new_lineage = self.lineage_at_rank(rank)
+        new = replace(self, lineage = new_lineage)
+        # replace doesn't run the __post_init__ properly. reinitialize.
+        new._init_from_lineage_tuples()
+        return new
+
+    def lineage_at_rank(self, rank):
+        # non-descructive pop_to_rank. Returns tuple of LineagePairs
+        "Returns tuple of LineagePairs at given rank."
+        if rank not in self.ranks:
+            raise ValueError(f"Desired Rank '{rank}' not available for this lineage")
+        # are we already above rank?
+        if rank not in self.filled_ranks:
+            return self.filled_lineage
+        # if not, return lineage tuples down to desired rank
+        rank_idx = self.rank_index(rank)
+        return self.filled_lineage[:rank_idx+1]
+
+
+@dataclass(frozen=True, order=True)
+class RankLineageInfo(BaseLineageInfo):
+    """Class for storing multi-rank lineage information"""
+    ranks: tuple = field(default_factory=lambda: ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain'))
+
+    def __post_init__(self):
+        "Initialize according to passed values"
+        # ranks must be tuple for hashability
+        if isinstance(self.ranks, list):
+            object.__setattr__(self, "ranks", tuple(self.ranks))
+        if self.lineage:
+            self._init_from_lineage_tuples()
+        elif self.lineage_str is not None:
+            self._init_from_lineage_str()
+        elif self.lineage_dict is not None:
+            self._init_from_lineage_dict()
+        elif self.ranks:
+            self._init_empty()
+
+    def __eq__(self, other):
+        if other == (): # sometimes we compare to null lineage.. this just helps with that
+            return False
+        return all([self.ranks == other.ranks, self.lineage==other.lineage])
+
+
+@dataclass(frozen=True, order=True)
+class LINSLineageInfo(BaseLineageInfo):
+    """Class for storing multi-rank lineage information"""
+    num_positions: int = None
+    ## WHAT special considerations do we have here?
+    def __post_init__(self):
+        "Initialize according to passed values"
+        if self.lineage != [LineagePair()]:
+            self._init_from_lineage_tuples()
+        elif self.lineage_str is not None and self.ranks:
+            self._init_from_lineage_str()
+        elif self.ranks:
+            self._init_empty()
+        else:
+            raise ValueError("Cannot initialize LINSLineageInfo. Please provide lineage or rank info.")
 
 
 def get_ident(ident, *,
@@ -760,7 +1044,7 @@ def load(cls, filename, *, delimiter=',', force=False,
                 # read row into a lineage pair
                 for rank in lca_utils.taxlist(include_strain=include_strain):
                     lin = row[rank]
-                    lineage.append(LineagePair(rank, lin))
+                    lineage.append(lca_utils.LineagePair(rank, lin))
                 ident = row[identifier]
 
                 # fold, spindle, and mutilate ident?
@@ -770,7 +1054,7 @@ def load(cls, filename, *, delimiter=',', force=False,
 
                 # clean lineage of null names, replace with 'unassigned'
                 lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+                lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ]
 
                 # remove end nulls
                 while lineage and lineage[-1].name == 'unassigned':
@@ -924,7 +1208,7 @@ def load(cls, location):
 
     def _make_tup(self, row):
         "build a tuple of LineagePairs for this sqlite row"
-        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
+        tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
         return tuple(tup)
 
     def __getitem__(self, ident):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 7652970f60..a48692161d 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -22,7 +22,7 @@
 
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
-from sourmash.lca.lca_utils import LineagePair
+from sourmash.tax.tax_utils import LineagePair
 
 # utility functions for testing
 def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False):
@@ -379,7 +379,7 @@ def test_summarize_gather_at_0():
     assert sk_sum[0].query_md5 == "queryA_md5"
     assert sk_sum[0].query_filename == "queryA.sig"
     assert sk_sum[0].rank == 'superkingdom'
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 1.0
     assert sk_sum[0].f_weighted_at_rank == 1.0
     assert sk_sum[0].bp_match_at_rank == 100
@@ -392,7 +392,7 @@ def test_summarize_gather_at_0():
     assert phy_sum[0].query_md5 == "queryA_md5"
     assert phy_sum[0].query_filename == "queryA.sig"
     assert phy_sum[0].rank == 'phylum'
-    assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 1.0
     assert phy_sum[0].f_weighted_at_rank == 1.0
     assert phy_sum[0].bp_match_at_rank == 100
@@ -404,16 +404,16 @@ def test_summarize_gather_at_0():
     assert cl_sum[0].query_md5 == "queryA_md5"
     assert cl_sum[0].query_filename == "queryA.sig"
     assert cl_sum[0].rank == 'class'
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].f_weighted_at_rank == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='d'))
+    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='d'))
     assert cl_sum[1].fraction == 0.5
     assert cl_sum[1].f_weighted_at_rank == 0.5
     assert cl_sum[1].bp_match_at_rank == 50
@@ -438,7 +438,7 @@ def test_summarize_gather_at_1():
     # superkingdom
     assert len(sk_sum) == 2
     print("\nsuperkingdom summarized gather 0: ", sk_sum[0])
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
     print("superkingdom summarized gather 1: ", sk_sum[1])
@@ -453,7 +453,7 @@ def test_summarize_gather_at_1():
     print("phylum summarized gather 0: ", phy_sum[0])
     print("phylum summarized gather 1: ", phy_sum[1])
     assert len(phy_sum) == 2
-    assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
     assert phy_sum[0].f_weighted_at_rank == 0.6
     assert phy_sum[0].bp_match_at_rank == 70
@@ -466,18 +466,18 @@ def test_summarize_gather_at_1():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True)
     assert len(cl_sum) == 3
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].f_weighted_at_rank == 0.5
     assert cl_sum[0].bp_match_at_rank == 60
     assert cl_sum[0].query_ani_at_rank == 0.9836567776983505
 
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='d'))
+    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='d'))
     assert cl_sum[1].fraction == 0.1
     assert cl_sum[1].f_weighted_at_rank == 0.1
     assert cl_sum[1].bp_match_at_rank == 10
@@ -504,7 +504,7 @@ def test_summarize_gather_at_perfect_match():
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 1.0
 
 
@@ -534,15 +534,15 @@ def test_summarize_gather_at_over100percent_f_unique_to_query():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res)
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='d'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='d'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].bp_match_at_rank == 60
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[1].fraction == 0.5
     assert cl_sum[1].bp_match_at_rank == 50
 
@@ -563,7 +563,7 @@ def test_summarize_gather_at_missing_ignore():
     # superkingdom
     assert len(sk_sum) == 2
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.5
     assert sk_sum[0].bp_match_at_rank == 50
     assert sk_sum[1].lineage == ()
@@ -574,7 +574,7 @@ def test_summarize_gather_at_missing_ignore():
     phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB'])
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 2
-    assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.5
     assert phy_sum[0].bp_match_at_rank == 50
     assert phy_sum[1].lineage == ()
@@ -584,9 +584,9 @@ def test_summarize_gather_at_missing_ignore():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB'])
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
     assert cl_sum[1].lineage == ()
@@ -629,7 +629,7 @@ def test_summarize_gather_at_best_only_0():
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
     print("superk ANI:",sk_sum[0].query_ani_at_rank)
@@ -639,7 +639,7 @@ def test_summarize_gather_at_best_only_0():
     phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True)
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 1
-    assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
     assert phy_sum[0].bp_match_at_rank == 70
     print("phy ANI:",phy_sum[0].query_ani_at_rank)
@@ -648,9 +648,9 @@ def test_summarize_gather_at_best_only_0():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].bp_match_at_rank == 60
     print("cl ANI:",cl_sum[0].query_ani_at_rank)
@@ -674,9 +674,9 @@ def test_summarize_gather_at_best_only_equal_choose_first():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
-                                 LineagePair(rank='phylum', name='b'),
-                                 LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                 lca_utils.LineagePair(rank='phylum', name='b'),
+                                 lca_utils.LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
 
@@ -687,14 +687,14 @@ def test_write_summary_csv(runtmp):
     sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                           query_ani_at_rank=None,
                                                           total_weighted_hashes=0)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                     lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')),
+                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                                              lca_utils.LineagePair(rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
 
@@ -712,8 +712,8 @@ def test_write_summary_csv(runtmp):
 def test_write_classification(runtmp):
     """test classification csv write function"""
     classif = ClassificationResult('queryA', 'match', 'phylum', 1.0,
-                                    (LineagePair(rank='superkingdom', name='a'),
-                                     LineagePair(rank='phylum', name='b')),
+                                    (lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                     lca_utils.LineagePair(rank='phylum', name='b')),
                                      'queryA_md5', 'queryA.sig', 1.0, 100,
                                      query_ani_at_rank=None)
 
@@ -771,7 +771,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     print("superkingdom summarized gather results:", sk_sum)
     assert len(sk_sum) ==4
     assert sk_sum[0].query_name == "queryA"
-    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.9
     assert sk_sum[0].bp_match_at_rank == 160
     # check for unassigned for queryA
@@ -781,7 +781,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     assert round(sk_sum[1].fraction,1) == 0.1
     # queryB
     assert sk_sum[2].query_name == "queryB"
-    assert sk_sum[2].lineage == (LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[2].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[2].fraction == 0.3
     assert sk_sum[2].bp_match_at_rank == 60
     # check for unassigned for queryA
@@ -791,7 +791,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     assert sk_sum[3].bp_match_at_rank == 140
     sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True)
     print("superkingdom lineage summary:", sk_lin_sum, '\n')
-    assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3},
+    assert sk_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3},
                           (): {'queryA': 0.09999999999999998, 'queryB': 0.7}}
     assert num_queries == 2
     assert query_names == ['queryA', 'queryB']
@@ -800,8 +800,8 @@ def test_aggregate_by_lineage_at_rank_by_query():
     print("phylum summary:", phy_sum, ']\n')
     phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True)
     print("phylum lineage summary:", phy_lin_sum, '\n')
-    assert phy_lin_sum ==  {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): {'queryA': 0.5},
-                            (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3},
+    assert phy_lin_sum ==  {(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')): {'queryA': 0.5},
+                            (lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3},
                             (): {'queryA': 0.09999999999999998, 'queryB': 0.7}}
     assert num_queries == 2
     assert query_names == ['queryA', 'queryB']
@@ -911,27 +911,27 @@ def test_combine_sumgather_csvs_by_lineage(runtmp):
     sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                     lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')),
+                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                                              lca_utils.LineagePair(rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
     sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                     lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='c')),
+                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                                              lca_utils.LineagePair(rank='phylum', name='c')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
 
@@ -1006,27 +1006,27 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                     lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='b')),
+                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                                              lca_utils.LineagePair(rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
     sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                     lineage=(LineagePair(rank='superkingdom', name='a'),
-                                                              LineagePair(rank='phylum', name='c')),
+                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
+                                                              lca_utils.LineagePair(rank='phylum', name='c')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
 
@@ -1231,3 +1231,13 @@ def test_lineage_db_sql_load(runtmp):
     # file does not exist
     with pytest.raises(ValueError):
         LineageDB_Sqlite.load(runtmp.output('no-such-file'))
+
+
+def test_LineagePair():
+    lin = LineagePair(rank="rank1", name='name1')
+    print(lin)
+
+
+def test_LineagePair_1():
+    lin1 = LineagePair(rank="rank1", name='name1', taxid=1)
+    print(lin1)

From 5aa5714e4a0020178d79dbc0da1be0c5135897e9 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 16:09:57 -0800
Subject: [PATCH 02/21] test new LineagePair,BaseLineageInfo,RankLineageInfo

---
 src/sourmash/tax/tax_utils.py |  36 ++--
 tests/test_tax_utils.py       | 382 +++++++++++++++++++++++++++++++++-
 2 files changed, 396 insertions(+), 22 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 2b47e672de..ec731ede5e 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -43,7 +43,7 @@
 
 class LineagePair(NamedTuple):
     rank: str
-    name: str = ""
+    name: str = None
     taxid: int = None
 
 @dataclass(frozen=True, order=True)
@@ -129,24 +129,23 @@ def _init_from_lineage_tuples(self):
         'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary'
         new_lineage = []
         # check this is a list or tuple of lineage tuples:
-        for lin_tup in self.lineage:
-            if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)):
-                raise ValueError(f"{lin_tup} is not LineagePair or LineagePair")
         for rank in self.ranks:
             new_lineage.append(LineagePair(rank=rank))
-        # now add input tuples in correct spots. This corrects for order and allows empty values.
-            for lin_tup in self.lineage:
+        for lin_tup in self.lineage:
+            # now add input tuples in correct spots. This corrects for order and allows empty values.
+            if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)):
+                raise ValueError(f"{lin_tup} is not LineagePair.")
                 # find index for this rank
-                if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?)
-                    try:
-                        rank_idx = self.rank_index(lin_tup.rank)
-                    except ValueError as e:
-                        raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e
-                    # make sure we're adding LineagePairs, not lineagePairs for consistency
-                    if not isinstance(lin_tup, LineagePair):
-                        new_lineage[rank_idx] =  LineagePair(rank=lin_tup.rank, name=lin_tup.name)
-                    else:
-                        new_lineage[rank_idx] =  lin_tup
+            if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?)
+                try:
+                    rank_idx = self.rank_index(lin_tup.rank)
+                except ValueError as e:
+                    raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e
+                # make sure we're adding LineagePairs, not lineagePairs for consistency
+                if isinstance(lin_tup, lca_utils.LineagePair):
+                    new_lineage[rank_idx] =  LineagePair(rank=lin_tup.rank, name=lin_tup.name)
+                else:
+                    new_lineage[rank_idx] =  lin_tup
         # build list of filled ranks
         filled_ranks = [a.rank for a in new_lineage if a.name]
         # set lineage and filled_ranks
@@ -300,11 +299,6 @@ def __post_init__(self):
         elif self.ranks:
             self._init_empty()
 
-    def __eq__(self, other):
-        if other == (): # sometimes we compare to null lineage.. this just helps with that
-            return False
-        return all([self.ranks == other.ranks, self.lineage==other.lineage])
-
 
 @dataclass(frozen=True, order=True)
 class LINSLineageInfo(BaseLineageInfo):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index a48692161d..65334e82aa 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -22,7 +22,7 @@
 
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
-from sourmash.tax.tax_utils import LineagePair
+from sourmash.tax.tax_utils import LineagePair, BaseLineageInfo, RankLineageInfo
 
 # utility functions for testing
 def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False):
@@ -1241,3 +1241,383 @@ def test_LineagePair():
 def test_LineagePair_1():
     lin1 = LineagePair(rank="rank1", name='name1', taxid=1)
     print(lin1)
+
+
+def test_BaseLineageInfo_init_lineage_str():
+    x = "a;b;c"
+    ranks=["A", "B", "C"]
+    taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', 'c']
+
+
+def test_BaseLineageInfo_init_lineage_tups():
+    ranks=["A", "B", "C"]
+    lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b'))
+    taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', '', 'b']
+
+
+def test_BaseLineageInfo_init_lineage_dict():
+    x = {'rank1': 'name1', 'rank2': 'name2'}
+    taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', 'name2']
+
+
+def test_BaseLineageInfo_init_lineage_dict_withtaxid():
+    x = {'rank1': {'name': 'name1', 'taxid': 1}, 'rank2': {'name':'name2', 'taxid': 2}}
+    taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', 'name2']
+    assert taxinf.zip_taxid()== ['1', '2']
+
+
+def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq():
+    x = "a;b;c"
+    ranks=["A", "B", "C"]
+    rankD = {"A": "a", "B": "b", "C": "c"}
+    lin1 = BaseLineageInfo(lineage_str=x, ranks=ranks)
+    lin2 = BaseLineageInfo(lineage_dict=rankD, ranks=ranks)
+    assert lin1 == lin2
+
+
+def test_BaseLineageInfo_init_no_ranks():
+    x = "a;b;c"
+    rankD = {"superkingdom": "a", "phylum": "b", "class": "c"}
+    lin_tups = (LineagePair(rank="rank2", name='name1'), LineagePair(rank="rank1", name='name1'))
+    with pytest.raises(TypeError) as exc:
+        BaseLineageInfo(lineage_str=x)
+    print(exc)
+    assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc)
+    with pytest.raises(TypeError) as exc:
+        BaseLineageInfo(lineage_dict=rankD)
+    print(exc)
+    assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc)
+    with pytest.raises(TypeError) as exc:
+        BaseLineageInfo(lineage=lin_tups)
+    print(exc)
+    assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc)
+
+
+def test_BaseLineageInfo_init_with_wrong_ranks():
+    ranks=["A", "B", "C"]
+    lin_tups = (LineagePair(rank="rank1", name='name1'))
+    linD = {"rank1": "a"}
+    with pytest.raises(ValueError) as exc:
+        BaseLineageInfo(lineage=lin_tups, ranks=ranks)
+    print(str(exc))
+    assert "Rank 'rank1' not present in A, B, C" in str(exc)
+    with pytest.raises(ValueError) as exc:
+        BaseLineageInfo(lineage=linD, ranks=ranks)
+    print(str(exc))
+    assert "Rank 'rank1' not present in A, B, C" in str(exc)
+
+
+def test_BaseLineageInfo_init_not_lineagepair():
+    ranks=["A", "B", "C"]
+    lin_tups = (("rank1", "name1"),)
+    with pytest.raises(ValueError) as exc:
+        BaseLineageInfo(lineage=lin_tups, ranks=ranks)
+    print(str(exc))
+    assert "is not LineagePair" in str(exc)
+
+
+def test_RankLineageInfo_taxlist():
+    taxinf = RankLineageInfo()
+    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    assert taxinf.taxlist == taxranks
+    assert taxinf.ascending_taxlist == taxranks[::-1]
+
+
+def test_RankLineageInfo_init_lineage_str():
+    x = "a;b;c"
+    taxinf = RankLineageInfo(lineage_str=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_tups():
+    x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b'))
+    taxinf = RankLineageInfo(lineage=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_dict():
+    x = {"superkingdom":'a',"phylum":'b'}
+    taxinf = RankLineageInfo(lineage_dict=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_dict_missing_rank():
+    x = {'superkingdom': 'name1', 'class': 'name2'}
+    taxinf = RankLineageInfo(lineage_dict=x)
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', '']
+    assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2']
+
+
+def test_RankLineageInfo_init_lineage_dict_missing_rank_withtaxid():
+    x = {'superkingdom': {'name': 'name1', 'taxid': 1}, 'class': {'name':'name2', 'taxid': 2}}
+    taxinf = RankLineageInfo(lineage_dict=x)
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', '']
+    assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq():
+    x = "a;b;c"
+    rankD = {"superkingdom": "a", "phylum": "b", "class": "c"}
+    lin1 = RankLineageInfo(lineage_str=x)
+    lin2 = RankLineageInfo(lineage_dict=rankD)
+    print("lin1: ", lin1)
+    print("lin2: ", lin2)
+    assert lin1 == lin2
+
+def test_RankLineageInfo_init_lineage_str_1_truncate():
+    x = "a;b;c"
+    taxinf = RankLineageInfo(lineage_str=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', 'c']
+
+
+def test_RankLineageInfo_init_lineage_str_2():
+    x = "a;b;;c"
+    taxinf = RankLineageInfo(lineage_str=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', '', 'c' '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_str_2_truncate():
+    x = "a;b;;c"
+    taxinf = RankLineageInfo(lineage_str=x)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', '', 'c']
+
+
+def test_RankLineageInfo_init_lineage_with_incorrect_rank():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair("NotARank", ''), LineagePair('class', 'c') ]
+    with pytest.raises(ValueError) as exc:
+        RankLineageInfo(lineage=x)
+    print(str(exc))
+    assert f"Rank 'NotARank' not present in " in str(exc)
+
+
+def test_zip_lineage_1():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ]
+    taxinf = RankLineageInfo(lineage=x)
+    print("ranks: ", taxinf.ranks)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage() == ['a', 'b', '', '', '', '', '', '']
+
+
+def test_zip_lineage_2():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ]
+    taxinf = RankLineageInfo(lineage=x)
+    print("ranks: ", taxinf.ranks)
+    print("zipped lineage: ", taxinf.zip_lineage(truncate_empty=True))
+    assert taxinf.zip_lineage(truncate_empty=True) == ['a', 'b']
+
+
+def test_zip_lineage_3():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ]
+    taxinf = RankLineageInfo(lineage=x)
+    assert taxinf.zip_lineage() == ['a', '', 'c', '', '', '', '', '']
+
+
+def test_zip_lineage_3_truncate():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ]
+    taxinf = RankLineageInfo(lineage=x)
+    assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c']
+
+
+def test_zip_lineage_4():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ]
+    taxinf = RankLineageInfo(lineage=x)
+    assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c']
+
+
+def test_display_lineage_1():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ]
+    taxinf = RankLineageInfo(lineage=x)
+    assert taxinf.display_lineage() == "a;b"
+
+
+def test_display_lineage_2():
+    x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ]
+    taxinf = RankLineageInfo(lineage=x)
+    assert taxinf.display_lineage() == "a;;c"
+
+
+def test_display_taxid_1():
+    x = [ LineagePair('superkingdom', 'a', 1), LineagePair('phylum', 'b', 2) ]
+    taxinf = RankLineageInfo(lineage=x)
+    print(taxinf)
+    assert taxinf.display_taxid() == "1;2"
+
+def test_display_taxid_2():
+    x = [ LineagePair('superkingdom', 'name1', 1), LineagePair(None, ''), LineagePair    ('class', 'name2',2) ]
+    taxinf = RankLineageInfo(lineage=x)
+    print(taxinf)
+    assert taxinf.display_taxid() == "1;;2"
+
+
+def test_is_lineage_match_1():
+    # basic behavior: match at order and above, but not at family or below.
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
+    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    print(lin1.lineage)
+    assert lin1.is_lineage_match(lin2, 'superkingdom')
+    assert lin2.is_lineage_match(lin1, 'superkingdom')
+    assert lin1.is_lineage_match(lin2, 'phylum')
+    assert lin2.is_lineage_match(lin1, 'phylum')
+    assert lin1.is_lineage_match(lin2, 'class')
+    assert lin2.is_lineage_match(lin1, 'class')
+    assert lin1.is_lineage_match(lin2, 'order')
+    assert lin2.is_lineage_match(lin1, 'order')
+
+    assert not lin1.is_lineage_match(lin2, 'family')
+    assert not lin2.is_lineage_match(lin1, 'family')
+    assert not lin1.is_lineage_match(lin2, 'genus')
+    assert not lin2.is_lineage_match(lin1, 'genus')
+    assert not lin1.is_lineage_match(lin2, 'species')
+    assert not lin2.is_lineage_match(lin1, 'species')
+
+
+def test_is_lineage_match_2():
+    # match at family, and above, levels; no genus or species to match
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    assert lin1.is_lineage_match(lin2, 'superkingdom')
+    assert lin2.is_lineage_match(lin1, 'superkingdom')
+    assert lin1.is_lineage_match(lin2, 'phylum')
+    assert lin2.is_lineage_match(lin1, 'phylum')
+    assert lin1.is_lineage_match(lin2, 'class')
+    assert lin2.is_lineage_match(lin1, 'class')
+    assert lin1.is_lineage_match(lin2, 'order')
+    assert lin2.is_lineage_match(lin1, 'order')
+    assert lin1.is_lineage_match(lin2, 'family')
+    assert lin2.is_lineage_match(lin1, 'family')
+
+    assert not lin1.is_lineage_match(lin2, 'genus')
+    assert not lin2.is_lineage_match(lin1, 'genus')
+    assert not lin1.is_lineage_match(lin2, 'species')
+    assert not lin2.is_lineage_match(lin1, 'species')
+
+
+def test_is_lineage_match_3():
+    # one lineage is empty
+    lin1 = RankLineageInfo()
+    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+
+    assert not lin1.is_lineage_match(lin2, 'superkingdom')
+    assert not lin2.is_lineage_match(lin1, 'superkingdom')
+    assert not lin1.is_lineage_match(lin2, 'phylum')
+    assert not lin2.is_lineage_match(lin1, 'phylum')
+    assert not lin1.is_lineage_match(lin2, 'class')
+    assert not lin2.is_lineage_match(lin1, 'class')
+    assert not lin1.is_lineage_match(lin2, 'order')
+    assert not lin2.is_lineage_match(lin1, 'order')
+    assert not lin1.is_lineage_match(lin2, 'family')
+    assert not lin2.is_lineage_match(lin1, 'family')
+    assert not lin1.is_lineage_match(lin2, 'genus')
+    assert not lin2.is_lineage_match(lin1, 'genus')
+    assert not lin1.is_lineage_match(lin2, 'species')
+    assert not lin2.is_lineage_match(lin1, 'species')
+
+
+def test_is_lineage_match_incorrect_ranks():
+    #test comparison with incompatible ranks
+    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e', ranks=taxranks[::-1])
+    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    print(lin1.lineage)
+    with pytest.raises(ValueError) as exc:
+        lin1.is_lineage_match(lin2, 'superkingdom')
+    print(str(exc))
+    assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc)
+
+
+def test_pop_to_rank_1():
+    # basic behavior - pop to order?
+    lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d')
+    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
+
+    print(lin1)
+    popped = lin2.pop_to_rank('order')
+    print(popped)
+    assert popped == lin1
+
+
+def test_pop_to_rank_2():
+    # what if we're already above rank?
+    lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f')
+    print(lin2.pop_to_rank('species'))
+    assert lin2.pop_to_rank('species') == lin2
+
+
+def test_pop_to_rank_rank_not_avail():
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    with pytest.raises(ValueError) as exc:
+        lin1.pop_to_rank("NotARank")
+    print(str(exc))
+    assert "Desired Rank 'NotARank' not available for this lineage" in str(exc)
+
+
+def test_lineage_at_rank_norank():
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    with pytest.raises(TypeError) as exc:
+        lin1.lineage_at_rank()
+    print(str(exc))
+    assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(exc)
+
+
+def test_lineage_at_rank_rank_not_avail():
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    with pytest.raises(ValueError) as exc:
+        lin1.lineage_at_rank("NotARank")
+    print(str(exc))
+    assert "Desired Rank 'NotARank' not available for this lineage" in str(exc)
+
+
+def test_lineage_at_rank_1():
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    print(lin1.lineage_at_rank('superkingdom'))
+    
+    assert lin1.lineage_at_rank('superkingdom') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),)
+    print(lin1.lineage_at_rank('class'))
+    assert lin1.lineage_at_rank('class') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
+                                             LineagePair(rank='phylum', name='p__b', taxid=None),
+                                             LineagePair(rank='class', name='c__c', taxid=None))
+
+
+def test_lineage_at_rank_below_rank():
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    print(lin1.lineage_at_rank('superkingdom'))
+    # if rank is not provided, we only return the filled lineage, to follow original pop_to_rank behavior.
+
+    print(lin1.lineage_at_rank('genus'))
+    assert lin1.lineage_at_rank('genus') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),
+                                             LineagePair(rank='phylum', name='p__b', taxid=None),
+                                             LineagePair(rank='class', name='c__c', taxid=None),
+                                             LineagePair(rank='order', name='o__d', taxid=None),
+                                             LineagePair(rank='family', name='f__f', taxid=None))

From ea7a16392810f92029d6afb85d0c2442f18f0e3c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 17:00:01 -0800
Subject: [PATCH 03/21] fix

---
 tests/test_tax_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 65334e82aa..d2aceb224c 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1309,14 +1309,14 @@ def test_BaseLineageInfo_init_no_ranks():
 
 def test_BaseLineageInfo_init_with_wrong_ranks():
     ranks=["A", "B", "C"]
-    lin_tups = (LineagePair(rank="rank1", name='name1'))
+    lin_tups = [LineagePair(rank="rank1", name='name1')]
     linD = {"rank1": "a"}
     with pytest.raises(ValueError) as exc:
         BaseLineageInfo(lineage=lin_tups, ranks=ranks)
     print(str(exc))
     assert "Rank 'rank1' not present in A, B, C" in str(exc)
     with pytest.raises(ValueError) as exc:
-        BaseLineageInfo(lineage=linD, ranks=ranks)
+        BaseLineageInfo(lineage_dict=linD, ranks=ranks)
     print(str(exc))
     assert "Rank 'rank1' not present in A, B, C" in str(exc)
 

From 23e2480876800e471bbdfb92cadf42793e959c8c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 17:00:23 -0800
Subject: [PATCH 04/21] add v4.4 columns that will be required

---
 tests/test-data/tax/test1.gather.csv | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv
index 80388e8d9d..a349fd2774 100644
--- a/tests/test-data/tax/test1.gather.csv
+++ b/tests/test-data/tax/test1.gather.csv
@@ -1,5 +1,5 @@
-intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename
-442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig
-390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig
-138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig
-338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,1024000,31,1000,2507
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,1024000,31,1000,2507
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,1024000,31,1000,2507
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,1024000,31,1000,2507

From f458dc1dc90739c6019eb4ddd57ff5aca8ec1a12 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 17:54:35 -0800
Subject: [PATCH 05/21] make the newly added query_bp info in test1.gather.csv
 work with existing tests

---
 tests/test-data/tax/test1.gather.csv     |  8 ++++----
 tests/test-data/tax/test1.gather_ani.csv | 10 +++++-----
 tests/test_tax.py                        |  7 +++----
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv
index a349fd2774..8d00401dbb 100644
--- a/tests/test-data/tax/test1.gather.csv
+++ b/tests/test-data/tax/test1.gather.csv
@@ -1,5 +1,5 @@
 intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes
-442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,1024000,31,1000,2507
-390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,1024000,31,1000,2507
-138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,1024000,31,1000,2507
-338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,1024000,31,1000,2507
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,50140000,31,1000,2507
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507
diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv
index 48a09eb199..80388e8d9d 100644
--- a/tests/test-data/tax/test1.gather_ani.csv
+++ b/tests/test-data/tax/test1.gather_ani.csv
@@ -1,5 +1,5 @@
-intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes
-442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970
-390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970
-138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970
-338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename
+442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig
+390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig
+338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig
diff --git a/tests/test_tax.py b/tests/test_tax.py
index b1b3c1b8f8..f44bbfe2a3 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1023,7 +1023,7 @@ def test_genome_rank_human_output(runtmp):
     # test basic genome - output csv
     c = runtmp
 
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     csv_base = "out"
     csvout = runtmp.output(csv_base + '.human.txt')
@@ -1878,7 +1878,7 @@ def test_genome_ani_threshold_input_errors(runtmp):
 
 def test_genome_ani_threshold(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
 
     c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax,
@@ -1889,7 +1889,6 @@ def test_genome_ani_threshold(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err
     assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
     assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out 
 
@@ -1917,7 +1916,7 @@ def test_genome_ani_threshold(runtmp):
 def test_genome_ani_oldgather(runtmp):
     # Ignore ANI if we don't have the information we need to estimate it
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
 
     c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)

From 592993d2c19ec444d88fe733dab176b726cca274 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 5 Jan 2023 17:58:33 -0800
Subject: [PATCH 06/21] rename test1.gather_ani.csv to test1.gather_old.csv to
 reflect its new usage

---
 .../tax/{test1.gather_ani.csv => test1.gather_old.csv}      | 0
 tests/test_tax.py                                           | 6 +++---
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename tests/test-data/tax/{test1.gather_ani.csv => test1.gather_old.csv} (100%)

diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_old.csv
similarity index 100%
rename from tests/test-data/tax/test1.gather_ani.csv
rename to tests/test-data/tax/test1.gather_old.csv
diff --git a/tests/test_tax.py b/tests/test_tax.py
index f44bbfe2a3..ba88ecfe00 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1023,7 +1023,7 @@ def test_genome_rank_human_output(runtmp):
     # test basic genome - output csv
     c = runtmp
 
-    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    g_csv = utils.get_test_data('tax/test1.gather_old.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     csv_base = "out"
     csvout = runtmp.output(csv_base + '.human.txt')
@@ -1841,7 +1841,7 @@ def test_genome_over100percent_error(runtmp):
 
 def test_genome_ani_threshold_input_errors(runtmp):
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    g_csv = utils.get_test_data('tax/test1.gather_old.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     below_threshold = "-1"
 
@@ -1916,7 +1916,7 @@ def test_genome_ani_threshold(runtmp):
 def test_genome_ani_oldgather(runtmp):
     # Ignore ANI if we don't have the information we need to estimate it
     c = runtmp
-    g_csv = utils.get_test_data('tax/test1.gather_ani.csv')
+    g_csv = utils.get_test_data('tax/test1.gather_old.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
 
     c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)

From 06fb8489cc1a4ba53ae9b2574300ff9276e38560 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 07:49:09 -0800
Subject: [PATCH 07/21] moar lineage tests

---
 src/sourmash/tax/tax_utils.py |  2 ++
 tests/test_tax_utils.py       | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index ec731ede5e..02a55f3f8b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -85,6 +85,8 @@ def ascending_taxlist(self):
 
     @property
     def lowest_rank(self):
+        if not self.filled_ranks:
+            return None
         return self.filled_ranks[-1]
 
     def rank_index(self, rank):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index d2aceb224c..beef8f214f 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1243,6 +1243,19 @@ def test_LineagePair_1():
     print(lin1)
 
 
+def test_BaseLineageInfo_init_empty():
+    ranks=["A", "B", "C"]
+    taxinf = BaseLineageInfo(ranks=ranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['', '', ''] # this is a bit odd, but it's what preserves empty ranks...
+    print(taxinf.filled_lineage)
+    assert taxinf.filled_lineage == ()
+    assert taxinf.lowest_lineage_name == ""
+    assert taxinf.filled_ranks == ()
+    assert taxinf.lowest_rank == None 
+
+
 def test_BaseLineageInfo_init_lineage_str():
     x = "a;b;c"
     ranks=["A", "B", "C"]
@@ -1250,6 +1263,11 @@ def test_BaseLineageInfo_init_lineage_str():
     print(taxinf.lineage)
     print(taxinf.lineage_str)
     assert taxinf.zip_lineage()== ['a', 'b', 'c']
+    print(taxinf.filled_lineage)
+    assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None),
+                                     LineagePair(rank='B', name='b', taxid=None),
+                                     LineagePair(rank='C', name='c', taxid=None))
+    assert taxinf.lowest_lineage_name == "C"
 
 
 def test_BaseLineageInfo_init_lineage_tups():
@@ -1278,6 +1296,8 @@ def test_BaseLineageInfo_init_lineage_dict_withtaxid():
     print("zipped lineage: ", taxinf.zip_lineage())
     assert taxinf.zip_lineage()== ['name1', 'name2']
     assert taxinf.zip_taxid()== ['1', '2']
+    assert taxinf.lowest_lineage_taxid == "2"
+    assert taxinf.lowest_lineage_name == "name2"
 
 
 def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq():

From ed055b6d348ae5077a6a010be89162698e159ab8 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 09:21:49 -0800
Subject: [PATCH 08/21] test remaining codecov misses

---
 src/sourmash/tax/tax_utils.py | 51 +++++++++++++-----------------
 tests/test_tax_utils.py       | 59 ++++++++++++++++++++++++++++++++---
 2 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 02a55f3f8b..835da2cf3b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -105,9 +105,9 @@ def filled_lineage(self):
     def lowest_lineage_name(self, null_as_unclassified = False):
         """Return the name of the lowest filled lineage"""
         if not self.filled_ranks:
-            #if null_as_unclassified: # todo: enable me
-            #    return "unclassified"
-            #else:
+            if null_as_unclassified: # todo: enable me
+                return "unclassified"
+            else:
                 return ""
         return self.filled_lineage[-1].name
 
@@ -143,7 +143,7 @@ def _init_from_lineage_tuples(self):
                     rank_idx = self.rank_index(lin_tup.rank)
                 except ValueError as e:
                     raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e
-                # make sure we're adding LineagePairs, not lineagePairs for consistency
+                # make sure we're adding tax_utils.LineagePairs, not lca_utils.LineagePairs for consistency
                 if isinstance(lin_tup, lca_utils.LineagePair):
                     new_lineage[rank_idx] =  LineagePair(rank=lin_tup.rank, name=lin_tup.name)
                 else:
@@ -238,16 +238,28 @@ def display_taxid(self, truncate_empty=True):
         "Return lineage taxids as ';'-separated list"
         return ";".join(self.zip_taxid(truncate_empty=truncate_empty))
 
+    def check_rank_availability(self, rank):
+        if rank in self.ranks: # rank is available
+            return True
+        raise ValueError(f"Desired Rank '{rank}' not available for this lineage.")
+ 
+    def rank_is_filled(self, rank, other=None):
+        self.check_rank_availability(rank)
+        if other is not None:
+            if rank in self.filled_ranks and rank in other.filled_ranks:
+                return True
+        elif rank in self.filled_ranks:
+            return True
+        return False
+
     def is_lineage_match(self, other, rank):
         """
         check to see if two lineages are a match down to given rank.
         """
         if not other.ranks == self.ranks: # check same ranks
             raise ValueError("Cannot compare lineages from taxonomies with different ranks.")
-        if rank not in self.ranks: # rank is available
-            raise ValueError(f"Desired Rank {rank} not available for this lineage")
         # always return false if rank is not filled in either of the two lineages
-        if rank in self.filled_ranks and rank in other.filled_ranks:
+        if self.rank_is_filled(rank, other=other):
             rank_idx = self.rank_index(rank)
             a_lin = self.lineage[:rank_idx+1]
             b_lin = other.lineage[:rank_idx+1]
@@ -257,10 +269,8 @@ def is_lineage_match(self, other, rank):
 
     def pop_to_rank(self, rank):
         "Return new LineageInfo with ranks only filled to desired rank"
-        if rank not in self.ranks:
-            raise ValueError(f"Desired Rank '{rank}' not available for this lineage")
         # are we already above rank?
-        if rank not in self.filled_ranks:
+        if not self.rank_is_filled(rank):
             return replace(self)
         # if not, make filled_lineage at this rank + use to generate new LineageInfo
         new_lineage = self.lineage_at_rank(rank)
@@ -272,10 +282,8 @@ def pop_to_rank(self, rank):
     def lineage_at_rank(self, rank):
         # non-descructive pop_to_rank. Returns tuple of LineagePairs
         "Returns tuple of LineagePairs at given rank."
-        if rank not in self.ranks:
-            raise ValueError(f"Desired Rank '{rank}' not available for this lineage")
         # are we already above rank?
-        if rank not in self.filled_ranks:
+        if not self.rank_is_filled(rank):
             return self.filled_lineage
         # if not, return lineage tuples down to desired rank
         rank_idx = self.rank_index(rank)
@@ -302,23 +310,6 @@ def __post_init__(self):
             self._init_empty()
 
 
-@dataclass(frozen=True, order=True)
-class LINSLineageInfo(BaseLineageInfo):
-    """Class for storing multi-rank lineage information"""
-    num_positions: int = None
-    ## WHAT special considerations do we have here?
-    def __post_init__(self):
-        "Initialize according to passed values"
-        if self.lineage != [LineagePair()]:
-            self._init_from_lineage_tuples()
-        elif self.lineage_str is not None and self.ranks:
-            self._init_from_lineage_str()
-        elif self.ranks:
-            self._init_empty()
-        else:
-            raise ValueError("Cannot initialize LINSLineageInfo. Please provide lineage or rank info.")
-
-
 def get_ident(ident, *,
               keep_full_identifiers=False, keep_identifier_versions=False):
     # split identifiers = split on whitespace
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index beef8f214f..8580fb8ee9 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1252,8 +1252,11 @@ def test_BaseLineageInfo_init_empty():
     print(taxinf.filled_lineage)
     assert taxinf.filled_lineage == ()
     assert taxinf.lowest_lineage_name == ""
+    assert taxinf.lowest_lineage_taxid == ""
     assert taxinf.filled_ranks == ()
-    assert taxinf.lowest_rank == None 
+    assert taxinf.lowest_rank == None
+    assert taxinf.display_lineage() == ""
+    assert taxinf.display_lineage(null_as_unclassified=True) == "unclassified"
 
 
 def test_BaseLineageInfo_init_lineage_str():
@@ -1267,7 +1270,17 @@ def test_BaseLineageInfo_init_lineage_str():
     assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None),
                                      LineagePair(rank='B', name='b', taxid=None),
                                      LineagePair(rank='C', name='c', taxid=None))
-    assert taxinf.lowest_lineage_name == "C"
+    assert taxinf.lowest_lineage_name == "c"
+
+def test_BaseLineageInfo_init_lineage_str_comma_sep():
+    x = "a,b,c"
+    ranks=["A", "B", "C"]
+    taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', 'c']
+    print(taxinf.filled_lineage)
+    assert taxinf.lowest_lineage_name == "c"
 
 
 def test_BaseLineageInfo_init_lineage_tups():
@@ -1279,7 +1292,26 @@ def test_BaseLineageInfo_init_lineage_tups():
     assert taxinf.zip_lineage()== ['a', '', 'b']
 
 
-def test_BaseLineageInfo_init_lineage_dict():
+def test_BaseLineageInfo_init_lca_lineage_tups():
+    ranks=["A", "B", "C"]
+    lin_tups = (lca_utils.LineagePair(rank="A", name='a'), lca_utils.LineagePair(rank="C", name='b'))
+    taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', '', 'b']
+
+
+def test_BaseLineageInfo_init_lineage_dict_fail():
+    ranks=["A", "B", "C"]
+    lin_tups = (lca_utils.LineagePair(rank="A", name='a'), lca_utils.LineagePair(rank="C", name='b'))
+    with pytest.raises(ValueError) as exc:
+        taxinf = BaseLineageInfo(ranks=ranks, lineage_dict=lin_tups)
+    print(str(exc))
+
+    assert "is not dictionary" in str(exc)
+
+
+def test_BaseLineageInfo_init_lineage_dict  ():
     x = {'rank1': 'name1', 'rank2': 'name2'}
     taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
     print("ranks: ", taxinf.ranks)
@@ -1296,7 +1328,7 @@ def test_BaseLineageInfo_init_lineage_dict_withtaxid():
     print("zipped lineage: ", taxinf.zip_lineage())
     assert taxinf.zip_lineage()== ['name1', 'name2']
     assert taxinf.zip_taxid()== ['1', '2']
-    assert taxinf.lowest_lineage_taxid == "2"
+    assert taxinf.lowest_lineage_taxid == 2
     assert taxinf.lowest_lineage_name == "name2"
 
 
@@ -1364,6 +1396,14 @@ def test_RankLineageInfo_init_lineage_str():
     print(taxinf.lineage_str)
     assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', '']
 
+def test_RankLineageInfo_init_lineage_str_with_ranks_as_list():
+    x = "a;b;c"
+    taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
+    taxinf = RankLineageInfo(lineage_str=x, ranks=taxranks)
+    print(taxinf.lineage)
+    print(taxinf.lineage_str)
+    assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '']
+
 
 def test_RankLineageInfo_init_lineage_tups():
     x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b'))
@@ -1577,6 +1617,17 @@ def test_is_lineage_match_incorrect_ranks():
     assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc)
 
 
+def test_is_lineage_match_improper_rank():
+    #test comparison with incompatible ranks
+    lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e')
+    lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f')
+    print(lin1.lineage)
+    with pytest.raises(ValueError) as exc:
+        lin1.is_lineage_match(lin2, 'NotARank')
+    print(str(exc))
+    assert "Desired Rank 'NotARank' not available for this lineage" in str(exc)
+
+
 def test_pop_to_rank_1():
     # basic behavior - pop to order?
     lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d')

From c15ff78bc82a56f97388e0c48b32ad45d6fa175c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 09:58:19 -0800
Subject: [PATCH 09/21] whoops, one last codecov miss

---
 src/sourmash/tax/tax_utils.py | 7 ++-----
 tests/test_tax_utils.py       | 1 +
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 835da2cf3b..875d8e40d9 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -102,13 +102,10 @@ def filled_lineage(self):
         return self.lineage[:lowest_filled_rank_idx+1]
 
     @property
-    def lowest_lineage_name(self, null_as_unclassified = False):
+    def lowest_lineage_name(self):
         """Return the name of the lowest filled lineage"""
         if not self.filled_ranks:
-            if null_as_unclassified: # todo: enable me
-                return "unclassified"
-            else:
-                return ""
+            return ""
         return self.filled_lineage[-1].name
 
     @property
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 8580fb8ee9..d61d7596a1 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1271,6 +1271,7 @@ def test_BaseLineageInfo_init_lineage_str():
                                      LineagePair(rank='B', name='b', taxid=None),
                                      LineagePair(rank='C', name='c', taxid=None))
     assert taxinf.lowest_lineage_name == "c"
+    assert taxinf.lowest_rank == "C"
 
 def test_BaseLineageInfo_init_lineage_str_comma_sep():
     x = "a,b,c"

From 766a8b9ddfcd26405aed3e41634c1a5b353325ee Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 10:31:18 -0800
Subject: [PATCH 10/21] add tax summarization classes; rename old NamedTuples
 to avoid breakage

---
 src/sourmash/tax/tax_utils.py | 513 +++++++++++++++++++++++++++++++++-
 tests/test_tax_utils.py       | 128 ++++++++-
 2 files changed, 613 insertions(+), 28 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 875d8e40d9..db1bb5b593 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -7,7 +7,7 @@
 from collections import abc
 from itertools import zip_longest
 from typing import NamedTuple
-from dataclasses import dataclass, field, replace
+from dataclasses import dataclass, field, replace, asdict
 import gzip
 
 from sourmash import sqlite_utils, sourmash_args
@@ -24,15 +24,15 @@
            'aggregate_by_lineage_at_rank', 'format_for_krona',
            'write_krona', 'write_summary', 'write_classifications',
            'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
-           'MultiLineageDB']
+           'MultiLineageDB', 'RankLineageInfo']
 
 from sourmash.logging import notify
 from sourmash.sourmash_args import load_pathlist_from_file
 
 # CTB: these could probably usefully be converted into dataclasses.
-QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes, total_weighted_hashes")
-SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank, total_weighted_hashes")
-ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank")
+QInfo = namedtuple("QInfo", "query_md5, query_filename, query_bp, query_hashes, total_weighted_hashes")
+SumGathInf = namedtuple("SumGathInf", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank, total_weighted_hashes")
+ClassInf = namedtuple("ClassInf", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank")
 
 # Essential Gather column names that must be in gather_csv to allow `tax` summarization
 EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename')
@@ -494,7 +494,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 query_bp = unique_intersect_bp + int(row['remaining_bp'])
         
         # store query info
-        query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes=query_nhashes, total_weighted_hashes=total_weighted_hashes)
+        query_info[query_name] = QInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes=query_nhashes, total_weighted_hashes=total_weighted_hashes)
         
         if estimate_query_ani and (not ksize or not scaled):
             if not set_ksize:
@@ -528,7 +528,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
         sum_uniq_weighted[query_name][lineage] += f_uniq_weighted
         sum_uniq_bp[query_name][lineage] += unique_intersect_bp
 
-    # sort and store each as SummarizedGatherResult
+    # sort and store each as SumGathInf
     sum_uniq_to_query_sorted = []
     for query_name, lineage_weights in sum_uniq_to_query.items():
         qInfo = query_info[query_name]
@@ -546,7 +546,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
             if estimate_query_ani:
                 query_ani = containment_to_distance(fraction, ksize, scaled,
                                                     n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani
-            sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5,
+            sres = SumGathInf(query_name, rank, fraction, lineage, qInfo.query_md5,
                                           qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled)
             sum_uniq_to_query_sorted.append(sres)
         else:
@@ -567,7 +567,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
                 if estimate_query_ani:
                     query_ani = containment_to_distance(fraction, ksize, scaled,
                                                         n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani
-                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5,
+                sres = SumGathInf(query_name, rank, fraction, lineage, query_md5,
                                               query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled)
                 sum_uniq_to_query_sorted.append(sres)
 
@@ -578,7 +578,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
             if fraction > 0:
                 f_weighted_at_rank = 1.0 - total_f_weighted
                 bp_intersect_at_rank = qInfo.query_bp - total_bp_classified
-                sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5,
+                sres = SumGathInf(query_name, rank, fraction, lineage, query_md5,
                                               query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes*scaled)
                 sum_uniq_to_query_sorted.append(sres)
 
@@ -616,7 +616,7 @@ def make_krona_header(min_rank, *, include_strain=False):
 
 def aggregate_by_lineage_at_rank(rank_results, *, by_query=False):
     '''
-    Aggregate list of rank SummarizedGatherResults,
+    Aggregate list of rank SumGathInfs,
     keeping query info or aggregating across queries.
     '''
     lineage_summary = defaultdict(float)
@@ -635,7 +635,7 @@ def aggregate_by_lineage_at_rank(rank_results, *, by_query=False):
 
 def format_for_krona(rank, summarized_gather):
     '''
-    Aggregate list of SummarizedGatherResults and format for krona output
+    Aggregate list of SumGathInfs and format for krona output
     '''
     num_queries=0
     for res_rank, rank_results in summarized_gather.items():
@@ -685,7 +685,7 @@ def write_summary(summarized_gather, csv_fp, *, sep=',', limit_float_decimals=Fa
     '''
     Write taxonomy-summarized gather results for each rank.
     '''
-    header = SummarizedGatherResult._fields
+    header = SumGathInf._fields
     w = csv.DictWriter(csv_fp, header, delimiter=sep)
     w.writeheader()
     for rank, rank_results in summarized_gather.items():
@@ -840,7 +840,7 @@ def write_classifications(classifications, csv_fp, *, sep=',', limit_float_decim
     '''
     Write taxonomy-classifed gather results.
     '''
-    header = ClassificationResult._fields
+    header = ClassInf._fields
     w = csv.DictWriter(csv_fp, header, delimiter=sep)
     w.writeheader()
     for rank, rank_results in classifications.items():
@@ -1448,3 +1448,488 @@ def load(cls, locations, **kwargs):
                 tax_assign.add(this_tax_assign)
 
         return tax_assign
+
+
+# strategy from: https://subscription.packtpub.com/book/programming/9781800207455/10/ch10lvl1sec01/using-dataclasses-to-simplify-working-with-csv-files
+@dataclass
+class GatherRow(): # all cols should match "gather_write_cols" in `search.py`
+   # essential columns
+   query_name: str
+   name: str # match_name
+   f_unique_weighted: float
+   f_unique_to_query: float
+   unique_intersect_bp: int
+   remaining_bp: int
+   query_md5: str
+   query_filename: str
+   # new essential cols: requires 4.4x
+   query_bp: int
+   ksize: int
+   scaled: int
+
+   # non-essential
+   intersect_bp: int = None
+   f_orig_query: float = None
+   f_match: float = None
+   average_abund: float = None
+   median_abund: float = None
+   std_abund: float = None
+   filename: str = None
+   md5: str = None
+   f_match_orig: float = None
+   gather_result_rank: str = None
+   moltype: str = None
+   query_n_hashes: int = None
+   query_abundance: int = None
+   query_containment_ani: float = None
+   match_containment_ani: float = None
+   average_containment_ani: float = None
+   max_containment_ani: float = None
+   potential_false_negative: bool = None
+   n_unique_weighted_found: int = None
+   sum_weighted_found: int = None
+   total_weighted_hashes: int = None
+
+
+@dataclass(frozen=True)
+class QueryInfo():
+   """Class for storing per-rank lineage information"""
+   query_name: str
+   query_md5: str
+   query_filename: str
+   query_bp: int
+   query_n_hashes: int
+   ksize: int
+   scaled: int
+   total_weighted_hashes: int = 0
+
+   @property
+   def total_weighted_bp(self):
+       return self.total_weighted_hashes * self.scaled
+
+@dataclass
+class TaxResult():
+    raw: GatherRow
+    # can we get rid of these / just choose default ident hacking/slashing for future?
+    keep_full_identifiers: bool = False
+    keep_identifier_versions: bool = False
+
+    query_name: str = field(init=False)
+    query_info: QueryInfo = field(init=False)
+    match_ident: str = field(init=False)
+    lineageInfo: RankLineageInfo = RankLineageInfo()
+    skipped_ident: bool = False
+    missed_ident: bool = False
+    match_lineage_attempted: bool = False
+
+    def __post_init__(self):
+        self.get_ident()
+        self.query_name = self.raw.query_name
+        if self.raw.query_n_hashes:
+            self.raw.query_n_hashes = int(self.raw.query_n_hashes)
+        if self.raw.total_weighted_hashes:
+            self.raw.total_weighted_hashes = int(self.raw.total_weighted_hashes)
+        else:
+            self.raw.total_weighted_hashes = 0
+
+        self.query_info = QueryInfo(query_name = self.raw.query_name,
+                                  query_md5=self.raw.query_md5,
+                                  query_filename = self.raw.query_filename,
+                                  query_bp = int(self.raw.query_bp),
+                                  query_n_hashes = self.raw.query_n_hashes,
+                                  total_weighted_hashes = self.raw.total_weighted_hashes,
+                                  ksize = int(self.raw.ksize),
+                                  scaled = int(self.raw.scaled)
+                                  )
+        # cast and store the imp bits
+        self.f_unique_to_query = float(self.raw.f_unique_to_query)
+        self.f_unique_weighted = float(self.raw.f_unique_weighted)
+        self.unique_intersect_bp = int(self.raw.unique_intersect_bp)
+
+    def get_ident(self):
+        # split identifiers = split on whitespace
+        # keep identifiers = don't split .[12] from assembly accessions
+        "Hack and slash identifiers."
+        self.match_ident = self.raw.name
+        if not self.keep_full_identifiers:
+            self.match_ident = self.raw.name.split(' ')[0]
+        else:
+            #overrides version bc can't keep full without keeping version
+            self.keep_identifier_versions = True
+        if not self.keep_identifier_versions:
+            self.match_ident = self.match_ident.split('.')[0]
+
+
+    def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False):
+        if skip_idents and self.match_ident in skip_idents:
+            self.skipped_ident = True
+        else:
+            lin = tax_assignments.get(self.match_ident)
+            if lin:
+                self.lineageInfo = RankLineageInfo(lineage=lin)
+            else:
+                self.missed_ident=True
+        self.match_lineage_attempted = True
+        if self.missed_ident and fail_on_missing_taxonomy:
+            raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy")
+#            raise ValueError('Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.')
+
+@dataclass
+class SummarizedGatherResult():
+#   """Class for storing summarized lineage information"""
+    rank: str
+    fraction: float
+    lineage: RankLineageInfo
+    f_weighted_at_rank: float
+    bp_match_at_rank: int
+    query_ani_at_rank: float
+
+    def as_summary_dict(self, query_info=None, limit_float=False):
+        sD = asdict(self)
+        if self.lineage == (): # to do -- get rid of me by using null RankLineageInfo() instead of () for empties
+            sD['lineage'] = 'unclassified'
+        else:
+            sD['lineage'] = self.lineage.display_lineage()
+        if query_info is not None:
+            sD['query_name'] = query_info.query_name
+            sD['query_md5'] = query_info.query_md5
+            sD['query_filename'] = query_info.query_filename
+            sD['total_weighted_hashes'] = query_info.total_weighted_hashes
+        if limit_float:
+            sD['fraction'] = f'{self.fraction:.3f}'
+            sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}'
+
+        return(sD)
+
+    def as_human_friendly_dict(self, query_info = None):
+        sD = self.as_summary_dict(query_info=query_info, limit_float=True)
+        sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%"
+        if sD['query_ani_at_rank'] is not None:
+            sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%"
+        else:
+            sD['query_ani_at_rank'] = '-    '
+        return sD
+
+    def as_kreport_dict(self, rankCode, total_weighted_hashes):
+         sD = asdict(self)
+         this_rank = self.lineage.lowest_rank
+         this_sciname = self.lineage.lowest_lineage_name
+         sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid
+         sD['sci_name'] = this_sciname
+         #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
+         sD['rank_code'] = rankCode[this_rank]
+         sD['num_bp_assigned'] = 0
+         # total percent containment, weighted to include abundance info
+         sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
+         sD["num_bp_contained"] = int(self.f_weighted_at_rank * total_weighted_hashes)
+         # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
+         # at genome level, but since kreport traditionally doesn't include 'strain' or genome,
+         # it is reasonable to state that sourmash assigns at 'species' level for this.
+         # can be modified later.
+         lowest_assignment_rank = 'species'
+         if this_rank == lowest_assignment_rank or this_sciname == "unclassified":
+            sD["num_bp_assigned"] = sD["num_bp_contained"]
+
+@dataclass
+class ClassificationResult(SummarizedGatherResult):
+#   """Class for storing summarized lineage information"""
+    status: str
+
+
+@dataclass
+class QueryTaxResult():
+    """Store all TaxResults for a query. Enable summarization."""
+    query_info: QueryInfo # initialize with QueryInfo dataclass
+
+    def __post_init__(self):
+        self.query_name = self.query_info.query_name # for convenience
+        self._init_taxresult_vars()
+        self._init_summarization_vars()
+
+    def _init_taxresult_vars(self):
+        self.ranks = []
+        self.raw_taxresults = []
+        self.skipped_idents= set()
+        self.missed_idents = set()
+        self.n_missed = 0
+        self.n_skipped = 0
+        self.perfect_match = set()
+
+    def _init_summarization_vars(self):
+        self.sum_uniq_weighted = defaultdict(lambda: defaultdict(float))
+        self.sum_uniq_to_query = defaultdict(lambda: defaultdict(float))
+        self.sum_uniq_bp = defaultdict(lambda: defaultdict(int))
+        self.summarized_ranks = []
+        self._init_summarization_results()
+
+    def _init_summarization_results(self):
+        self.total_f_weighted = defaultdict(float) #0.0
+        self.total_f_classified = defaultdict(float)#0.0
+        self.total_bp_classified = defaultdict(int) #0
+        self.summarized_lineage_results = defaultdict(list)
+
+    def _init_classification_results(self):
+        self.status = 'nomatch'
+        self.classified_ranks = []
+        self.classification_result = None
+        self.krona_classification_result = None
+        self.krona_unclassified_result = None
+        self.krona_classification_header = []
+
+    def is_compatible(self, taxresult):
+        return taxresult.query_info == self.query_info
+
+    @property
+    def ascending_ranks(self):
+        if not self.ranks:
+            return []
+        else:
+            return self.ranks[::-1]
+
+    def add_taxresult(self, taxresult):
+        # check that all query parameters match
+        if self.is_compatible(taxresult=taxresult):
+            if not taxresult.match_lineage_attempted:
+                raise ValueError("Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first.")
+            if not self.ranks:
+                self.ranks = taxresult.lineageInfo.ranks
+            if taxresult.skipped_ident:
+                self.n_skipped +=1
+                self.skipped_idents.add(taxresult.match_ident)
+            elif taxresult.missed_ident:
+                self.n_missed +=1
+                self.missed_idents.add(taxresult.match_ident)
+            self.raw_taxresults.append(taxresult)
+        else:
+            raise ValueError("Error: Cannot add TaxResult: query information does not match.")
+
+    def summarize_up_ranks(self, single_rank=None, force_resummarize=False):
+        if self.summarized_ranks: # has already been summarized
+            if force_resummarize:
+                self._init_summarization_vars()
+            else:
+                raise ValueError("Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. Use 'force_resummarize=True' to reset and resummarize")
+        # set ranks levels to summarize
+        self.summarized_ranks = self.ascending_ranks
+        if single_rank:
+            if single_rank not in self.summarized_ranks:
+                raise ValueError(f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})")
+            self.summarized_ranks = [single_rank]
+        notify(f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} ")
+        for taxres in self.raw_taxresults:
+            lininfo = taxres.lineageInfo
+            if lininfo and lininfo.filled_lineage: # won't always have lineage to summarize (skipped idents, missed idents)
+                # notify + track perfect matches
+                if taxres.f_unique_to_query >= 1.0:
+                    if taxres.match_ident not in self.perfect_match:
+                        notify(f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?")
+                        self.perfect_match.add(taxres.match_ident)
+                # add this taxresult to summary
+                for rank in self.summarized_ranks:
+                    if rank in lininfo.filled_ranks: # only store if this rank is filled.
+                        lin_at_rank = lininfo.pop_to_rank(rank)
+                        self.sum_uniq_weighted[rank][lin_at_rank] += taxres.f_unique_weighted
+                        self.sum_uniq_to_query[rank][lin_at_rank] += taxres.f_unique_to_query
+                        self.sum_uniq_bp[rank][lin_at_rank] += taxres.unique_intersect_bp
+        # reset ranks levels to the ones that were actually summarized + that we can access for summarized result
+        self.summarized_ranks = [x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys()]
+        if single_rank and single_rank not in self.summarized_ranks:
+            raise ValueError(f"Error: rank '{single_rank}' was not available for any matching lineages.")
+
+    def build_summarized_result(self, single_rank=None, force_resummarize=False):
+        # just reset if we've already built summarized result (avoid adding to existing)? Or write in an error/force option?
+        self._init_summarization_results()
+        # if taxresults haven't been summarized, do that first
+        if not self.summarized_ranks or force_resummarize:
+            self.summarize_up_ranks(single_rank=single_rank, force_resummarize=force_resummarize)
+        # catch potential error from running summarize_up_ranks separately and passing in different single_rank
+        if single_rank and single_rank not in self.summarized_ranks:
+            raise ValueError(f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}")
+        # rank loop is currently done in __main__
+        for rank in self.summarized_ranks[::-1]:  # reverse so that results are in descending order
+            sum_uniq_to_query = self.sum_uniq_to_query[rank] #should be lineage: value
+            # first, sort
+            sorted_sum_uniq_to_query = list(sum_uniq_to_query.items())
+            sorted_sum_uniq_to_query.sort(key = lambda x: -x[1])
+            for lineage, f_unique in sorted_sum_uniq_to_query:
+                query_ani = None
+                if f_unique > 1:
+                    raise ValueError(f"The tax summary of query '{self.query_name}' is {f_unique}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
+                elif f_unique == 0: #no annotated results for this query. do we need to handle this differently now?
+                    continue
+                f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage]
+                bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage]
+
+                # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank)
+                # now that I pulled best_only into separate fn, these don't need to be dicts...
+                self.total_f_classified[rank] += f_unique
+                self.total_f_weighted[rank] += f_weighted_at_rank
+                self.total_bp_classified[rank] += bp_intersect_at_rank
+
+                query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled,
+                                                    n_unique_kmers=self.query_info.query_hashes,
+                                                    sequence_len_bp=self.query_info.query_bp).ani
+                sres = SummarizedGatherResult(lineage=lineage, rank=rank,
+                                              f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique,
+                                              bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani)
+                self.summarized_lineage_results[rank].append(sres)
+
+            # record unclassified
+            lineage = ()
+            query_ani = None
+            f_unique = 1.0 - self.total_f_classified[rank]
+            if f_unique > 0:
+                f_weighted_at_rank = 1.0 - self.total_f_weighted[rank]
+                bp_intersect_at_rank = self.query_info.query_bp - self.total_bp_classified[rank]
+                sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank,
+                                              fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani)
+                self.summarized_lineage_results[rank].append(sres)
+
+    def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False):
+        if containment_threshold and not 0 <= containment_threshold <= 1:
+            raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).")
+        if ani_threshold and not 0 <= ani_threshold <= 1:
+            raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).")
+        self._init_classification_results() # init some fields
+        #for sg in self.summarized_lineage_results:
+        if not self.summarized_ranks or force_resummarize:
+            self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize)
+        # catch potential error from running summarize_up_ranks separately and passing in different single_rank
+        self.classified_ranks = self.summarized_ranks
+        # if a rank is provided, we need to classify ONLY using that rank
+        if rank:
+            if rank not in self.summarized_ranks:
+                raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}")
+            else:
+                self.classified_ranks = [rank]
+        # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold
+        for this_rank in self.classified_ranks: # ascending order or just single rank
+            # reset for this rank
+            f_weighted=0.0
+            f_unique_at_rank=0.0
+            bp_intersect_at_rank=0
+            #sum_uniq_to_query = self.sum_uniq_to_query[this_rank]
+            sum_uniq_weighted = self.sum_uniq_weighted[this_rank]  ##SHOULD WE BE USING WEIGHTED HERE? I THINK YES, but this is a change from before.
+            # sort the results and grab best
+            sorted_sum_uniq_weighted = list(sum_uniq_weighted.items())
+            sorted_sum_uniq_weighted.sort(key = lambda x: -x[1])
+            # best only
+            this_lineage, f_weighted = sorted_sum_uniq_weighted[0]
+            if f_weighted > 1:
+                raise ValueError(f"The tax classification of query '{self.query_name}' is {f_weighted}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
+            elif f_weighted == 0: #no annotated results for this query. do we need to handle this differently?
+                continue
+            else:
+                status="below_threshold"
+            f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage]
+            query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled,
+                                                n_unique_kmers=self.query_info.query_hashes,
+                                                sequence_len_bp=self.query_info.query_bp).ani
+            # set classification status based on thresholds
+            if ani_threshold: # if provided, just use ani thresh
+                if query_ani and query_ani >= ani_threshold:
+                    status = 'match'
+                else:
+                    status = 'below_threshold'
+            elif f_weighted >= containment_threshold:
+                status = 'match'
+            # determine whether to move on to a higher tax rank (if avail)
+            if status == 'match':
+                break
+
+        if this_rank == "superkingdom" and status == "nomatch":
+            status="below_threshold"
+
+        # what happens when we don't have a gather match at all?
+        bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
+
+        classif = ClassificationResult(status=status, rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage,
+                                       f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank,
+                                       query_ani_at_rank= query_ani)
+        self.classification_result = classif
+
+        if rank is not None:
+            lin_as_list = this_lineage.display_lineage().split(';')
+            krona_classification = (f_weighted, *lin_as_list)
+            self.krona_classification_result = (krona_classification)
+            # handle unclassified - do we want/need this?
+            unclassified_fraction= 1.0-f_weighted
+            len_unclassified_lin = len(lin_as_list)
+            unclassifed_lin = ["unclassified"]*(len_unclassified_lin)
+            self.krona_unclassified_result = (unclassified_fraction, *unclassifed_lin)
+            self.krona_classification_header=self.make_krona_header(min_rank=rank)
+
+    def make_krona_header(self, min_rank):
+        "make header for krona output"
+        if min_rank not in self.summarized_ranks:
+            raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.")
+        else:
+            rank_index = self.ranks.index(min_rank)
+        return ["fraction"] + list(self.ranks[:rank_index+1])
+
+    def make_human_summary(self, display_rank, classification=False):
+        results = []
+        if classification:
+            if not self.classification_result:
+                raise ValueError("query not classified yet.")
+            display_rank_results = [self.classification_result]
+        else:
+            if not self.summarized_lineage_results:
+                raise ValueError("lineages not summarized yet.")
+            display_rank_results = self.summarized_lineage_results[display_rank]
+            display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank)
+
+        for res in display_rank_results:
+            results.append(res.as_human_friendly_dict())
+        return results
+
+    def make_full_summary(self, classification=False, limit_float=False):
+        results = []
+        if classification:
+            header= ["query_name", "status", "rank", "fraction", "lineage",
+                     "query_md5", "query_filename", "f_weighted_at_rank",
+                     "bp_match_at_rank", "query_ani_at_rank"]
+            if not self.classification_result:
+                raise ValueError("query not classified yet.")
+            rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float)
+            results.append(rD)
+        else:
+            header= ["query_name", "rank", "fraction", "lineage", "query_md5",
+                     "query_filename", "f_weighted_at_rank", "bp_match_at_rank",
+                     "query_ani_at_rank", "total_weighted_hashes"]
+            if not self.summarized_lineage_results:
+                raise ValueError("lineages not summarized yet.")
+
+            for rank in self.summarized_ranks[::-1]:
+                rank_results = self.summarized_lineage_results[rank]
+                rank_results.sort(key=lambda res: -res.f_weighted_at_rank)
+                for res in rank_results:
+                    results.append(res.as_summary_dict(query_info=self.query_info, limit_float=limit_float))
+        return header, results
+
+    def make_kreport_results(self):
+        rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
+                        "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"}
+        if self.query_info.total_weighted_hashes == 0:
+            raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0")
+        required_ranks = set(rankCode.keys).pop('unclassified')
+        if not set(required_ranks).issubset(set(self.ranks)):
+            raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}")
+        kreport_results = []
+        unclassified_recorded=False
+        # want to order results descending by rank
+        for rank in self.ranks:
+            if rank == 'strain': # no code for strain, can't include in this output afaik
+                continue
+            rank_results = self.summarized_lineage_results[rank]
+            for res in rank_results:
+                kresD = res.as_kreport_dict(rankCode, self.query_info.total_weighted_hashes)
+                if kresD['sci_name'] == "unclassified":
+                    # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank.
+                    # Here, we only need to report it once, since it will be the same fraction for all ranks
+                    if unclassified_recorded:
+                        continue
+                    else:
+                        unclassified_recorded = True
+                kreport_results.append(kresD)
+        return(kreport_results)
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index d61d7596a1..20e53092f4 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -14,15 +14,17 @@
                                     write_summary, MultiLineageDB,
                                     collect_gather_csvs, check_and_load_gather_csvs,
                                     SummarizedGatherResult, ClassificationResult,
+                                    QueryInfo, GatherRow, TaxResult, QueryTaxResult,
+                                    BaseLineageInfo, RankLineageInfo, LineagePair,
                                     write_classifications,
                                     aggregate_by_lineage_at_rank,
                                     make_krona_header, format_for_krona, write_krona,
                                     combine_sumgather_csvs_by_lineage, write_lineage_sample_frac,
-                                    LineageDB, LineageDB_Sqlite)
+                                    LineageDB, LineageDB_Sqlite,
+                                    SumGathInf, ClassInf, QInfo)
 
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
-from sourmash.tax.tax_utils import LineagePair, BaseLineageInfo, RankLineageInfo
 
 # utility functions for testing
 def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False):
@@ -45,6 +47,68 @@ def make_mini_taxonomy(tax_info):
     return taxD
 
 
+def make_GatherRow(gather_dict=None, exclude_cols=[]):
+    """Load artificial gather row (dict) into GatherRow class"""
+    # default contains just the essential cols
+    gatherD = {'query_name': 'q1',
+               'query_md5': 'md5',
+               'query_filename': 'query_fn',
+               'name': 'gA',
+               'f_unique_weighted': 0.2,
+               'f_unique_to_query': 0.1,
+               'query_bp':100,
+               'unique_intersect_bp': 20,
+               'remaining_bp': 1,
+               'ksize': 31,
+               'scaled': 1}
+    if gather_dict is not None:
+        gatherD.update(gather_dict)
+    for col in exclude_cols:
+        gatherD.pop(col)
+    gatherRaw = GatherRow(**gatherD)
+    return gatherRaw
+
+
+def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None):
+    """Make TaxResult from artificial gather row (dict)"""
+    gRow = make_GatherRow(gather_dict)
+    taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, keep_identifier_versions=keep_ident_version)
+    if taxD is not None:
+        taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents)
+    return taxres
+
+
+def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_ident=False, keep_ident_version=False,
+                        skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None):
+    """Make QueryTaxResult(s) from artificial gather information, formatted as list of gather rows (dicts)"""
+    gather_results = {}
+    this_querytaxres = None
+    for gather_infoD in gather_info:
+        taxres = make_TaxResult(gather_infoD, taxD=taxD,  keep_full_ident=keep_full_ident,
+                                keep_ident_version=keep_ident_version, skip_idents=skip_idents)
+        query_name = taxres.query_name
+        # add to matching QueryTaxResult or create new one
+        if not this_querytaxres or not this_querytaxres.is_compatible(taxres):
+            # get existing or initialize new
+            this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info))
+        this_querytaxres.add_taxresult(taxres)
+#        print('missed_ident?', taxres.missed_ident)
+        gather_results[query_name] = this_querytaxres
+    if summarize:
+        for query_name, qres in gather_results.items():
+            qres.build_summarized_result()
+    if classify:
+        for query_name, qres in gather_results.items():
+            qres.build_classification_result(rank=classify_rank, containment_threshold=c_thresh, ani_threshold=ani_thresh)
+    # for convenience: If working with single query, just return that QueryTaxResult.
+    if single_query:
+        if len(gather_results.keys()) > 1:
+            raise ValueError("You passed in results for more than one query")
+        else:
+            return next(iter(gather_results.values()))
+    return gather_results
+
+
 ## tests
 def test_ascending_taxlist_1():
     assert list(ascending_taxlist()) ==  ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
@@ -54,24 +118,60 @@ def test_ascending_taxlist_2():
     assert list(ascending_taxlist(include_strain=False)) ==  ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
 
 
+def test_GatherRow_old_gather():
+    # gather does not contain query_name column
+    gA = {"name": "gA.1 name"}
+    with pytest.raises(TypeError) as exc:
+        make_GatherRow(gA, exclude_cols=['query_bp'])
+    print(str(exc))
+    assert "__init__() missing 1 required positional argument: 'query_bp'" in str(exc)
+
+
 def test_get_ident_default():
     ident = "GCF_001881345.1"
     n_id = get_ident(ident)
     assert n_id == "GCF_001881345"
 
 
+def test_TaxResult_get_ident_default():
+    gA = {"name": "GCF_001881345.1"}  # gather result with match name as GCF_001881345.1
+    taxres = make_TaxResult(gA)
+    print(taxres.match_ident)
+    assert taxres.match_ident == "GCF_001881345"
+
+
 def test_get_ident_split_but_keep_version():
-    ident = "GCF_001881345.1"
+    ident = "GCF_001881345.1 secondname"
     n_id = get_ident(ident, keep_identifier_versions=True)
     assert n_id == "GCF_001881345.1"
 
 
+def test_TaxResult_get_ident_split_but_keep_version():
+    gA = {"name": "GCF_001881345.1 secondname"}
+    taxres = make_TaxResult(gA, keep_ident_version=True)
+    print("raw ident: ", taxres.raw.name)
+    print("keep_full?: ", taxres.keep_full_identifiers)
+    print("keep_version?: ",taxres.keep_identifier_versions)
+    print("final ident: ", taxres.match_ident)
+    assert taxres.match_ident == "GCF_001881345.1"
+
+
 def test_get_ident_no_split():
     ident = "GCF_001881345.1 secondname"
     n_id = get_ident(ident, keep_full_identifiers=True)
     assert n_id == "GCF_001881345.1 secondname"
 
 
+def test_TaxResult_get_ident_keep_full():
+    gA = {"name": "GCF_001881345.1 secondname"}
+    taxres = make_TaxResult(gA, keep_full_ident=True)
+    print("raw ident: ", taxres.raw.name)
+    print("keep_full?: ", taxres.keep_full_identifiers)
+    print("keep_version?: ",taxres.keep_identifier_versions)
+    print("final ident: ", taxres.match_ident)
+    assert taxres.match_ident == "GCF_001881345.1 secondname"
+
+
 def test_collect_gather_csvs(runtmp):
     g_csv = utils.get_test_data('tax/test1.gather.csv')
     from_file = runtmp.output("tmp-from-file.txt")
@@ -684,13 +784,13 @@ def test_summarize_gather_at_best_only_equal_choose_first():
 def test_write_summary_csv(runtmp):
     """test summary csv write function"""
 
-    sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0,
+    sum_gather = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=1.0,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
                                                           lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                           query_ani_at_rank=None,
                                                           total_weighted_hashes=0)],
-                  'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0,
+                  'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=1.0,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=1.0, bp_match_at_rank=100,
                                                      lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
@@ -711,7 +811,7 @@ def test_write_summary_csv(runtmp):
 
 def test_write_classification(runtmp):
     """test classification csv write function"""
-    classif = ClassificationResult('queryA', 'match', 'phylum', 1.0,
+    classif = ClassInf('queryA', 'match', 'phylum', 1.0,
                                     (lca_utils.LineagePair(rank='superkingdom', name='a'),
                                      lca_utils.LineagePair(rank='phylum', name='b')),
                                      'queryA_md5', 'queryA.sig', 1.0, 100,
@@ -908,26 +1008,26 @@ def test_write_krona(runtmp):
 
 def test_combine_sumgather_csvs_by_lineage(runtmp):
     # some summarized gather dicts
-    sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
+    sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
                                                           lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
-                  'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
+                  'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
                                                      lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
                                                               lca_utils.LineagePair(rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
-    sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
+    sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                           lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
-                  'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
+                  'phylum':  [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                      lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
@@ -1003,26 +1103,26 @@ def test_write_lineage_sample_frac_format_lineage(runtmp):
 
 def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     # some summarized gather dicts
-    sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,
+    sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=0.5, bp_match_at_rank=50,
                                                           lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
-                  'phylum':  [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5,
+                  'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
                                                      lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
                                                               lca_utils.LineagePair(rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
-    sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7,
+    sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                           lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
-                  'phylum':  [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7,
+                  'phylum':  [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
                                                      lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),

From 808e6befb55e4c603ccdc02f9116dab4307bf1bd Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 10:42:14 -0800
Subject: [PATCH 11/21] finish renaming

---
 src/sourmash/tax/__main__.py  | 8 ++++----
 src/sourmash/tax/tax_utils.py | 7 ++-----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 6544e5d610..905db68afd 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -13,7 +13,7 @@
 from sourmash.lca.lca_utils import display_lineage, zip_lineage
 
 from . import tax_utils
-from .tax_utils import ClassificationResult, MultiLineageDB
+from .tax_utils import ClassInf, MultiLineageDB
 
 usage='''
 sourmash taxonomy <command> [<args>] - manipulate/work with taxonomy information.
@@ -222,7 +222,7 @@ def genome(args):
                 notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}")
             else:
                 status="match"
-            classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank)
+            classif = ClassInf(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank)
             classifications[args.rank].append(classif)
             matched_queries.add(sg.query_name)
             if "krona" in args.output_format:
@@ -251,7 +251,7 @@ def genome(args):
                 elif sg.fraction >= args.containment_threshold:
                     status = "match"
                 if status == "match":
-                    classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank,
+                    classif = ClassInf(query_name=sg.query_name, status=status, rank=sg.rank,
                                                     fraction=sg.fraction, lineage=sg.lineage,
                                                     query_md5=sg.query_md5, query_filename=sg.query_filename,
                                                     f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank,
@@ -261,7 +261,7 @@ def genome(args):
                     continue
                 elif rank == "superkingdom" and status == "nomatch":
                     status="below_threshold"
-                    classif = ClassificationResult(query_name=sg.query_name, status=status,
+                    classif = ClassInf(query_name=sg.query_name, status=status,
                                                    rank="", fraction=0, lineage="",
                                                    query_md5=sg.query_md5, query_filename=sg.query_filename,
                                                    f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, 
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index db1bb5b593..36fb9475f7 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -777,7 +777,7 @@ def write_human_summary(summarized_gather, out_fp, display_rank):
     '''
     Write human-readable taxonomy-summarized gather results for a specific rank.
     '''
-    header = SummarizedGatherResult._fields
+    header = SumGathInf._fields
 
     found_ANI = False
     results = [] 
@@ -1586,10 +1586,7 @@ class SummarizedGatherResult():
 
     def as_summary_dict(self, query_info=None, limit_float=False):
         sD = asdict(self)
-        if self.lineage == (): # to do -- get rid of me by using null RankLineageInfo() instead of () for empties
-            sD['lineage'] = 'unclassified'
-        else:
-            sD['lineage'] = self.lineage.display_lineage()
+        sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
         if query_info is not None:
             sD['query_name'] = query_info.query_name
             sD['query_md5'] = query_info.query_md5

From 7c9fdb4450cdc1c004fdeb609701a0a2ddf7dcee Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 6 Jan 2023 11:00:51 -0800
Subject: [PATCH 12/21] add tests for new classes

---
 src/sourmash/tax/tax_utils.py |   4 +-
 tests/test_tax_utils.py       | 755 ++++++++++++++++++++++++++++++++++
 2 files changed, 757 insertions(+), 2 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 36fb9475f7..a2503a3e4e 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1764,7 +1764,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False):
                 self.total_bp_classified[rank] += bp_intersect_at_rank
 
                 query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled,
-                                                    n_unique_kmers=self.query_info.query_hashes,
+                                                    n_unique_kmers=self.query_info.query_n_hashes,
                                                     sequence_len_bp=self.query_info.query_bp).ani
                 sres = SummarizedGatherResult(lineage=lineage, rank=rank,
                                               f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique,
@@ -1820,7 +1820,7 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
                 status="below_threshold"
             f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage]
             query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled,
-                                                n_unique_kmers=self.query_info.query_hashes,
+                                                n_unique_kmers=self.query_info.query_n_hashes,
                                                 sequence_len_bp=self.query_info.query_bp).ani
             # set classification status based on thresholds
             if ani_threshold: # if provided, just use ani thresh
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 20e53092f4..2f620627e4 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -3,6 +3,7 @@
 """
 
 import pytest
+from pytest import approx
 import os
 from os.path import basename
 import gzip
@@ -1793,3 +1794,757 @@ def test_lineage_at_rank_below_rank():
                                              LineagePair(rank='class', name='c__c', taxid=None),
                                              LineagePair(rank='order', name='o__d', taxid=None),
                                              LineagePair(rank='family', name='f__f', taxid=None))
+
+
+def test_TaxResult_get_match_lineage_1():
+    gA_tax = ("gA", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+
+    gA = {"name": "gA.1 name"}
+    taxres = make_TaxResult(gA)
+    taxres.get_match_lineage(tax_assignments=taxD)
+    assert taxres.lineageInfo.display_lineage() == "a;b;c"
+
+
+def test_TaxResult_get_match_lineage_skip_ident():
+    gA_tax = ("gA", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+
+    gA = {"name": "gA.1 name"}
+    taxres = make_TaxResult(gA)
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    print("skipped_ident?: ", taxres.skipped_ident)
+    print("missed_ident?: ", taxres.missed_ident)
+    assert taxres.skipped_ident == True
+    assert taxres.lineageInfo.display_lineage() == ""
+
+
+def test_TaxResult_get_match_lineage_missed_ident():
+    gA_tax = ("gA.1", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+
+    gA = {"name": "gA.1 name"}
+    taxres = make_TaxResult(gA)
+    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'])
+    print("skipped_ident?: ", taxres.skipped_ident)
+    print("missed_ident?: ", taxres.missed_ident)
+    assert taxres.skipped_ident == False
+    assert taxres.missed_ident == True
+    assert taxres.lineageInfo.display_lineage() == ""
+
+
+def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing():
+    gA_tax = ("gA.1", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+
+    gA = {"name": "gA.1 name"}
+    taxres = make_TaxResult(gA)
+    with pytest.raises(ValueError) as exc:
+        taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'], fail_on_missing_taxonomy=True)
+    print(str(exc))
+    assert "Error: ident 'gA' is not in the taxonomy database." in str(exc)
+
+
+def test_QueryTaxResult():
+    "basic functionality: initialize and add a taxresult"
+    tax_info = [("gA", "a;b;c")]
+    taxD = make_mini_taxonomy(tax_info=tax_info)
+    taxres = make_TaxResult(taxD=taxD)
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    q_res.add_taxresult(taxres)
+    # check that new querytaxres is compatible with taxres
+    assert q_res.is_compatible(taxres)
+    # check that a few thngs were set properly and/or are not yet set.
+    assert q_res.query_name == "q1"
+    assert q_res.query_info.query_bp == 100
+    assert len(q_res.raw_taxresults) == 1
+    assert q_res.skipped_idents == set()
+    assert q_res.missed_idents == set()
+    assert q_res.summarized_lineage_results == {}
+
+
+def test_QueryTaxResult_add_incompatible():
+    "initialize and try to add incompatible taxresult"
+    tax_info = [("gA", "a;b;c")]
+    taxD = make_mini_taxonomy(tax_info=tax_info)
+    taxres = make_TaxResult(taxD=taxD)
+    taxres2 = make_TaxResult({'query_name': 'q2'}, taxD=taxD)
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    # check that new querytaxres is compatible with taxres and not taxres2
+    assert q_res.is_compatible(taxres)
+    assert not q_res.is_compatible(taxres2)
+    q_res.add_taxresult(taxres)
+    with pytest.raises(ValueError) as exc:
+        q_res.add_taxresult(taxres2)
+    print(str(exc))
+    assert "Error: Cannot add TaxResult: query information does not match." in str(exc)
+
+
+def test_QueryTaxResult_add_without_tax_info():
+    "initialize and add a taxresult with missed ident"
+    taxres = make_TaxResult() # do not add taxonomic info
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    print("attempted to add lineage info?: ", taxres.match_lineage_attempted)
+    with pytest.raises(ValueError) as exc:
+        q_res.add_taxresult(taxres)
+    print(str(exc))
+    assert "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." in str(exc)
+    
+    
+def test_QueryTaxResult_add_skipped_ident():
+    "initialize and add a taxresult with skipped ident"
+    gA_tax = ("gA", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+    taxres = make_TaxResult(taxD=taxD, skip_idents = ['gA'])
+#    taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA'])
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    q_res.add_taxresult(taxres)
+    assert len(q_res.skipped_idents) == 1
+    assert len(q_res.raw_taxresults) == 1
+    assert q_res.missed_idents == set()
+    assert q_res.summarized_lineage_results == {}
+
+
+def test_QueryTaxResult_add_missed_ident():
+    "initialize and add a taxresult with missed ident"
+    gA_tax = ("gB", "a;b;c")
+    taxD = make_mini_taxonomy([gA_tax])
+    taxres = make_TaxResult(taxD=taxD)
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    # add taxonomic info to taxres
+    q_res.add_taxresult(taxres)
+    assert len(q_res.missed_idents) == 1
+    assert len(q_res.raw_taxresults) == 1
+    assert q_res.skipped_idents == set()
+    assert q_res.summarized_lineage_results == {}
+
+
+def test_QueryTaxResult_track_missed_and_skipped():
+    "make sure missed and skipped idents are being tracked"
+    # make taxonomy
+    tax_info = [("gA", "a;b;c"), ("gB", "a;b;d")]
+    taxD = make_mini_taxonomy(tax_info=tax_info)
+    # make results
+    taxres = make_TaxResult()
+    taxres2 = make_TaxResult({"name": 'gB'}) # skipped
+    taxres3 = make_TaxResult({"name": 'gB'}) # skipped
+    taxres4 = make_TaxResult({"name": 'gC'}) # skipped
+    taxres5 = make_TaxResult({"name": 'gD'}) # missed
+    taxres6 = make_TaxResult({"name": 'gE'}) # missed
+    # initialize
+    q_res = QueryTaxResult(taxres.query_info)
+    # add taxonomic info to taxres, add to q_res
+    for n, tr in enumerate([taxres, taxres2, taxres3, taxres4, taxres5, taxres6]):
+        tr.get_match_lineage(tax_assignments=taxD, skip_idents=['gB', 'gC'])
+        print("num: ", n)
+        print("skipped?: ", tr.skipped_ident)
+        print("missed?: ", tr.missed_ident)
+        q_res.add_taxresult(tr)
+    assert len(q_res.raw_taxresults) == 6
+    print(q_res.n_skipped)
+    print(q_res.n_missed)
+    assert q_res.n_missed == 2
+    assert q_res.n_skipped == 3
+    assert 'gB' in q_res.skipped_idents
+    assert len(q_res.skipped_idents) == 2
+    assert 'gD' in q_res.missed_idents
+    assert q_res.summarized_lineage_results == {}
+
+
+def test_QueryTaxResult_track_missed_and_skipped_using_fn():
+    "make sure missed and skipped idents are being tracked. Same as above but use helper fn."
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}, {"name": 'gB'}, {"name": 'gC'}, {"name": 'gD'}, {"name": 'gE'}]
+    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB', 'gC'])
+    # should have 6 results for default query 'q1' 
+    print(gres.keys())
+    q_res = next(iter(gres.values()))
+    assert len(q_res.raw_taxresults) == 6
+    print(q_res.n_skipped)
+    print(q_res.n_missed)
+    assert q_res.n_missed == 2
+    assert q_res.n_skipped == 3
+    assert 'gB' in q_res.skipped_idents
+    assert len(q_res.skipped_idents) == 2
+    assert 'gD' in q_res.missed_idents
+    assert q_res.summarized_lineage_results == {}
+
+
+def test_QueryTaxResult_summarize_up_ranks_1():
+    "basic functionality: summarize up ranks"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
+    assert len(gres.keys()) == 1
+    q_res = next(iter(gres.values()))
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    assert len(q_res.raw_taxresults) == 2
+    #print(q_res.sum_uniq_weighted.values())
+    #print(q_res.sum_uniq_weighted['superkingdom'])
+    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
+    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.4)}
+    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
+    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 40}
+    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.4)}
+    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
+    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 40}
+    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
+    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20,
+                                          RankLineageInfo(lineage_str="a;b;d"): 20}
+
+
+def test_QueryTaxResult_summarize_up_ranks_2():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    assert len(q_res.raw_taxresults) == 2
+    print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted['superkingdom'])
+    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)}
+    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.15)}
+    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 30}
+    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
+    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)}
+    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30}
+    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
+    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+                                                RankLineageInfo(lineage_str="a;b;d"): approx(0.05)}
+    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20,
+                                          RankLineageInfo(lineage_str="a;b;d"): 10}
+
+
+
+def test_QueryTaxResult_summarize_up_ranks_missing_lineage():
+    "basic functionality: summarize up ranks"
+    taxD = make_mini_taxonomy([("gA", "a;b;c")])
+    gather_results = [{}, {"name": 'gB'}]
+    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
+    assert len(gres.keys()) == 1
+    q_res = next(iter(gres.values()))
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    assert len(q_res.raw_taxresults) == 2
+    #print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted['superkingdom'])
+    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)}
+    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20}
+    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)}
+    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20}
+    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)}
+    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20}
+
+
+def test_QueryTaxResult_summarize_up_ranks_skipped_lineage():
+    "basic functionality: summarize up ranks"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB'])
+    assert len(gres.keys()) == 1
+    q_res = next(iter(gres.values()))
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    assert len(q_res.raw_taxresults) == 2
+    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
+    #print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted['superkingdom'])
+    assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)}
+    assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20}
+    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)}
+    assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20}
+    assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)}
+    assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)}
+    assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20}
+
+
+def test_QueryTaxResult_summarize_up_ranks_perfect_match():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{'f_unique_to_query': 1.0}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    assert len(q_res.raw_taxresults) == 1
+    print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_to_query['superkingdom'])
+    assert list(q_res.sum_uniq_to_query['superkingdom'].values()) == [1.0]
+    assert 'gA' in q_res.perfect_match
+
+
+def test_QueryTaxResult_summarize_up_ranks_already_summarized():
+    "summarize up ranks: error, already summarized"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{'f_unique_to_query': 1.0}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    with pytest.raises(ValueError) as exc:
+        q_res.summarize_up_ranks()
+    print(str(exc))
+    assert "Error: already summarized" in str(exc)
+    
+
+def test_QueryTaxResult_summarize_up_ranks_already_summarized_force():
+    "summarize up ranks: already summarized but force"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    q_res.summarize_up_ranks()
+    q_res.summarize_up_ranks(force_resummarize=True)
+    assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom']
+
+    #check that all results are still good 
+    assert len(q_res.raw_taxresults) == 2
+    assert q_res.sum_uniq_weighted['superkingdom'] ==  {RankLineageInfo(lineage_str="a"): approx(0.3)}
+    assert q_res.sum_uniq_weighted['phylum'] ==  {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
+    assert q_res.sum_uniq_to_query['phylum'] ==  {RankLineageInfo(lineage_str="a;b"): approx(0.15)}
+    assert q_res.sum_uniq_bp['phylum'] ==  {RankLineageInfo(lineage_str="a;b"): 30}
+    assert q_res.sum_uniq_to_query['class'] ==  {RankLineageInfo(lineage_str="a;b;c"): approx(0.1),
+                                                 RankLineageInfo(lineage_str="a;b;d"): approx(0.05)}
+    assert q_res.sum_uniq_weighted['class'] ==  {RankLineageInfo(lineage_str="a;b;c"): approx(0.2),
+                                                 RankLineageInfo(lineage_str="a;b;d"): approx(0.1)}
+    assert q_res.sum_uniq_bp['class'] ==  {RankLineageInfo(lineage_str="a;b;c"): 20,
+                                           RankLineageInfo(lineage_str="a;b;d"): 10}
+
+
+def test_QueryTaxResult_summarize_up_ranks_single_rank():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    q_res.summarize_up_ranks(single_rank='phylum')
+    assert len(q_res.raw_taxresults) == 2
+    assert list(q_res.sum_uniq_weighted.keys()) == ['phylum']
+    print(q_res.sum_uniq_weighted.keys())
+    print(q_res.sum_uniq_weighted.values())
+    print(q_res.sum_uniq_weighted['phylum'])
+    assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)}
+    assert list(q_res.sum_uniq_to_query['phylum'].values()) == [approx(0.15)]                                                    
+    assert list(q_res.sum_uniq_bp['phylum'].values()) == [30]                                                    
+    assert q_res.summarized_ranks == ['phylum']
+
+def test_QueryTaxResult_summarize_up_ranks_single_rank_not_available():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    with pytest.raises(ValueError) as exc:
+        q_res.summarize_up_ranks(single_rank='NotARank')
+    print(str(exc))
+    assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc)
+
+
+def test_QueryTaxResult_summarize_up_ranks_single_rank_not_filled():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    with pytest.raises(ValueError) as exc:
+        q_res.summarize_up_ranks(single_rank='species')
+    print(str(exc))
+    assert "Error: rank 'species' was not available for any matching lineages." in str(exc)
+
+
+def test_QueryTaxResult_build_summarized_result_1():
+    "basic functionality: build summarized_result"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_summarized_result()
+    print(q_res.summarized_lineage_results.keys())
+    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.2, f_weighted_at_rank=0.4, 
+                             lineage=RankLineageInfo(lineage_str='a'),
+                             bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)),
+          SummarizedGatherResult(rank='superkingdom', fraction=0.8, f_weighted_at_rank=0.6,
+                             lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)]
+    print(q_res.summarized_lineage_results['superkingdom'])
+    assert q_res.summarized_lineage_results['superkingdom'] == sk
+    print(q_res.summarized_lineage_results['phylum'])
+    phy = [SummarizedGatherResult(rank='phylum', fraction=0.2, f_weighted_at_rank=0.4, 
+                                   lineage=RankLineageInfo(lineage_str='a;b'),
+                                   bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)),
+            SummarizedGatherResult(rank='phylum', fraction=0.8, f_weighted_at_rank=0.6,
+                                   lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['phylum'] == phy
+    print(q_res.summarized_lineage_results['class'])
+    cl = [SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2, 
+                                 lineage=RankLineageInfo(lineage_str='a;b;c'), 
+                                  bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)),
+          SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2,
+                                 lineage=RankLineageInfo(lineage_str='a;b;d'),
+                                  bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)),
+          SummarizedGatherResult(rank='class', fraction=0.8, f_weighted_at_rank=0.6,
+                                 lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['class'] == cl
+
+    assert q_res.total_f_weighted['phylum'] == approx(0.4)
+    assert q_res.total_f_classified['class'] == approx(0.2)
+    assert q_res.total_bp_classified['superkingdom'] == 40
+
+
+def test_QueryTaxResult_build_summarized_result_2():
+    """test two queries, build summarized result for each"""
+    # make mini taxonomy
+    gA_tax = ("gA", "a;b")
+    gB_tax = ("gB", "a;c")
+    taxD = make_mini_taxonomy([gA_tax, gB_tax])
+    # make gather results
+    gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50}, 
+                      {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.3,'unique_intersect_bp': 30},
+                      {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}]
+    gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD)
+    
+    for query_name, q_res in gres.items():
+        q_res.build_summarized_result() # summarize and build result
+        sk = q_res.summarized_lineage_results['superkingdom']
+        phy = q_res.summarized_lineage_results['phylum']
+        assert len(sk) == 2
+        assert sk[0].lineage == RankLineageInfo(lineage_str="a")
+        print(phy)
+        if query_name == 'queryA':
+            # check superkingdom results
+            assert sk[0].fraction == approx(0.8)
+            assert sk[0].f_weighted_at_rank == approx(0.9)
+            assert sk[0].bp_match_at_rank == 80
+            assert sk[1].fraction == approx(0.2)
+            assert sk[1].f_weighted_at_rank == approx(0.1)
+            assert sk[1].bp_match_at_rank == 20
+            assert sk[1].lineage == ()
+            # check phylum results
+            assert len(phy) == 3
+            assert phy[0].fraction == approx(0.5)
+            assert phy[0].f_weighted_at_rank == approx(0.5)
+            assert phy[0].bp_match_at_rank == 50
+            assert phy[0].lineage ==  RankLineageInfo(lineage_str="a;b")
+            assert phy[1].fraction == approx(0.3)
+            assert phy[1].f_weighted_at_rank == approx(0.4)
+            assert phy[1].bp_match_at_rank == 30
+            assert phy[1].lineage ==  RankLineageInfo(lineage_str="a;c")
+            assert phy[2].fraction == approx(0.2)
+            assert phy[2].f_weighted_at_rank == approx(0.1)
+            assert phy[2].bp_match_at_rank == 20
+            assert phy[2].lineage == ()
+        if query_name == 'queryB':
+            # check superkingdom results
+            assert sk[0].fraction == approx(0.3)
+            assert sk[0].f_weighted_at_rank == approx(0.3)
+            assert sk[0].bp_match_at_rank == 30
+            assert sk[1].fraction == approx(0.7)
+            assert sk[1].f_weighted_at_rank == approx(0.7)
+            assert sk[1].bp_match_at_rank == 70
+            assert sk[1].lineage == ()
+            # check phylum results
+            assert len(phy) == 2
+            assert phy[0].fraction == approx(0.3)
+            assert phy[0].f_weighted_at_rank == approx(0.3)
+            assert phy[0].bp_match_at_rank == 30
+            assert phy[0].lineage ==  RankLineageInfo(lineage_str="a;c")
+            assert phy[1].fraction == approx(0.7)
+            assert phy[1].f_weighted_at_rank == approx(0.7)
+            assert phy[1].bp_match_at_rank == 70
+            assert phy[1].lineage == ()
+
+
+def test_QueryTaxResult_build_summarized_result_missing_lineage():
+    "build summarized_result with missing lineage"
+    taxD = make_mini_taxonomy([("gA", "a;b;c")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_summarized_result()
+    print(q_res.summarized_lineage_results.keys())
+    print(q_res.summarized_lineage_results['superkingdom'])
+
+    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2, 
+                                 lineage=RankLineageInfo(lineage_str="a"), 
+                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
+          SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=(),f_weighted_at_rank=0.8,
+                                 bp_match_at_rank=80, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['superkingdom'] == sk
+    print(q_res.summarized_lineage_results['phylum'])
+    phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, f_weighted_at_rank=0.2,
+                                  lineage=RankLineageInfo(lineage_str="a;b"), 
+                                  bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
+           SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=(),f_weighted_at_rank=0.8,
+                                  bp_match_at_rank=80, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['phylum'] == phy
+    print(q_res.summarized_lineage_results['class'])
+    cl = [SummarizedGatherResult(rank='class', fraction=0.1, lineage= RankLineageInfo(lineage_str="a;b;c"),
+                                  f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
+          SummarizedGatherResult(rank='class', fraction=0.9, lineage=(), f_weighted_at_rank=0.8,
+                                 bp_match_at_rank=80, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['class'] == cl
+
+    assert q_res.total_f_weighted['phylum'] == approx(0.2)
+    assert q_res.total_f_classified['class'] == approx(0.1)
+    assert q_res.total_bp_classified['superkingdom'] == 20
+
+
+def test_QueryTaxResult_build_summarized_result_skipped_lineage():
+    "build summarized_result with skipped lineage"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=['gB'])
+    q_res.build_summarized_result()
+    print(q_res.summarized_lineage_results.keys())
+    print(q_res.summarized_lineage_results['superkingdom'])
+
+    sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2,  
+                                 lineage=RankLineageInfo(lineage_str="a"), 
+                                 bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), 
+          SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=(),f_weighted_at_rank=0.8,
+                                 bp_match_at_rank=80, query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['superkingdom'] == sk
+    print(q_res.summarized_lineage_results['phylum'])
+    phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, lineage=RankLineageInfo(lineage_str="a;b"),
+                                  f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
+           SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=(), f_weighted_at_rank=0.8, bp_match_at_rank=80,
+                                  query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['phylum'] == phy
+    print(q_res.summarized_lineage_results['class'])
+    cl = [SummarizedGatherResult(rank='class', fraction=0.1,lineage=RankLineageInfo(lineage_str="a;b;c"),
+                                  f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)),
+          SummarizedGatherResult(rank='class', fraction=0.9, lineage=(), f_weighted_at_rank=0.8, bp_match_at_rank=80,
+                                 query_ani_at_rank=None)]
+    assert q_res.summarized_lineage_results['class'] == cl
+
+    assert q_res.total_f_weighted['phylum'] == approx(0.2)
+    assert q_res.total_f_classified['class'] == approx(0.1)
+    assert q_res.total_bp_classified['superkingdom'] == 20
+
+
+def test_QueryTaxResult_build_summarized_result_over100percent():
+    "summarize up ranks: different values"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB','f_unique_to_query': 0.95}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    # now summarize up the ranks
+    assert len(q_res.raw_taxresults) == 2
+    with pytest.raises(ValueError) as exc:
+        q_res.build_summarized_result()
+    print(str(exc))
+    assert "The tax summary of query 'q1' is 1.05, which is > 100% of the query!! This should not be possible." in str(exc)
+
+
+def test_build_summarized_result_rank_fail_not_available_resummarize():
+    "build classification result"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.summarize_up_ranks('superkingdom')
+    with pytest.raises(ValueError) as exc:
+        q_res.build_summarized_result(single_rank='order')
+    print(str(exc))
+    assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
+
+
+def test_build_classification_result_containment_threshold():
+    "basic functionality: build classification result using containment threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(containment_threshold=0.1)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    q_res.build_classification_result(containment_threshold=0.4)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4,
+                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+    q_res.build_classification_result(containment_threshold=1.0)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, 
+                                                               lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4,
+                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+
+
+
+def test_build_classification_result_containment_threshold_fail():
+    "classification result: improper containment threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(containment_threshold=1.2)
+    print(str(exc))
+    assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str(exc)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(containment_threshold=-.1)
+    print(str(exc))
+    assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc)
+
+
+def test_build_classification_result_ani_threshold():
+    "basic functionality: build classification result"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(ani_threshold=.92)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    q_res.build_classification_result(ani_threshold=0.94)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4,
+                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+    q_res.build_classification_result(ani_threshold=0.96)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, 
+                                                               lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4,
+                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+
+
+def test_build_classification_result_ani_threshold_fail():
+    "classification result: improper ANI threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(ani_threshold=1.2)
+    print(str(exc))
+    assert "ANI threshold must be between 0 and 1 (input value: 1.2)." in str(exc)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(ani_threshold=-.1)
+    print(str(exc))
+    assert "ANI threshold must be between 0 and 1 (input value: -0.1)." in str(exc)
+
+
+def test_build_classification_result_rank_fail_not_filled():
+    "classification result: rank not available (wasn't filled in tax lineage matches)"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(rank='order')
+    print(str(exc))
+    assert "Error: rank 'order' was not available for any matching lineages." in str(exc)
+
+def test_build_classification_result_rank_fail_not_available_resummarize():
+    "classification result: rank not available (wasn't summarized)"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.summarize_up_ranks('superkingdom')
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(rank='order')
+    print(str(exc))
+    assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
+
+def test_build_classification_result_rank_fail_not_available():
+    "classification result: rank not available"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.build_classification_result(rank='NotARank')
+    print(str(exc))
+    assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc)
+
+
+def test_build_classification_result_rank_containment_threshold():
+    "classification result - rank and containment threshold (default)"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(rank='class')
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    q_res.build_classification_result(rank='class', containment_threshold=0.4)
+    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    
+def test_build_classification_result_rank_ani_threshold():
+    "classification result with rank and ANI threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(rank='class', ani_threshold=0.92)
+    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    q_res.build_classification_result(rank='class', ani_threshold=0.95)
+    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, 
+                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
+                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+
+
+def test_krona_classification_result():
+    "basic functionality: build classification result using containment threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result()
+    assert q_res.krona_classification_result == None
+    q_res.build_classification_result(rank='phylum')#, force_resummarize=True)
+    print(q_res.krona_classification_result)
+    assert q_res.krona_classification_result == (0.4, 'a', 'b')
+    q_res.build_classification_result(rank='superkingdom')
+    print(q_res.krona_classification_result)
+    assert q_res.krona_classification_result == (0.4, 'a')
+    q_res.build_classification_result()
+    assert q_res.krona_classification_result == None
+
+def test_make_krona_header_0():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    phy_header = ["fraction", "superkingdom", "phylum"]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(rank='phylum')
+    print(q_res.krona_classification_result)
+    print(q_res.krona_classification_header)
+    assert q_res.krona_classification_header == phy_header
+    hd = q_res.make_krona_header('phylum')
+    print("header: ", hd)
+    assert hd == phy_header
+
+
+def test_make_krona_header_1():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    class_header = ["fraction", "superkingdom", "phylum", "class"]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+    q_res.build_classification_result(rank='class')
+    assert q_res.krona_classification_header == class_header
+    hd = q_res.make_krona_header(min_rank='class')
+    print("header: ", hd)
+    assert hd == class_header
+
+
+def test_make_krona_header_fail():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_krona_header("order")
+    assert "Rank 'order' not present in summarized ranks." in str(exc.value)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_krona_header("NotARank")
+    assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value)
+

From 8c2c546d79bb19fbbe994d6bacbdb09da5bb73b2 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Sat, 7 Jan 2023 10:29:03 -0800
Subject: [PATCH 13/21] upd

---
 src/sourmash/tax/tax_utils.py |  5 +++-
 tests/test_tax_utils.py       | 54 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index a2503a3e4e..c7b4e7e66d 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1586,7 +1586,10 @@ class SummarizedGatherResult():
 
     def as_summary_dict(self, query_info=None, limit_float=False):
         sD = asdict(self)
-        sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
+        if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini?
+            sD['lineage'] = "unclassified"
+        else:
+            sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
         if query_info is not None:
             sD['query_name'] = query_info.query_name
             sD['query_md5'] = query_info.query_md5
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 2f620627e4..bde1dbf158 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2548,3 +2548,57 @@ def test_make_krona_header_fail():
         q_res.make_krona_header("NotARank")
     assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value)
 
+
+# def test_make_human_summary():
+#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+#     gather_results = [{}, {"name": 'gB'}]
+#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+#     hs = q_res.make_human_summary(display_rank = "superkingdom")
+#     print(hs)
+#     assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 
+#                    'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+#                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+#                    'total_weighted_hashes': 0}, 
+#                   {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a",
+#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
+#                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+# def test_make_human_summary_2():
+#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+#     gather_results = [{}, {"name": 'gB'}]
+#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+#     hs = q_res.make_human_summary(display_rank = "phylum")
+#     print(hs)
+#     assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 
+#                    'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+#                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+#                    'total_weighted_hashes': 0}, 
+#                   {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
+#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
+#                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+# def test_make_human_summary_classification():
+#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+#     gather_results = [{}, {"name": 'gB'}]
+#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom")
+#     hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True)
+#     print(hs)
+#     assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a',
+#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+#                   'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1',
+#                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+# def test_make_human_summary_classification_2():
+#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+#     gather_results = [{}, {"name": 'gB'}]
+#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum")
+#     hs = q_res.make_human_summary(display_rank = "phylum", classification=True)
+#     print(hs)
+#     assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
+#                    'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+#                    'query_ani_at_rank': '94.9%', 'status': 'match',
+#                    'query_name': 'q1', 'query_md5': 'md5',
+#                    'query_filename': 'query_fn', 'total_weighted_hashes': 0}]

From 00f9a64c3517109f14d7afc80271c000a1f7130a Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 13:48:32 -0800
Subject: [PATCH 14/21] more tests; move status checking into
 ClassificationResult

---
 src/sourmash/tax/tax_utils.py | 157 ++++++++++++------------
 tests/test_tax_utils.py       | 220 ++++++++++++++++++++++++----------
 2 files changed, 242 insertions(+), 135 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index c7b4e7e66d..1f255e427e 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1491,21 +1491,29 @@ class GatherRow(): # all cols should match "gather_write_cols" in `search.py`
    total_weighted_hashes: int = None
 
 
-@dataclass(frozen=True)
+@dataclass()
 class QueryInfo():
-   """Class for storing per-rank lineage information"""
-   query_name: str
-   query_md5: str
-   query_filename: str
-   query_bp: int
-   query_n_hashes: int
-   ksize: int
-   scaled: int
-   total_weighted_hashes: int = 0
+    """Class for storing query information"""
+    query_name: str
+    query_md5: str
+    query_filename: str
+    query_bp: int
+    ksize: int
+    scaled: int
+    query_n_hashes: int = None
+    total_weighted_hashes: int = 0
+
+    def __post_init__(self):
+        "Initialize and cast types"
+        self.query_bp = int(self.query_bp)
+        self.ksize = int(self.ksize)
+        self.scaled = int(self.scaled)
+        self.query_n_hashes = int(self.query_n_hashes) if self.query_n_hashes else 0
+        self.total_weighted_hashes = int(self.total_weighted_hashes) if self.total_weighted_hashes else 0
 
-   @property
-   def total_weighted_bp(self):
-       return self.total_weighted_hashes * self.scaled
+    @property
+    def total_weighted_bp(self):
+        return self.total_weighted_hashes * self.scaled
 
 @dataclass
 class TaxResult():
@@ -1524,22 +1532,15 @@ class TaxResult():
 
     def __post_init__(self):
         self.get_ident()
-        self.query_name = self.raw.query_name
-        if self.raw.query_n_hashes:
-            self.raw.query_n_hashes = int(self.raw.query_n_hashes)
-        if self.raw.total_weighted_hashes:
-            self.raw.total_weighted_hashes = int(self.raw.total_weighted_hashes)
-        else:
-            self.raw.total_weighted_hashes = 0
-
+        self.query_name = self.raw.query_name # convenience
         self.query_info = QueryInfo(query_name = self.raw.query_name,
                                   query_md5=self.raw.query_md5,
                                   query_filename = self.raw.query_filename,
-                                  query_bp = int(self.raw.query_bp),
+                                  query_bp = self.raw.query_bp,
                                   query_n_hashes = self.raw.query_n_hashes,
                                   total_weighted_hashes = self.raw.total_weighted_hashes,
-                                  ksize = int(self.raw.ksize),
-                                  scaled = int(self.raw.scaled)
+                                  ksize = self.raw.ksize,
+                                  scaled = self.raw.scaled
                                   )
         # cast and store the imp bits
         self.f_unique_to_query = float(self.raw.f_unique_to_query)
@@ -1582,7 +1583,12 @@ class SummarizedGatherResult():
     lineage: RankLineageInfo
     f_weighted_at_rank: float
     bp_match_at_rank: int
-    query_ani_at_rank: float
+    query_ani_at_rank: float = None
+
+    def set_query_ani(self, query_info):
+        self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled,
+                                                         n_unique_kmers=query_info.query_n_hashes,
+                                                         sequence_len_bp=query_info.query_bp).ani
 
     def as_summary_dict(self, query_info=None, limit_float=False):
         sD = asdict(self)
@@ -1633,8 +1639,41 @@ def as_kreport_dict(self, rankCode, total_weighted_hashes):
 @dataclass
 class ClassificationResult(SummarizedGatherResult):
 #   """Class for storing summarized lineage information"""
-    status: str
+    status: str = field(init=False)
 
+    def __post_init__(self):
+        self.status = None
+
+    def set_status(self, query_info, containment_threshold=None, ani_threshold=None):
+        # check for out of bounds values, set "nomatch" if no match at all
+        if self.f_weighted_at_rank > 1:
+            raise ValueError(f"The tax classification of query '{query_info.query_name}' is {self.f_weighted_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
+        elif self.f_weighted_at_rank == 0: #no annotated results for this query. This also  shouldn't happen, right?
+            self.status="nomatch"
+        else:
+            # if any matches, use 'below_threshold' as default; set 'match' if meets threshold
+            self.status="below_threshold"
+            self.set_query_ani(query_info=query_info)
+            if ani_threshold:  # if provided, just use ani thresh, don't use containment threshold
+                if self.query_ani_at_rank >= ani_threshold:
+                    self.status = 'match'
+            elif containment_threshold and self.f_weighted_at_rank >= containment_threshold:
+                    self.status = 'match'
+
+    def build_krona_result(self, rank=None):
+        krona_classified, krona_unclassified = None, None
+        if rank is not None and rank == self.rank:
+            lin_as_list = self.lineage.display_lineage().split(';')
+            krona_classification = (self.f_weighted_at_rank, *lin_as_list)
+            krona_classified = (krona_classification)
+            # handle unclassified - do we want/need this?
+            unclassified_fraction= 1.0-self.f_weighted_at_rank
+            len_unclassified_lin = len(lin_as_list)
+            unclassifed_lin = ["unclassified"]*(len_unclassified_lin)
+            krona_unclassified = (unclassified_fraction, *unclassifed_lin)
+        return krona_classified, krona_unclassified
+
+ 
 
 @dataclass
 class QueryTaxResult():
@@ -1672,9 +1711,9 @@ def _init_classification_results(self):
         self.status = 'nomatch'
         self.classified_ranks = []
         self.classification_result = None
-        self.krona_classification_result = None
-        self.krona_unclassified_result = None
-        self.krona_classification_header = []
+        self.krona_classified = None
+        self.krona_unclassified = None
+        self.krona_header = []
 
     def is_compatible(self, taxresult):
         return taxresult.query_info == self.query_info
@@ -1759,19 +1798,15 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False):
                     continue
                 f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage]
                 bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage]
-
                 # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank)
                 # now that I pulled best_only into separate fn, these don't need to be dicts...
                 self.total_f_classified[rank] += f_unique
                 self.total_f_weighted[rank] += f_weighted_at_rank
                 self.total_bp_classified[rank] += bp_intersect_at_rank
-
-                query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled,
-                                                    n_unique_kmers=self.query_info.query_n_hashes,
-                                                    sequence_len_bp=self.query_info.query_bp).ani
                 sres = SummarizedGatherResult(lineage=lineage, rank=rank,
                                               f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique,
-                                              bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani)
+                                              bp_match_at_rank=bp_intersect_at_rank)
+                sres.set_query_ani(query_info=self.query_info)
                 self.summarized_lineage_results[rank].append(sres)
 
             # record unclassified
@@ -1791,7 +1826,6 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
         if ani_threshold and not 0 <= ani_threshold <= 1:
             raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).")
         self._init_classification_results() # init some fields
-        #for sg in self.summarized_lineage_results:
         if not self.summarized_ranks or force_resummarize:
             self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize)
         # catch potential error from running summarize_up_ranks separately and passing in different single_rank
@@ -1815,52 +1849,27 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
             sorted_sum_uniq_weighted.sort(key = lambda x: -x[1])
             # best only
             this_lineage, f_weighted = sorted_sum_uniq_weighted[0]
-            if f_weighted > 1:
-                raise ValueError(f"The tax classification of query '{self.query_name}' is {f_weighted}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
-            elif f_weighted == 0: #no annotated results for this query. do we need to handle this differently?
-                continue
-            else:
-                status="below_threshold"
             f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage]
-            query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled,
-                                                n_unique_kmers=self.query_info.query_n_hashes,
-                                                sequence_len_bp=self.query_info.query_bp).ani
-            # set classification status based on thresholds
-            if ani_threshold: # if provided, just use ani thresh
-                if query_ani and query_ani >= ani_threshold:
-                    status = 'match'
-                else:
-                    status = 'below_threshold'
-            elif f_weighted >= containment_threshold:
-                status = 'match'
-            # determine whether to move on to a higher tax rank (if avail)
-            if status == 'match':
-                break
+            bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
 
-        if this_rank == "superkingdom" and status == "nomatch":
-            status="below_threshold"
+            classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage,
+                                           f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank)
 
-        # what happens when we don't have a gather match at all?
-        bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
+            classif.set_status(self.query_info, containment_threshold=containment_threshold, ani_threshold=ani_threshold)
+            # determine whether to move on to a higher tax rank (if avail)
+            if classif.status == 'match' or classif.status == "nomatch": # not sure we want/need the `nomatch` part...
+                break
 
-        classif = ClassificationResult(status=status, rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage,
-                                       f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank,
-                                       query_ani_at_rank= query_ani)
+        # store the final classification result
         self.classification_result = classif
-
-        if rank is not None:
-            lin_as_list = this_lineage.display_lineage().split(';')
-            krona_classification = (f_weighted, *lin_as_list)
-            self.krona_classification_result = (krona_classification)
-            # handle unclassified - do we want/need this?
-            unclassified_fraction= 1.0-f_weighted
-            len_unclassified_lin = len(lin_as_list)
-            unclassifed_lin = ["unclassified"]*(len_unclassified_lin)
-            self.krona_unclassified_result = (unclassified_fraction, *unclassifed_lin)
-            self.krona_classification_header=self.make_krona_header(min_rank=rank)
+        # could do this later, in __main__.py, for example
+        self.krona_classified, self.krona_unclassified = self.classification_result.build_krona_result(rank=rank)
+        self.krona_header = self.make_krona_header(min_rank = rank)
 
     def make_krona_header(self, min_rank):
         "make header for krona output"
+        if min_rank is None:
+            return []
         if min_rank not in self.summarized_ranks:
             raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.")
         else:
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index bde1dbf158..e3021d93ab 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -119,6 +119,40 @@ def test_ascending_taxlist_2():
     assert list(ascending_taxlist(include_strain=False)) ==  ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
 
 
+def test_QueryInfo_basic():
+    "basic functionality of QueryInfo dataclass"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    assert qInf.query_name == 'q1'
+    assert isinstance(qInf.query_n_hashes, int)
+    assert isinstance(qInf.ksize, int)
+    assert isinstance(qInf.scaled, int)
+    assert qInf.total_weighted_hashes == 200
+    assert qInf.total_weighted_bp == 2000
+
+
+def test_QueryInfo_no_hash_info():
+    "QueryInfo dataclass for older gather results without query_n_hashes or total_weighted_hashes"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',ksize=31,scaled=10)
+    assert qInf.query_name == 'q1'
+    assert qInf.query_n_hashes == 0
+    assert qInf.total_weighted_hashes == 0
+    assert qInf.total_weighted_bp == 0
+
+
+def test_QueryInfo_missing():
+    "check that required args"
+    with pytest.raises(TypeError) as exc:
+        QueryInfo(query_name='q1', query_filename='f1',query_bp='100',query_n_hashes='10',ksize=31,scaled=10, total_weighted_hashes=200)
+    print(str(exc))
+    assert "missing 1 required positional argument: 'query_md5'" in str(exc)
+
+
+def test_SummarizedGatherResult():
+    "basic functionality of SummarizedGatherResult dataclass"
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97)
+
+
+
 def test_GatherRow_old_gather():
     # gather does not contain query_name column
     gA = {"name": "gA.1 name"}
@@ -1337,11 +1371,17 @@ def test_lineage_db_sql_load(runtmp):
 def test_LineagePair():
     lin = LineagePair(rank="rank1", name='name1')
     print(lin)
+    assert lin.rank=="rank1"
+    assert lin.name =="name1"
+    assert lin.taxid==None
 
 
 def test_LineagePair_1():
-    lin1 = LineagePair(rank="rank1", name='name1', taxid=1)
-    print(lin1)
+    lin = LineagePair(rank="rank1", name='name1', taxid=1)
+    assert lin.rank=="rank1"
+    assert lin.name =="name1"
+    assert lin.taxid==1
+    print(lin)
 
 
 def test_BaseLineageInfo_init_empty():
@@ -2358,29 +2398,6 @@ def test_build_summarized_result_rank_fail_not_available_resummarize():
     assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
 
 
-def test_build_classification_result_containment_threshold():
-    "basic functionality: build classification result using containment threshold"
-    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-    gather_results = [{}, {"name": 'gB'}]
-    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
-    q_res.build_classification_result(containment_threshold=0.1)
-    print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
-    q_res.build_classification_result(containment_threshold=0.4)
-    print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4,
-                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
-    q_res.build_classification_result(containment_threshold=1.0)
-    print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, 
-                                                               lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4,
-                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
-
-
-
 def test_build_classification_result_containment_threshold_fail():
     "classification result: improper containment threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
@@ -2396,26 +2413,80 @@ def test_build_classification_result_containment_threshold_fail():
     assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc)
 
 
+
+def test_build_classification_result_containment_threshold():
+    "basic functionality: build classification result using containment threshold"
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+
+    q_res.build_classification_result(containment_threshold=0.1)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
+
+    q_res.build_classification_result(containment_threshold=0.4)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'phylum'
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b")
+    assert q_res.classification_result.f_weighted_at_rank == 0.4
+    assert q_res.classification_result.fraction == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 40
+    assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2)
+
+    q_res.build_classification_result(containment_threshold=1.0)
+    print("classif: ", q_res.classification_result)
+    assert q_res.classification_result.status == 'below_threshold'
+    assert q_res.classification_result.rank == 'superkingdom'
+    assert q_res.classification_result.fraction == 0.2
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a")
+    assert q_res.classification_result.f_weighted_at_rank == 0.4
+    assert q_res.classification_result.bp_match_at_rank == 40
+    assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2)
+
+
 def test_build_classification_result_ani_threshold():
     "basic functionality: build classification result"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+
     q_res.build_classification_result(ani_threshold=.92)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
-    q_res.build_classification_result(ani_threshold=0.94)
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
+
+    q_res.build_classification_result(ani_threshold=0.94) # should classify at phylum
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4,
-                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'phylum'
+    assert q_res.classification_result.fraction == 0.2
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b")
+    assert q_res.classification_result.f_weighted_at_rank == 0.4
+    assert q_res.classification_result.bp_match_at_rank == 40
+    assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2)
+
+    # superk result, but doesn't meet ANI threshold
     q_res.build_classification_result(ani_threshold=0.96)
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, 
-                                                               lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4,
-                                                               bp_match_at_rank=40,  query_ani_at_rank=approx(0.95, rel=1e-2))
+    assert q_res.classification_result.status == 'below_threshold'
+    assert q_res.classification_result.rank == 'superkingdom'
+    assert q_res.classification_result.fraction == 0.2
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a")
+    assert q_res.classification_result.f_weighted_at_rank == 0.4
+    assert q_res.classification_result.bp_match_at_rank == 40
+    assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2)
 
 
 def test_build_classification_result_ani_threshold_fail():
@@ -2470,68 +2541,95 @@ def test_build_classification_result_rank_containment_threshold():
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+
     q_res.build_classification_result(rank='class')
     print("classif: ", q_res.classification_result)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
+
     q_res.build_classification_result(rank='class', containment_threshold=0.4)
-    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
-    
+    assert q_res.classification_result.status == 'below_threshold'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
+
+
 def test_build_classification_result_rank_ani_threshold():
     "classification result with rank and ANI threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
+
     q_res.build_classification_result(rank='class', ani_threshold=0.92)
-    assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    assert q_res.classification_result.status == 'match'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
+
     q_res.build_classification_result(rank='class', ani_threshold=0.95)
-    assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, 
-                                                               lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2,
-                                                               bp_match_at_rank=20,  query_ani_at_rank=approx(0.928, rel=1e-2))
+    assert q_res.classification_result.status == 'below_threshold'
+    assert q_res.classification_result.rank == 'class'
+    assert q_res.classification_result.fraction == 0.1
+    assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c")
+    assert q_res.classification_result.f_weighted_at_rank == 0.2
+    assert q_res.classification_result.bp_match_at_rank == 20
+    assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
 
 
-def test_krona_classification_result():
+def test_krona_classified():
     "basic functionality: build classification result using containment threshold"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
     q_res.build_classification_result()
-    assert q_res.krona_classification_result == None
+    assert q_res.krona_classified == None
     q_res.build_classification_result(rank='phylum')#, force_resummarize=True)
-    print(q_res.krona_classification_result)
-    assert q_res.krona_classification_result == (0.4, 'a', 'b')
+    print(q_res.krona_classified)
+    assert q_res.krona_classified == (0.4, 'a', 'b')
+    assert q_res.krona_unclassified == (0.6, 'unclassified', 'unclassified')
     q_res.build_classification_result(rank='superkingdom')
-    print(q_res.krona_classification_result)
-    assert q_res.krona_classification_result == (0.4, 'a')
+    print(q_res.krona_classified)
+    assert q_res.krona_classified == (0.4, 'a')
+    assert q_res.krona_unclassified == (0.6, 'unclassified')
+    # make sure this goes back to None if we reclassify without rank
     q_res.build_classification_result()
-    assert q_res.krona_classification_result == None
+    assert q_res.krona_classified == None
+    assert q_res.krona_unclassified == None
+    assert q_res.krona_header == []
 
-def test_make_krona_header_0():
+
+def test_make_krona_header_basic():
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     phy_header = ["fraction", "superkingdom", "phylum"]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
     q_res.build_classification_result(rank='phylum')
-    print(q_res.krona_classification_result)
-    print(q_res.krona_classification_header)
-    assert q_res.krona_classification_header == phy_header
+    print(q_res.krona_classified)
+    print(q_res.krona_header)
+    assert q_res.krona_header == phy_header
     hd = q_res.make_krona_header('phylum')
     print("header: ", hd)
     assert hd == phy_header
 
 
-def test_make_krona_header_1():
+def test_make_krona_header_basic_1():
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
     gather_results = [{}, {"name": 'gB'}]
     class_header = ["fraction", "superkingdom", "phylum", "class"]
     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True)
     q_res.build_classification_result(rank='class')
-    assert q_res.krona_classification_header == class_header
+    assert q_res.krona_header == class_header
     hd = q_res.make_krona_header(min_rank='class')
     print("header: ", hd)
     assert hd == class_header

From cbb1e65e71e6868f312e8bceb17117ad06ce70bb Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 13:58:33 -0800
Subject: [PATCH 15/21] human summary dict tests

---
 src/sourmash/tax/tax_utils.py |   4 +-
 tests/test_tax_utils.py       | 106 +++++++++++++++++-----------------
 2 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 1f255e427e..853f66f520 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1595,7 +1595,7 @@ def as_summary_dict(self, query_info=None, limit_float=False):
         if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini?
             sD['lineage'] = "unclassified"
         else:
-            sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
+            sD['lineage'] = self.lineage.display_lineage() # null_as_unclassified=True
         if query_info is not None:
             sD['query_name'] = query_info.query_name
             sD['query_md5'] = query_info.query_md5
@@ -1889,7 +1889,7 @@ def make_human_summary(self, display_rank, classification=False):
             display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank)
 
         for res in display_rank_results:
-            results.append(res.as_human_friendly_dict())
+            results.append(res.as_human_friendly_dict(query_info=self.query_info))
         return results
 
     def make_full_summary(self, classification=False, limit_float=False):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index e3021d93ab..2451a27575 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2647,56 +2647,56 @@ def test_make_krona_header_fail():
     assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value)
 
 
-# def test_make_human_summary():
-#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-#     gather_results = [{}, {"name": 'gB'}]
-#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
-#     hs = q_res.make_human_summary(display_rank = "superkingdom")
-#     print(hs)
-#     assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 
-#                    'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
-#                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
-#                    'total_weighted_hashes': 0}, 
-#                   {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a",
-#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
-#                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
-
-
-# def test_make_human_summary_2():
-#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-#     gather_results = [{}, {"name": 'gB'}]
-#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
-#     hs = q_res.make_human_summary(display_rank = "phylum")
-#     print(hs)
-#     assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 
-#                    'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
-#                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
-#                    'total_weighted_hashes': 0}, 
-#                   {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
-#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
-#                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
-
-
-# def test_make_human_summary_classification():
-#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-#     gather_results = [{}, {"name": 'gB'}]
-#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom")
-#     hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True)
-#     print(hs)
-#     assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a',
-#                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
-#                   'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1',
-#                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
-
-
-# def test_make_human_summary_classification_2():
-#     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
-#     gather_results = [{}, {"name": 'gB'}]
-#     q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum")
-#     hs = q_res.make_human_summary(display_rank = "phylum", classification=True)
-#     print(hs)
-#     assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
-#                    'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
-#                    'query_ani_at_rank': '94.9%', 'status': 'match',
-#                    'query_name': 'q1', 'query_md5': 'md5',
-#                    'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+def test_make_human_summary():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    hs = q_res.make_human_summary(display_rank = "superkingdom")
+    print(hs)
+    assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified',
+                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': 0},
+                  {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a",
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
+                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+def test_make_human_summary_2():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    hs = q_res.make_human_summary(display_rank = "phylum")
+    print(hs)
+    assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified',
+                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': 0},
+                  {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
+                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+def test_make_human_summary_classification():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom")
+    hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True)
+    print(hs)
+    assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a',
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+                  'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1',
+                  'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+
+
+def test_make_human_summary_classification_2():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum")
+    hs = q_res.make_human_summary(display_rank = "phylum", classification=True)
+    print(hs)
+    assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
+                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+                   'query_ani_at_rank': '94.9%', 'status': 'match',
+                   'query_name': 'q1', 'query_md5': 'md5',
+                   'query_filename': 'query_fn', 'total_weighted_hashes': 0}]

From d4c43fdcae653ba8811edc59ba63b989b0f0d638 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 15:40:17 -0800
Subject: [PATCH 16/21] add value checks and tests for SGR,CR classes

---
 src/sourmash/tax/tax_utils.py | 100 +++++++++--------
 tests/test_tax_utils.py       | 206 +++++++++++++++++++++++++++++++---
 2 files changed, 248 insertions(+), 58 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 853f66f520..879a486d37 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -37,6 +37,9 @@
 # Essential Gather column names that must be in gather_csv to allow `tax` summarization
 EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename')
 
+RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
+                        "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"}
+
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
 from sourmash.lca.lca_utils import (taxlist, display_lineage, pop_to_rank)
@@ -737,9 +740,6 @@ def write_kreport(summarized_gather, csv_fp, *, sep='\t'):
     columns = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"]
     w = csv.DictWriter(csv_fp, columns, delimiter=sep)
 
-    rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
-                 "order": "O", "family":"F", "genus": "G", "species": "S"} # , "": "U"
-
     # check - are we using v4.5.0 or later gather CSVs?
     for rank, rank_results in summarized_gather.items():
         for res in rank_results:
@@ -748,7 +748,7 @@ def write_kreport(summarized_gather, csv_fp, *, sep='\t'):
 
     unclassified_written=False
     for rank, rank_results in summarized_gather.items():
-        rcode = rankCode[rank]
+        rcode = RANKCODE[rank]
         for res in rank_results:
             # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank.
             # Here, we only need to report it once, since it will be the same fraction for all ranks
@@ -1585,29 +1585,42 @@ class SummarizedGatherResult():
     bp_match_at_rank: int
     query_ani_at_rank: float = None
 
+    def __post_init__(self):
+        self.check_values()
+
+    def check_values(self):
+        if any([self.fraction > 1, self.f_weighted_at_rank > 1]):
+            raise ValueError(f"Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query.")
+        # is this true for weighted too, or is that set to 0 when --ignore-abundance is used?
+        if any([self.fraction <=0, self.f_weighted_at_rank <= 0]): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it.
+            raise ValueError(f"Summarized fraction is <=0% of the query! This should not occur.")
+
     def set_query_ani(self, query_info):
         self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled,
                                                          n_unique_kmers=query_info.query_n_hashes,
                                                          sequence_len_bp=query_info.query_bp).ani
 
-    def as_summary_dict(self, query_info=None, limit_float=False):
+    def as_summary_dict(self, query_info, limit_float=False):
         sD = asdict(self)
         if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini?
             sD['lineage'] = "unclassified"
         else:
             sD['lineage'] = self.lineage.display_lineage() # null_as_unclassified=True
-        if query_info is not None:
-            sD['query_name'] = query_info.query_name
-            sD['query_md5'] = query_info.query_md5
-            sD['query_filename'] = query_info.query_filename
-            sD['total_weighted_hashes'] = query_info.total_weighted_hashes
+        sD['query_name'] = query_info.query_name
+        sD['query_md5'] = query_info.query_md5
+        sD['query_filename'] = query_info.query_filename
+        sD['total_weighted_hashes'] = str(query_info.total_weighted_hashes)
+        sD['bp_match_at_rank'] = str(self.bp_match_at_rank)
         if limit_float:
             sD['fraction'] = f'{self.fraction:.3f}'
             sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}'
+        else:
+            sD['fraction'] = str(self. fraction)
+            sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank)
 
         return(sD)
 
-    def as_human_friendly_dict(self, query_info = None):
+    def as_human_friendly_dict(self, query_info):
         sD = self.as_summary_dict(query_info=query_info, limit_float=True)
         sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%"
         if sD['query_ani_at_rank'] is not None:
@@ -1616,25 +1629,26 @@ def as_human_friendly_dict(self, query_info = None):
             sD['query_ani_at_rank'] = '-    '
         return sD
 
-    def as_kreport_dict(self, rankCode, total_weighted_hashes):
-         sD = asdict(self)
-         this_rank = self.lineage.lowest_rank
-         this_sciname = self.lineage.lowest_lineage_name
-         sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid
-         sD['sci_name'] = this_sciname
-         #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
-         sD['rank_code'] = rankCode[this_rank]
-         sD['num_bp_assigned'] = 0
-         # total percent containment, weighted to include abundance info
-         sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
-         sD["num_bp_contained"] = int(self.f_weighted_at_rank * total_weighted_hashes)
-         # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
-         # at genome level, but since kreport traditionally doesn't include 'strain' or genome,
-         # it is reasonable to state that sourmash assigns at 'species' level for this.
-         # can be modified later.
-         lowest_assignment_rank = 'species'
-         if this_rank == lowest_assignment_rank or this_sciname == "unclassified":
-            sD["num_bp_assigned"] = sD["num_bp_contained"]
+    def as_kreport_dict(self, query_info):
+        sD = {}
+        this_rank = self.lineage.lowest_rank
+        this_sciname = self.lineage.lowest_lineage_name
+        sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid
+        sD['sci_name'] = this_sciname
+        #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
+        sD['rank_code'] = RANKCODE[this_rank]
+        sD['num_bp_assigned'] = str(0)
+        # total percent containment, weighted to include abundance info
+        sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
+        sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_hashes))
+        # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
+        # at genome level, but since kreport traditionally doesn't include 'strain' or genome,
+        # it is reasonable to state that sourmash assigns at 'species' level for this.
+        # can be modified later.
+        lowest_assignment_rank = 'species'
+        if this_rank == lowest_assignment_rank or this_sciname == "unclassified":
+           sD["num_bp_assigned"] = sD["num_bp_contained"]
+        return sD
 
 @dataclass
 class ClassificationResult(SummarizedGatherResult):
@@ -1642,23 +1656,20 @@ class ClassificationResult(SummarizedGatherResult):
     status: str = field(init=False)
 
     def __post_init__(self):
-        self.status = None
+        # check for out of bounds values, default "nomatch" if no match at all
+        self.check_values()
+        self.status = 'nomatch' #None?
 
     def set_status(self, query_info, containment_threshold=None, ani_threshold=None):
-        # check for out of bounds values, set "nomatch" if no match at all
-        if self.f_weighted_at_rank > 1:
-            raise ValueError(f"The tax classification of query '{query_info.query_name}' is {self.f_weighted_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
-        elif self.f_weighted_at_rank == 0: #no annotated results for this query. This also  shouldn't happen, right?
-            self.status="nomatch"
-        else:
-            # if any matches, use 'below_threshold' as default; set 'match' if meets threshold
+        # if any matches, use 'below_threshold' as default; set 'match' if meets threshold
+        if any([containment_threshold, ani_threshold]):
             self.status="below_threshold"
-            self.set_query_ani(query_info=query_info)
-            if ani_threshold:  # if provided, just use ani thresh, don't use containment threshold
-                if self.query_ani_at_rank >= ani_threshold:
-                    self.status = 'match'
-            elif containment_threshold and self.f_weighted_at_rank >= containment_threshold:
-                    self.status = 'match'
+        self.set_query_ani(query_info=query_info)
+        if ani_threshold:  # if provided, just use ani thresh, don't use containment threshold
+            if self.query_ani_at_rank >= ani_threshold:
+                self.status = 'match'
+        elif containment_threshold and self.f_weighted_at_rank >= containment_threshold:
+                self.status = 'match'
 
     def build_krona_result(self, rank=None):
         krona_classified, krona_unclassified = None, None
@@ -1672,7 +1683,6 @@ def build_krona_result(self, rank=None):
             unclassifed_lin = ["unclassified"]*(len_unclassified_lin)
             krona_unclassified = (unclassified_fraction, *unclassifed_lin)
         return krona_classified, krona_unclassified
-
  
 
 @dataclass
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 2451a27575..4852d653b5 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -149,8 +149,185 @@ def test_QueryInfo_missing():
 
 def test_SummarizedGatherResult():
     "basic functionality of SummarizedGatherResult dataclass"
-    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97)
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    print(sgr)
+    assert sgr.rank=='phylum'
+    sumD = sgr.as_summary_dict(query_info=qInf)
+    print(sumD)
+    assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3",
+                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
+                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    hD = sgr.as_human_friendly_dict(query_info=qInf)
+    print(hD)
+    assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%',
+                  'bp_match_at_rank': "30", 'query_ani_at_rank': '-    ', 'query_name': 'q1',
+                  'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    krD = sgr.as_kreport_dict(query_info=qInf)
+    print(krD)
+    assert krD == {'ncbi_taxid': None, 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0",
+                   'percent_containment': '30.00', 'num_bp_contained': "60"}
+
+
+def test_SummarizedGatherResult_set_query_ani():
+    "Check ANI estimation within SummarizedGatherResult dataclass"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    sgr.set_query_ani(query_info=qInf)
+    print(sgr.query_ani_at_rank)
+    assert sgr.query_ani_at_rank == approx(0.949,  rel=1e-3)
+    # ANI can be calculated with query_bp OR query_n_hashes. Remove each and check the results are identical
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes=0,ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    sgr.set_query_ani(query_info=qInf)
+    print(sgr.query_ani_at_rank)
+    assert sgr.query_ani_at_rank == approx(0.949,  rel=1e-3)
+    # try without query_bp
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp=0,
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    sgr.set_query_ani(query_info=qInf)
+    print(sgr.query_ani_at_rank)
+    assert sgr.query_ani_at_rank == approx(0.949,  rel=1e-3)
+
+
+def test_SummarizedGatherResult_greater_than_1():
+    "basic functionality of SummarizedGatherResult dataclass"
+    # fraction > 1
+    with pytest.raises(ValueError) as exc:
+        SummarizedGatherResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=1.2, bp_match_at_rank=30)
+    print(str(exc))
+    assert "> 100% of the query!" in str(exc)
+    # f_weighted > 1
+    with pytest.raises(ValueError) as exc:
+        SummarizedGatherResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    print(str(exc))
+    assert "> 100% of the query!" in str(exc)
+
 
+def test_SummarizedGatherResult_0_fraction():
+    with pytest.raises(ValueError) as exc:
+        SummarizedGatherResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    err_msg = "Summarized fraction is <=0% of the query! This should not occur."
+    assert err_msg in str(exc)
+    #assert cr.status == 'nomatch'
+    
+    with pytest.raises(ValueError) as exc:
+        SummarizedGatherResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0, bp_match_at_rank=30)
+    print(str(exc))
+    assert err_msg in str(exc)
+
+
+def test_SummarizedGatherResult_species_kreport():
+    "basic functionality of SummarizedGatherResult dataclass"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="species", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    print(sgr)
+    assert sgr.rank=='species'
+    krD = sgr.as_kreport_dict(query_info=qInf)
+    print(krD)
+    assert krD == {'ncbi_taxid': None, 'sci_name': 'g', 'rank_code': 'S', 'num_bp_assigned': "60",
+                   'percent_containment': '30.00', 'num_bp_contained': "60"}
+
+
+def test_SummarizedGatherResult_summary_dict_limit_float():
+    "basic functionality of SummarizedGatherResult dataclass"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    sgr = SummarizedGatherResult(rank="phylum", fraction=0.123456, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.345678, bp_match_at_rank=30)
+    print(sgr)
+    assert sgr.rank=='phylum'
+    sumD = sgr.as_summary_dict(query_info=qInf)
+    print(sumD)
+    assert sumD == {'rank': 'phylum', 'fraction': "0.123456", 'lineage': 'a;b', 'f_weighted_at_rank': "0.345678",
+                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
+                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+    
+    sumD = sgr.as_summary_dict(query_info=qInf, limit_float=True)
+    print(sumD)
+    assert sumD == {'rank': 'phylum', 'fraction': "0.123", 'lineage': 'a;b', 'f_weighted_at_rank': "0.346",
+                    'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1',
+                    'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"}
+
+
+def test_ClassificationResult():
+    "basic functionality of ClassificationResult dataclass"
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                              f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97)
+    cr.set_status(query_info=qInf, containment_threshold=0.1)
+    assert cr.status == 'match'
+    print(cr.query_ani_at_rank)
+    assert cr.query_ani_at_rank == approx(0.949,  rel=1e-3)
+    cr.set_status(query_info=qInf, containment_threshold=0.35)
+    assert cr.status == 'below_threshold'
+
+
+def test_ClassificationResult_greater_than_1():
+    "basic functionality of SummarizedGatherResult dataclass"
+    # fraction > 1
+    with pytest.raises(ValueError) as exc:
+        ClassificationResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=1.2, bp_match_at_rank=30)
+    print(str(exc))
+    assert "> 100% of the query!" in str(exc)
+    # f_weighted > 1
+    with pytest.raises(ValueError) as exc:
+        ClassificationResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    print(str(exc))
+    assert "> 100% of the query!" in str(exc)
+
+
+def test_ClassificationResult_0_fraction():
+    with pytest.raises(ValueError) as exc:
+        ClassificationResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0.3, bp_match_at_rank=30)
+    err_msg = "Summarized fraction is <=0% of the query! This should not occur."
+    assert err_msg in str(exc)
+    #assert cr.status == 'nomatch'
+    
+    with pytest.raises(ValueError) as exc:
+        ClassificationResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"),
+                                 f_weighted_at_rank=0, bp_match_at_rank=30)
+    print(str(exc))
+    assert err_msg in str(exc)
+
+
+def test_ClassificationResult_build_krona_result():
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                              f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97)
+    #cr.set_status(query_info=qInf, rank='phylum')
+    kr, ukr = cr.build_krona_result(rank='phylum')
+    print(kr)
+    assert kr == (0.3, 'a', 'b')
+    print(ukr)
+    assert ukr == (0.7, 'unclassified', 'unclassified')  
+
+
+def test_ClassificationResult_build_krona_result_no_rank():
+    qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',
+                     query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200')
+    cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"),
+                              f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97)
+    cr.set_status(query_info=qInf, containment_threshold=0.1)
 
 
 def test_GatherRow_old_gather():
@@ -1902,6 +2079,9 @@ def test_QueryTaxResult():
     assert q_res.skipped_idents == set()
     assert q_res.missed_idents == set()
     assert q_res.summarized_lineage_results == {}
+    taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    assert q_res.ranks == taxranks
+    assert q_res.ascending_ranks == taxranks[::-1]
 
 
 def test_QueryTaxResult_add_incompatible():
@@ -2654,12 +2834,12 @@ def test_make_human_summary():
     hs = q_res.make_human_summary(display_rank = "superkingdom")
     print(hs)
     assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified',
-                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '-    ',
                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
-                   'total_weighted_hashes': 0},
+                   'total_weighted_hashes': "0"},
                   {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a",
-                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
-                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%',
+                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}]
 
 
 def test_make_human_summary_2():
@@ -2669,12 +2849,12 @@ def test_make_human_summary_2():
     hs = q_res.make_human_summary(display_rank = "phylum")
     print(hs)
     assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified',
-                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '-    ',
+                   'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '-    ',
                    'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
-                   'total_weighted_hashes': 0},
+                   'total_weighted_hashes': "0"},
                   {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
-                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%',
-                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%',
+                  'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}]
 
 
 def test_make_human_summary_classification():
@@ -2684,9 +2864,9 @@ def test_make_human_summary_classification():
     hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True)
     print(hs)
     assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a',
-                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+                  'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40",
                   'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1',
-                  'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+                  'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}]
 
 
 def test_make_human_summary_classification_2():
@@ -2696,7 +2876,7 @@ def test_make_human_summary_classification_2():
     hs = q_res.make_human_summary(display_rank = "phylum", classification=True)
     print(hs)
     assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b',
-                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40,
+                   'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40",
                    'query_ani_at_rank': '94.9%', 'status': 'match',
                    'query_name': 'q1', 'query_md5': 'md5',
-                   'query_filename': 'query_fn', 'total_weighted_hashes': 0}]
+                   'query_filename': 'query_fn', 'total_weighted_hashes': "0"}]

From 724bf74157e90e49f80fcc7778ea86269c06c94c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 16:20:11 -0800
Subject: [PATCH 17/21] test make_full_summary

---
 src/sourmash/tax/tax_utils.py |  45 ++++++++------
 tests/test_tax_utils.py       | 114 +++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 20 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 879a486d37..f15b954db8 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1614,8 +1614,10 @@ def as_summary_dict(self, query_info, limit_float=False):
         if limit_float:
             sD['fraction'] = f'{self.fraction:.3f}'
             sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}'
+            if self.query_ani_at_rank:
+                sD['query_ani_at_rank'] = f'{self.query_ani_at_rank:.3f}'#f"{self.query_ani_at_rank*100:>3.1f}%"
         else:
-            sD['fraction'] = str(self. fraction)
+            sD['fraction'] = str(self.fraction)
             sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank)
 
         return(sD)
@@ -1623,7 +1625,7 @@ def as_summary_dict(self, query_info, limit_float=False):
     def as_human_friendly_dict(self, query_info):
         sD = self.as_summary_dict(query_info=query_info, limit_float=True)
         sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%"
-        if sD['query_ani_at_rank'] is not None:
+        if self.query_ani_at_rank is not None:
             sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%"
         else:
             sD['query_ani_at_rank'] = '-    '
@@ -1694,6 +1696,7 @@ def __post_init__(self):
         self.query_name = self.query_info.query_name # for convenience
         self._init_taxresult_vars()
         self._init_summarization_vars()
+        self._init_classification_results()
 
     def _init_taxresult_vars(self):
         self.ranks = []
@@ -1801,24 +1804,23 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False):
             sorted_sum_uniq_to_query = list(sum_uniq_to_query.items())
             sorted_sum_uniq_to_query.sort(key = lambda x: -x[1])
             for lineage, f_unique in sorted_sum_uniq_to_query:
-                query_ani = None
-                if f_unique > 1:
-                    raise ValueError(f"The tax summary of query '{self.query_name}' is {f_unique}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.")
-                elif f_unique == 0: #no annotated results for this query. do we need to handle this differently now?
+                # does this ever happen? do we need it?
+                if f_unique == 0: #no annotated results for this query. do we need to handle this differently now?
                     continue
                 f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage]
                 bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage]
-                # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank)
-                # now that I pulled best_only into separate fn, these don't need to be dicts...
-                self.total_f_classified[rank] += f_unique
-                self.total_f_weighted[rank] += f_weighted_at_rank
-                self.total_bp_classified[rank] += bp_intersect_at_rank
                 sres = SummarizedGatherResult(lineage=lineage, rank=rank,
                                               f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique,
                                               bp_match_at_rank=bp_intersect_at_rank)
                 sres.set_query_ani(query_info=self.query_info)
                 self.summarized_lineage_results[rank].append(sres)
 
+                # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank)
+                # now that I pulled best_only into separate fn, these don't need to be dicts...
+                self.total_f_classified[rank] += f_unique
+                self.total_f_weighted[rank] += f_weighted_at_rank
+                self.total_bp_classified[rank] += bp_intersect_at_rank
+
             # record unclassified
             lineage = ()
             query_ani = None
@@ -1886,15 +1888,21 @@ def make_krona_header(self, min_rank):
             rank_index = self.ranks.index(min_rank)
         return ["fraction"] + list(self.ranks[:rank_index+1])
 
+    def check_classification(self):
+        if not self.classification_result:
+            raise ValueError("query not classified yet.")
+
+    def check_summarization(self):
+        if not self.summarized_lineage_results:
+            raise ValueError("lineages not summarized yet.")
+
     def make_human_summary(self, display_rank, classification=False):
         results = []
         if classification:
-            if not self.classification_result:
-                raise ValueError("query not classified yet.")
+            self.check_classification()
             display_rank_results = [self.classification_result]
         else:
-            if not self.summarized_lineage_results:
-                raise ValueError("lineages not summarized yet.")
+            self.check_summarization()
             display_rank_results = self.summarized_lineage_results[display_rank]
             display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank)
 
@@ -1905,19 +1913,17 @@ def make_human_summary(self, display_rank, classification=False):
     def make_full_summary(self, classification=False, limit_float=False):
         results = []
         if classification:
+            self.check_classification()
             header= ["query_name", "status", "rank", "fraction", "lineage",
                      "query_md5", "query_filename", "f_weighted_at_rank",
                      "bp_match_at_rank", "query_ani_at_rank"]
-            if not self.classification_result:
-                raise ValueError("query not classified yet.")
             rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float)
             results.append(rD)
         else:
+            self.check_summarization()
             header= ["query_name", "rank", "fraction", "lineage", "query_md5",
                      "query_filename", "f_weighted_at_rank", "bp_match_at_rank",
                      "query_ani_at_rank", "total_weighted_hashes"]
-            if not self.summarized_lineage_results:
-                raise ValueError("lineages not summarized yet.")
 
             for rank in self.summarized_ranks[::-1]:
                 rank_results = self.summarized_lineage_results[rank]
@@ -1927,6 +1933,7 @@ def make_full_summary(self, classification=False, limit_float=False):
         return header, results
 
     def make_kreport_results(self):
+        self.check_summarization()
         rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
                         "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"}
         if self.query_info.total_weighted_hashes == 0:
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 4852d653b5..5a73060604 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2563,7 +2563,7 @@ def test_QueryTaxResult_build_summarized_result_over100percent():
     with pytest.raises(ValueError) as exc:
         q_res.build_summarized_result()
     print(str(exc))
-    assert "The tax summary of query 'q1' is 1.05, which is > 100% of the query!! This should not be possible." in str(exc)
+    assert "Summarized fraction is > 100% of the query! This should not be possible" in str(exc)
 
 
 def test_build_summarized_result_rank_fail_not_available_resummarize():
@@ -2694,6 +2694,7 @@ def test_build_classification_result_rank_fail_not_filled():
     print(str(exc))
     assert "Error: rank 'order' was not available for any matching lineages." in str(exc)
 
+
 def test_build_classification_result_rank_fail_not_available_resummarize():
     "classification result: rank not available (wasn't summarized)"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
@@ -2705,6 +2706,7 @@ def test_build_classification_result_rank_fail_not_available_resummarize():
     print(str(exc))
     assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc)
 
+
 def test_build_classification_result_rank_fail_not_available():
     "classification result: rank not available"
     taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
@@ -2880,3 +2882,113 @@ def test_make_human_summary_classification_2():
                    'query_ani_at_rank': '94.9%', 'status': 'match',
                    'query_name': 'q1', 'query_md5': 'md5',
                    'query_filename': 'query_fn', 'total_weighted_hashes': "0"}]
+
+
+def test_make_full_summary():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    header, fs = q_res.make_full_summary()
+    assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 
+                   'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes']
+    print(fs)
+    assert fs == [{'rank': 'superkingdom', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank':
+                   '0.6', 'bp_match_at_rank': '60', 'query_ani_at_rank': None,
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': '0'},
+                   {'rank': 'superkingdom', 'fraction': '0.2', 'lineage': 'a', 'f_weighted_at_rank': '0.4',
+                   'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'phylum', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6',
+                   'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5',
+                   'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'phylum', 'fraction': '0.2', 'lineage': 'a;b', 'f_weighted_at_rank': '0.4',
+                   'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6',
+                   'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5',
+                   'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2',
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3),
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;d','f_weighted_at_rank': '0.2',
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}]
+    
+    header, fs = q_res.make_full_summary(limit_float=True)
+    assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 
+                   'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes']
+    print(fs)
+    assert fs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank':
+                   '0.600', 'bp_match_at_rank': '60', 'query_ani_at_rank': None,
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': '0'},
+                   {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', 'f_weighted_at_rank': '0.400',
+                   'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600',
+                   'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5',
+                   'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '0.400',
+                   'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600',
+                   'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5',
+                   'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200',
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928",
+                   'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'},
+                   {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;d','f_weighted_at_rank': '0.200',
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", 'query_name': 'q1',
+                   'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}]
+
+
+def test_make_full_summary_summarization_fail():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_full_summary()
+    print(str(exc))
+    assert 'not summarized yet' in str(exc)
+
+
+def test_make_full_summary_classification():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True)
+    header, fs = q_res.make_full_summary(classification=True)
+    assert header == ["query_name", "status", "rank", "fraction", "lineage",
+                     "query_md5", "query_filename", "f_weighted_at_rank",
+                     "bp_match_at_rank", "query_ani_at_rank"]
+    print(fs)
+    assert fs == [{'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', 
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3),
+                   'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': '0'}]
+
+ 
+def test_make_full_summary_classification_limit_float():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True)
+    header, fs = q_res.make_full_summary(classification=True, limit_float=True)
+    assert header == ["query_name", "status", "rank", "fraction", "lineage",
+                     "query_md5", "query_filename", "f_weighted_at_rank",
+                     "bp_match_at_rank", "query_ani_at_rank"]
+    print(fs)
+    assert fs == [{'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', 
+                   'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928",
+                   'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn',
+                   'total_weighted_hashes': '0'}]
+
+
+def test_make_full_summary_classification_fail():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_full_summary(classification=True)
+    print(str(exc))
+    assert 'not classified yet' in str(exc)
+

From 9dd9adffb0c5357703f57515e8204066561b9c38 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 16:28:48 -0800
Subject: [PATCH 18/21] use f_unique, not f_weighted to preserve current
 functionality

---
 src/sourmash/tax/tax_utils.py | 19 ++++++++++---------
 tests/test_tax_utils.py       |  2 +-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index f15b954db8..2e33c1cd68 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1670,8 +1670,10 @@ def set_status(self, query_info, containment_threshold=None, ani_threshold=None)
         if ani_threshold:  # if provided, just use ani thresh, don't use containment threshold
             if self.query_ani_at_rank >= ani_threshold:
                 self.status = 'match'
-        elif containment_threshold and self.f_weighted_at_rank >= containment_threshold:
-                self.status = 'match'
+        # should we switch to using weighted here? I think yes, but this would be behavior change
+        elif containment_threshold and self.fraction >= containment_threshold:
+        #elif containment_threshold and self.f_weighted_at_rank >= containment_threshold:
+            self.status = 'match'
 
     def build_krona_result(self, rank=None):
         krona_classified, krona_unclassified = None, None
@@ -1854,15 +1856,14 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
             f_weighted=0.0
             f_unique_at_rank=0.0
             bp_intersect_at_rank=0
-            #sum_uniq_to_query = self.sum_uniq_to_query[this_rank]
-            sum_uniq_weighted = self.sum_uniq_weighted[this_rank]  ##SHOULD WE BE USING WEIGHTED HERE? I THINK YES, but this is a change from before.
+            sum_uniq_to_query = self.sum_uniq_to_query[this_rank]
             # sort the results and grab best
-            sorted_sum_uniq_weighted = list(sum_uniq_weighted.items())
-            sorted_sum_uniq_weighted.sort(key = lambda x: -x[1])
-            # best only
-            this_lineage, f_weighted = sorted_sum_uniq_weighted[0]
-            f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage]
+            sorted_sum_uniq_to_query = list(sum_uniq_to_query.items())
+            sorted_sum_uniq_to_query.sort(key = lambda x: -x[1])
+            # select best-at-rank only
+            this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0]
             bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
+            f_weighted = self.sum_uniq_weighted[this_rank][this_lineage]
 
             classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage,
                                            f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank)
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 5a73060604..e1894a7ff5 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2610,7 +2610,7 @@ def test_build_classification_result_containment_threshold():
     assert q_res.classification_result.bp_match_at_rank == 20
     assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2)
 
-    q_res.build_classification_result(containment_threshold=0.4)
+    q_res.build_classification_result(containment_threshold=0.2)
     print("classif: ", q_res.classification_result)
     assert q_res.classification_result.status == 'match'
     assert q_res.classification_result.rank == 'phylum'

From c33c1374418f46ab3b499a4bcd2129679666f01e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 16:31:02 -0800
Subject: [PATCH 19/21] test no ranks

---
 tests/test_tax_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index e1894a7ff5..e99625c23d 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2069,6 +2069,8 @@ def test_QueryTaxResult():
     taxres = make_TaxResult(taxD=taxD)
     # initialize
     q_res = QueryTaxResult(taxres.query_info)
+    assert q_res.ranks == []
+    assert q_res.ascending_ranks == []
     q_res.add_taxresult(taxres)
     # check that new querytaxres is compatible with taxres
     assert q_res.is_compatible(taxres)

From 15031ef059274cf2c24c00d0997b02b8aa7b249f Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Mon, 9 Jan 2023 17:07:31 -0800
Subject: [PATCH 20/21] test make_kreport_results

---
 src/sourmash/tax/tax_utils.py | 40 ++++++++++++++------------
 tests/test_tax_utils.py       | 54 +++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 18 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 2e33c1cd68..4673803f79 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1632,24 +1632,29 @@ def as_human_friendly_dict(self, query_info):
         return sD
 
     def as_kreport_dict(self, query_info):
+        lowest_assignment_rank = 'species'
         sD = {}
-        this_rank = self.lineage.lowest_rank
-        this_sciname = self.lineage.lowest_lineage_name
-        sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid
-        sD['sci_name'] = this_sciname
-        #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
-        sD['rank_code'] = RANKCODE[this_rank]
         sD['num_bp_assigned'] = str(0)
         # total percent containment, weighted to include abundance info
         sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
         sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_hashes))
-        # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
-        # at genome level, but since kreport traditionally doesn't include 'strain' or genome,
-        # it is reasonable to state that sourmash assigns at 'species' level for this.
-        # can be modified later.
-        lowest_assignment_rank = 'species'
-        if this_rank == lowest_assignment_rank or this_sciname == "unclassified":
-           sD["num_bp_assigned"] = sD["num_bp_contained"]
+        # could make this cleaner if used empty RankLineageInfo()
+        #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True)
+        if self.lineage != ():
+            this_rank = self.lineage.lowest_rank
+            sD['rank_code'] = RANKCODE[this_rank]
+            sD['sci_name'] = self.lineage.lowest_lineage_name
+            sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid
+            # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
+            # at genome level, but since kreport traditionally doesn't include 'strain' or genome,
+            # it is reasonable to state that sourmash assigns at 'species' level for this.
+            # can be modified later.
+            if this_rank == lowest_assignment_rank:
+                sD["num_bp_assigned"] = sD["num_bp_contained"]
+        else:
+            sD['sci_name'] = 'unclassified'
+            sD['rank_code'] = RANKCODE['unclassified']
+            sD["num_bp_assigned"] = sD["num_bp_contained"]
         return sD
 
 @dataclass
@@ -1935,12 +1940,11 @@ def make_full_summary(self, classification=False, limit_float=False):
 
     def make_kreport_results(self):
         self.check_summarization()
-        rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C",
-                        "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"}
         if self.query_info.total_weighted_hashes == 0:
             raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0")
-        required_ranks = set(rankCode.keys).pop('unclassified')
-        if not set(required_ranks).issubset(set(self.ranks)):
+        required_ranks = set(RANKCODE.keys())
+        acceptable_ranks = list(self.ranks) + ['unclassified', 'kingdom']
+        if not required_ranks.issubset(set(acceptable_ranks)):
             raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}")
         kreport_results = []
         unclassified_recorded=False
@@ -1950,7 +1954,7 @@ def make_kreport_results(self):
                 continue
             rank_results = self.summarized_lineage_results[rank]
             for res in rank_results:
-                kresD = res.as_kreport_dict(rankCode, self.query_info.total_weighted_hashes)
+                kresD = res.as_kreport_dict(self.query_info)
                 if kresD['sci_name'] == "unclassified":
                     # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank.
                     # Here, we only need to report it once, since it will be the same fraction for all ranks
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index e99625c23d..55f2543e9d 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -2994,3 +2994,57 @@ def test_make_full_summary_classification_fail():
     print(str(exc))
     assert 'not classified yet' in str(exc)
 
+
+def test_make_kreport_results():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;c;d;e;f;g")])
+    #need to go down to species to check that `num_bp_assigned` is happening correctly
+    gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    krepD = q_res.make_kreport_results()
+    print(krepD)
+    assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40',
+                    'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60',
+                    'sci_name': 'unclassified', 'rank_code': 'U'},
+                    {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40',
+                    'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40',
+                    'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20',
+                    'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20',
+                    'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20',
+                    'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': None},
+                    {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20',
+                    'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': None}]
+
+
+def test_make_kreport_results_fail():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_kreport_results()
+    print(str(exc))
+    assert 'not summarized yet' in str(exc)
+
+
+def test_make_kreport_results_fail_pre_v450():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_kreport_results()
+    print(str(exc))
+    assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc)
+
+
+def test_make_kreport_results_fail_pre_v450():
+    taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")])
+    gather_results = [{}, {"name": 'gB'}]
+    q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True)
+    with pytest.raises(ValueError) as exc:
+        q_res.make_kreport_results()
+    print(str(exc))
+    assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc)

From 826bc6b859109634f05e9b1ba24cd301912085f2 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Tue, 10 Jan 2023 16:46:02 -0800
Subject: [PATCH 21/21] fix lineagepair

---
 tests/test_tax_utils.py | 116 ++++++++++++++++++++--------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 181c93623b..832395252f 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -691,7 +691,7 @@ def test_summarize_gather_at_0():
     assert sk_sum[0].query_md5 == "queryA_md5"
     assert sk_sum[0].query_filename == "queryA.sig"
     assert sk_sum[0].rank == 'superkingdom'
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 1.0
     assert sk_sum[0].f_weighted_at_rank == 1.0
     assert sk_sum[0].bp_match_at_rank == 100
@@ -704,7 +704,7 @@ def test_summarize_gather_at_0():
     assert phy_sum[0].query_md5 == "queryA_md5"
     assert phy_sum[0].query_filename == "queryA.sig"
     assert phy_sum[0].rank == 'phylum'
-    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b'))
     assert phy_sum[0].fraction == 1.0
     assert phy_sum[0].f_weighted_at_rank == 1.0
     assert phy_sum[0].bp_match_at_rank == 100
@@ -716,16 +716,16 @@ def test_summarize_gather_at_0():
     assert cl_sum[0].query_md5 == "queryA_md5"
     assert cl_sum[0].query_filename == "queryA.sig"
     assert cl_sum[0].rank == 'class'
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),
+                                 LineagePair(rank='phylum', name='b'),
+                                 LineagePair(rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].f_weighted_at_rank == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='d'))
+    assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='d'))
     assert cl_sum[1].fraction == 0.5
     assert cl_sum[1].f_weighted_at_rank == 0.5
     assert cl_sum[1].bp_match_at_rank == 50
@@ -750,7 +750,7 @@ def test_summarize_gather_at_1():
     # superkingdom
     assert len(sk_sum) == 2
     print("\nsuperkingdom summarized gather 0: ", sk_sum[0])
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
     print("superkingdom summarized gather 1: ", sk_sum[1])
@@ -765,7 +765,7 @@ def test_summarize_gather_at_1():
     print("phylum summarized gather 0: ", phy_sum[0])
     print("phylum summarized gather 1: ", phy_sum[1])
     assert len(phy_sum) == 2
-    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
     assert phy_sum[0].f_weighted_at_rank == 0.6
     assert phy_sum[0].bp_match_at_rank == 70
@@ -778,18 +778,18 @@ def test_summarize_gather_at_1():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True)
     assert len(cl_sum) == 3
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='c'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].f_weighted_at_rank == 0.5
     assert cl_sum[0].bp_match_at_rank == 60
     assert cl_sum[0].query_ani_at_rank == 0.9836567776983505
 
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='d'))
+    assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='d'))
     assert cl_sum[1].fraction == 0.1
     assert cl_sum[1].f_weighted_at_rank == 0.1
     assert cl_sum[1].bp_match_at_rank == 10
@@ -816,7 +816,7 @@ def test_summarize_gather_at_perfect_match():
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 1.0
 
 
@@ -846,15 +846,15 @@ def test_summarize_gather_at_over100percent_f_unique_to_query():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res)
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='d'))
+    assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='d'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].bp_match_at_rank == 60
     assert cl_sum[1].rank == 'class'
-    assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='c'))
     assert cl_sum[1].fraction == 0.5
     assert cl_sum[1].bp_match_at_rank == 50
 
@@ -875,7 +875,7 @@ def test_summarize_gather_at_missing_ignore():
     # superkingdom
     assert len(sk_sum) == 2
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.5
     assert sk_sum[0].bp_match_at_rank == 50
     assert sk_sum[1].lineage == ()
@@ -886,7 +886,7 @@ def test_summarize_gather_at_missing_ignore():
     phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB'])
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 2
-    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.5
     assert phy_sum[0].bp_match_at_rank == 50
     assert phy_sum[1].lineage == ()
@@ -896,9 +896,9 @@ def test_summarize_gather_at_missing_ignore():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB'])
     assert len(cl_sum) == 2
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
     assert cl_sum[1].lineage == ()
@@ -941,7 +941,7 @@ def test_summarize_gather_at_best_only_0():
     # superkingdom
     assert len(sk_sum) == 1
     print("superkingdom summarized gather: ", sk_sum[0])
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.7
     assert sk_sum[0].bp_match_at_rank == 70
     print("superk ANI:",sk_sum[0].query_ani_at_rank)
@@ -951,7 +951,7 @@ def test_summarize_gather_at_best_only_0():
     phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True)
     print("phylum summarized gather: ", phy_sum[0])
     assert len(phy_sum) == 1
-    assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b'))
+    assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b'))
     assert phy_sum[0].fraction == 0.7
     assert phy_sum[0].bp_match_at_rank == 70
     print("phy ANI:",phy_sum[0].query_ani_at_rank)
@@ -960,9 +960,9 @@ def test_summarize_gather_at_best_only_0():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='c'))
     assert cl_sum[0].fraction == 0.6
     assert cl_sum[0].bp_match_at_rank == 60
     print("cl ANI:",cl_sum[0].query_ani_at_rank)
@@ -986,9 +986,9 @@ def test_summarize_gather_at_best_only_equal_choose_first():
     cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True)
     assert len(cl_sum) == 1
     print("class summarized gather: ", cl_sum)
-    assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                 lca_utils.LineagePair(rank='phylum', name='b'),
-                                 lca_utils.LineagePair(rank='class', name='c'))
+    assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),
+                                 LineagePair (rank='phylum', name='b'),
+                                 LineagePair (rank='class', name='c'))
     assert cl_sum[0].fraction == 0.5
     assert cl_sum[0].bp_match_at_rank == 50
 
@@ -999,14 +999,14 @@ def test_write_summary_csv(runtmp):
     sum_gather = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=1.0,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(LineagePair (rank='superkingdom', name='a'),),
                                                           query_ani_at_rank=None,
                                                           total_weighted_hashes=0)],
                   'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=1.0,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                                              lca_utils.LineagePair(rank='phylum', name='b')),
+                                                     lineage=(LineagePair (rank='superkingdom', name='a'),
+                                                              LineagePair (rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
 
@@ -1024,8 +1024,8 @@ def test_write_summary_csv(runtmp):
 def test_write_classification(runtmp):
     """test classification csv write function"""
     classif = ClassInf('queryA', 'match', 'phylum', 1.0,
-                                    (lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                     lca_utils.LineagePair(rank='phylum', name='b')),
+                                    (LineagePair (rank='superkingdom', name='a'),
+                                     LineagePair (rank='phylum', name='b')),
                                      'queryA_md5', 'queryA.sig', 1.0, 100,
                                      query_ani_at_rank=None)
 
@@ -1083,7 +1083,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     print("superkingdom summarized gather results:", sk_sum)
     assert len(sk_sum) ==4
     assert sk_sum[0].query_name == "queryA"
-    assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[0].fraction == 0.9
     assert sk_sum[0].bp_match_at_rank == 160
     # check for unassigned for queryA
@@ -1093,7 +1093,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     assert round(sk_sum[1].fraction,1) == 0.1
     # queryB
     assert sk_sum[2].query_name == "queryB"
-    assert sk_sum[2].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),)
+    assert sk_sum[2].lineage == (LineagePair (rank='superkingdom', name='a'),)
     assert sk_sum[2].fraction == 0.3
     assert sk_sum[2].bp_match_at_rank == 60
     # check for unassigned for queryA
@@ -1103,7 +1103,7 @@ def test_aggregate_by_lineage_at_rank_by_query():
     assert sk_sum[3].bp_match_at_rank == 140
     sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True)
     print("superkingdom lineage summary:", sk_lin_sum, '\n')
-    assert sk_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3},
+    assert sk_lin_sum == {(LineagePair (rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3},
                           (): {'queryA': 0.09999999999999998, 'queryB': 0.7}}
     assert num_queries == 2
     assert query_names == ['queryA', 'queryB']
@@ -1112,8 +1112,8 @@ def test_aggregate_by_lineage_at_rank_by_query():
     print("phylum summary:", phy_sum, ']\n')
     phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True)
     print("phylum lineage summary:", phy_lin_sum, '\n')
-    assert phy_lin_sum ==  {(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')): {'queryA': 0.5},
-                            (lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3},
+    assert phy_lin_sum ==  {(LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='b')): {'queryA': 0.5},
+                            (LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3},
                             (): {'queryA': 0.09999999999999998, 'queryB': 0.7}}
     assert num_queries == 2
     assert query_names == ['queryA', 'queryB']
@@ -1223,27 +1223,27 @@ def test_combine_sumgather_csvs_by_lineage(runtmp):
     sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=1.0, bp_match_at_rank=100,
-                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(LineagePair (rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                                              lca_utils.LineagePair(rank='phylum', name='b')),
+                                                     lineage=(LineagePair (rank='superkingdom', name='a'),
+                                                              LineagePair (rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
     sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(LineagePair (rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                                              lca_utils.LineagePair(rank='phylum', name='c')),
+                                                     lineage=(LineagePair (rank='superkingdom', name='a'),
+                                                              LineagePair (rank='phylum', name='c')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
 
@@ -1318,27 +1318,27 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5,
                                                           query_md5='queryA_md5', query_filename='queryA.sig',
                                                           f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(LineagePair (rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5,
                                                      query_md5='queryA_md5', query_filename='queryA.sig',
                                                      f_weighted_at_rank=0.5, bp_match_at_rank=50,
-                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                                              lca_utils.LineagePair(rank='phylum', name='b')),
+                                                     lineage=(LineagePair (rank='superkingdom', name='a'),
+                                                              LineagePair (rank='phylum', name='b')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}
     sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7,
                                                           query_md5='queryB_md5', query_filename='queryB.sig',
                                                           f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                          lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),),
+                                                          lineage=(LineagePair (rank='superkingdom', name='a'),),
                                                            query_ani_at_rank=None,
                                                            total_weighted_hashes=0)],
                   'phylum':  [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7,
                                                      query_md5='queryB_md5', query_filename='queryB.sig',
                                                      f_weighted_at_rank=0.7, bp_match_at_rank=70,
-                                                     lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),
-                                                              lca_utils.LineagePair(rank='phylum', name='c')),
+                                                     lineage=(LineagePair (rank='superkingdom', name='a'),
+                                                              LineagePair (rank='phylum', name='c')),
                                                      query_ani_at_rank=None,
                                                      total_weighted_hashes=0)]}