From b1ed90062d8bd69d7d27803620e698c16a050aa6 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 15:43:17 -0800 Subject: [PATCH 01/21] init new LineagePair and LineageInfo classes --- src/sourmash/lca/lca_utils.py | 4 + src/sourmash/tax/tax_utils.py | 292 +++++++++++++++++++++++++++++++++- tests/test_tax_utils.py | 128 ++++++++------- 3 files changed, 361 insertions(+), 63 deletions(-) diff --git a/src/sourmash/lca/lca_utils.py b/src/sourmash/lca/lca_utils.py index b81f765bb9..8138e38c2f 100644 --- a/src/sourmash/lca/lca_utils.py +++ b/src/sourmash/lca/lca_utils.py @@ -131,6 +131,10 @@ def build_tree(assignments, initial=None): for assignment in assignments: node = tree + # when we switch LineagePair over, will need ot add this. + #if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINSLineageInfo)): + # assignment = assignment.filled_lineage + for lineage_tup in assignment: if lineage_tup.name: child = node.get(lineage_tup, {}) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bba7637833..2b47e672de 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -5,6 +5,9 @@ import csv from collections import namedtuple, defaultdict from collections import abc +from itertools import zip_longest +from typing import NamedTuple +from dataclasses import dataclass, field, replace import gzip from sourmash import sqlite_utils, sourmash_args @@ -36,7 +39,288 @@ # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (LineagePair, taxlist, display_lineage, pop_to_rank) +from sourmash.lca.lca_utils import (taxlist, display_lineage, pop_to_rank) + +class LineagePair(NamedTuple): + rank: str + name: str = "" + taxid: int = None + +@dataclass(frozen=True, order=True) +class BaseLineageInfo: + """BaseLineageInfo Class, defining a set of functions that can be used to handle + hierarchical set of LineagePairs. """ + # need to set compare=False for any mutable type to keep this class hashable + ranks: tuple() # require ranks + lineage: tuple = field(default=()) #tuple of LineagePairs + lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name + + def __post_init__(self): + "Initialize according to passed values" + # ranks must be tuple for hashability + if isinstance(self.ranks, list): + object.__setattr__(self, "ranks", tuple(self.ranks)) + if self.lineage: + self._init_from_lineage_tuples() + elif self.lineage_str is not None: + self._init_from_lineage_str() + elif self.lineage_dict is not None: + self._init_from_lineage_dict() + else: + self._init_empty() + + def __eq__(self, other): + if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False + return False + return all([self.ranks == other.ranks and self.lineage==other.lineage]) + + @property + def taxlist(self): + return self.ranks + + @property + def ascending_taxlist(self): + return self.ranks[::-1] + + @property + def lowest_rank(self): + return self.filled_ranks[-1] + + def rank_index(self, rank): + return self.ranks.index(rank) + + @property + def filled_lineage(self): + """Return lineage down to lowest non-empty rank. Preserves missing ranks above.""" + # Would we prefer this to be the default returned by lineage?? + if not self.filled_ranks: + return () + lowest_filled_rank_idx = self.rank_index(self.filled_ranks[-1]) + return self.lineage[:lowest_filled_rank_idx+1] + + @property + def lowest_lineage_name(self, null_as_unclassified = False): + """Return the name of the lowest filled lineage""" + if not self.filled_ranks: + #if null_as_unclassified: # todo: enable me + # return "unclassified" + #else: + return "" + return self.filled_lineage[-1].name + + @property + def lowest_lineage_taxid(self): + """Return the taxid of the lowest filled lineage""" + if not self.filled_ranks: + return "" + return self.filled_lineage[-1].taxid + + def _init_empty(self): + 'initialize empty genome lineage' + new_lineage = [] + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + # set lineage and filled_ranks (because frozen, need to do it this way) + object.__setattr__(self, "lineage", new_lineage) + object.__setattr__(self, "filled_ranks", ()) + + def _init_from_lineage_tuples(self): + 'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary' + new_lineage = [] + # check this is a list or tuple of lineage tuples: + for lin_tup in self.lineage: + if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): + raise ValueError(f"{lin_tup} is not LineagePair or LineagePair") + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + # now add input tuples in correct spots. This corrects for order and allows empty values. + for lin_tup in self.lineage: + # find index for this rank + if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) + try: + rank_idx = self.rank_index(lin_tup.rank) + except ValueError as e: + raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e + # make sure we're adding LineagePairs, not lineagePairs for consistency + if not isinstance(lin_tup, LineagePair): + new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) + else: + new_lineage[rank_idx] = lin_tup + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + + def _init_from_lineage_dict(self): + 'initialize from lineage dict, e.g. from gather csv, allowing empty ranks and reordering if necessary' + if not isinstance(self.lineage_dict, (dict)): + raise ValueError(f"{self.lineage_dict} is not dictionary") + # first, initialize_empty + new_lineage = [] + # build empty lineage + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + # now add input information in correct spots. This corrects for order and allows empty values. + for rank, info in self.lineage_dict.items(): + try: + rank_idx = self.rank_index(rank) + except ValueError as e: + raise ValueError(f"Rank '{rank}' not present in {', '.join(self.ranks)}") from e + + name, taxid = None, None + if isinstance(info, dict): + if 'name' in info.keys(): + name = info['name'] + if 'taxid' in info.keys(): + taxid = info['taxid'] + elif isinstance(info, str): + name = info + new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + + def _init_from_lineage_str(self): + """ + Turn a ; or ,-separated set of lineages into a list of LineagePair objs. + """ + new_lineage = self.lineage_str.split(';') + if len(new_lineage) == 1: + new_lineage = self.lineage_str.split(',') + new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + + def zip_lineage(self, truncate_empty=False): + """ + Return lineage names as a list + """ + if truncate_empty: + zipped = [a.name for a in self.filled_lineage] + else: + zipped = [a.name for a in self.lineage] + # replace None with empty string ("") + if None in zipped: + zipped = ['' if x is None else x for x in zipped] + + return zipped + + def zip_taxid(self, truncate_empty=False): + """ + Return taxids as a list + """ + if truncate_empty: + zipped = [a.taxid for a in self.filled_lineage] + else: + zipped = [a.taxid for a in self.lineage] + # replace None with empty string (""); cast taxids to str + zipped = ['' if x is None else str(x) for x in zipped] + + return zipped + + def display_lineage(self, truncate_empty=True, null_as_unclassified=False): + "Return lineage names as ';'-separated list" + lin = ";".join(self.zip_lineage(truncate_empty=truncate_empty)) + if null_as_unclassified: + if lin == "": + return "unclassified" + else: + return lin + + def display_taxid(self, truncate_empty=True): + "Return lineage taxids as ';'-separated list" + return ";".join(self.zip_taxid(truncate_empty=truncate_empty)) + + def is_lineage_match(self, other, rank): + """ + check to see if two lineages are a match down to given rank. + """ + if not other.ranks == self.ranks: # check same ranks + raise ValueError("Cannot compare lineages from taxonomies with different ranks.") + if rank not in self.ranks: # rank is available + raise ValueError(f"Desired Rank {rank} not available for this lineage") + # always return false if rank is not filled in either of the two lineages + if rank in self.filled_ranks and rank in other.filled_ranks: + rank_idx = self.rank_index(rank) + a_lin = self.lineage[:rank_idx+1] + b_lin = other.lineage[:rank_idx+1] + if a_lin == b_lin: + return 1 + return 0 + + def pop_to_rank(self, rank): + "Return new LineageInfo with ranks only filled to desired rank" + if rank not in self.ranks: + raise ValueError(f"Desired Rank '{rank}' not available for this lineage") + # are we already above rank? + if rank not in self.filled_ranks: + return replace(self) + # if not, make filled_lineage at this rank + use to generate new LineageInfo + new_lineage = self.lineage_at_rank(rank) + new = replace(self, lineage = new_lineage) + # replace doesn't run the __post_init__ properly. reinitialize. + new._init_from_lineage_tuples() + return new + + def lineage_at_rank(self, rank): + # non-descructive pop_to_rank. Returns tuple of LineagePairs + "Returns tuple of LineagePairs at given rank." + if rank not in self.ranks: + raise ValueError(f"Desired Rank '{rank}' not available for this lineage") + # are we already above rank? + if rank not in self.filled_ranks: + return self.filled_lineage + # if not, return lineage tuples down to desired rank + rank_idx = self.rank_index(rank) + return self.filled_lineage[:rank_idx+1] + + +@dataclass(frozen=True, order=True) +class RankLineageInfo(BaseLineageInfo): + """Class for storing multi-rank lineage information""" + ranks: tuple = field(default_factory=lambda: ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')) + + def __post_init__(self): + "Initialize according to passed values" + # ranks must be tuple for hashability + if isinstance(self.ranks, list): + object.__setattr__(self, "ranks", tuple(self.ranks)) + if self.lineage: + self._init_from_lineage_tuples() + elif self.lineage_str is not None: + self._init_from_lineage_str() + elif self.lineage_dict is not None: + self._init_from_lineage_dict() + elif self.ranks: + self._init_empty() + + def __eq__(self, other): + if other == (): # sometimes we compare to null lineage.. this just helps with that + return False + return all([self.ranks == other.ranks, self.lineage==other.lineage]) + + +@dataclass(frozen=True, order=True) +class LINSLineageInfo(BaseLineageInfo): + """Class for storing multi-rank lineage information""" + num_positions: int = None + ## WHAT special considerations do we have here? + def __post_init__(self): + "Initialize according to passed values" + if self.lineage != [LineagePair()]: + self._init_from_lineage_tuples() + elif self.lineage_str is not None and self.ranks: + self._init_from_lineage_str() + elif self.ranks: + self._init_empty() + else: + raise ValueError("Cannot initialize LINSLineageInfo. Please provide lineage or rank info.") def get_ident(ident, *, @@ -760,7 +1044,7 @@ def load(cls, filename, *, delimiter=',', force=False, # read row into a lineage pair for rank in lca_utils.taxlist(include_strain=include_strain): lin = row[rank] - lineage.append(LineagePair(rank, lin)) + lineage.append(lca_utils.LineagePair(rank, lin)) ident = row[identifier] # fold, spindle, and mutilate ident? @@ -770,7 +1054,7 @@ def load(cls, filename, *, delimiter=',', force=False, # clean lineage of null names, replace with 'unassigned' lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] + lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ] # remove end nulls while lineage and lineage[-1].name == 'unassigned': @@ -924,7 +1208,7 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] + tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] return tuple(tup) def __getitem__(self, ident): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 7652970f60..a48692161d 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -22,7 +22,7 @@ # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import LineagePair +from sourmash.tax.tax_utils import LineagePair # utility functions for testing def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): @@ -379,7 +379,7 @@ def test_summarize_gather_at_0(): assert sk_sum[0].query_md5 == "queryA_md5" assert sk_sum[0].query_filename == "queryA.sig" assert sk_sum[0].rank == 'superkingdom' - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 1.0 assert sk_sum[0].f_weighted_at_rank == 1.0 assert sk_sum[0].bp_match_at_rank == 100 @@ -392,7 +392,7 @@ def test_summarize_gather_at_0(): assert phy_sum[0].query_md5 == "queryA_md5" assert phy_sum[0].query_filename == "queryA.sig" assert phy_sum[0].rank == 'phylum' - assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 1.0 assert phy_sum[0].f_weighted_at_rank == 1.0 assert phy_sum[0].bp_match_at_rank == 100 @@ -404,16 +404,16 @@ def test_summarize_gather_at_0(): assert cl_sum[0].query_md5 == "queryA_md5" assert cl_sum[0].query_filename == "queryA.sig" assert cl_sum[0].rank == 'class' - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 50 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')) + assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='d')) assert cl_sum[1].fraction == 0.5 assert cl_sum[1].f_weighted_at_rank == 0.5 assert cl_sum[1].bp_match_at_rank == 50 @@ -438,7 +438,7 @@ def test_summarize_gather_at_1(): # superkingdom assert len(sk_sum) == 2 print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 print("superkingdom summarized gather 1: ", sk_sum[1]) @@ -453,7 +453,7 @@ def test_summarize_gather_at_1(): print("phylum summarized gather 0: ", phy_sum[0]) print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].f_weighted_at_rank == 0.6 assert phy_sum[0].bp_match_at_rank == 70 @@ -466,18 +466,18 @@ def test_summarize_gather_at_1(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 60 assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')) + assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='d')) assert cl_sum[1].fraction == 0.1 assert cl_sum[1].f_weighted_at_rank == 0.1 assert cl_sum[1].bp_match_at_rank == 10 @@ -504,7 +504,7 @@ def test_summarize_gather_at_perfect_match(): # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 1.0 @@ -534,15 +534,15 @@ def test_summarize_gather_at_over100percent_f_unique_to_query(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='d')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[1].fraction == 0.5 assert cl_sum[1].bp_match_at_rank == 50 @@ -563,7 +563,7 @@ def test_summarize_gather_at_missing_ignore(): # superkingdom assert len(sk_sum) == 2 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.5 assert sk_sum[0].bp_match_at_rank == 50 assert sk_sum[1].lineage == () @@ -574,7 +574,7 @@ def test_summarize_gather_at_missing_ignore(): phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.5 assert phy_sum[0].bp_match_at_rank == 50 assert phy_sum[1].lineage == () @@ -584,9 +584,9 @@ def test_summarize_gather_at_missing_ignore(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].bp_match_at_rank == 50 assert cl_sum[1].lineage == () @@ -629,7 +629,7 @@ def test_summarize_gather_at_best_only_0(): # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 print("superk ANI:",sk_sum[0].query_ani_at_rank) @@ -639,7 +639,7 @@ def test_summarize_gather_at_best_only_0(): phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 - assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].bp_match_at_rank == 70 print("phy ANI:",phy_sum[0].query_ani_at_rank) @@ -648,9 +648,9 @@ def test_summarize_gather_at_best_only_0(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 print("cl ANI:",cl_sum[0].query_ani_at_rank) @@ -674,9 +674,9 @@ def test_summarize_gather_at_best_only_equal_choose_first(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b'), + lca_utils.LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].bp_match_at_rank == 50 @@ -687,14 +687,14 @@ def test_write_summary_csv(runtmp): sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} @@ -712,8 +712,8 @@ def test_write_summary_csv(runtmp): def test_write_classification(runtmp): """test classification csv write function""" classif = ClassificationResult('queryA', 'match', 'phylum', 1.0, - (LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), + (lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b')), 'queryA_md5', 'queryA.sig', 1.0, 100, query_ani_at_rank=None) @@ -771,7 +771,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): print("superkingdom summarized gather results:", sk_sum) assert len(sk_sum) ==4 assert sk_sum[0].query_name == "queryA" - assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.9 assert sk_sum[0].bp_match_at_rank == 160 # check for unassigned for queryA @@ -781,7 +781,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert round(sk_sum[1].fraction,1) == 0.1 # queryB assert sk_sum[2].query_name == "queryB" - assert sk_sum[2].lineage == (LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[2].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) assert sk_sum[2].fraction == 0.3 assert sk_sum[2].bp_match_at_rank == 60 # check for unassigned for queryA @@ -791,7 +791,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert sk_sum[3].bp_match_at_rank == 140 sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True) print("superkingdom lineage summary:", sk_lin_sum, '\n') - assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}, + assert sk_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}, (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} assert num_queries == 2 assert query_names == ['queryA', 'queryB'] @@ -800,8 +800,8 @@ def test_aggregate_by_lineage_at_rank_by_query(): print("phylum summary:", phy_sum, ']\n') phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') - assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): {'queryA': 0.5}, - (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}, + assert phy_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')): {'queryA': 0.5}, + (lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}, (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} assert num_queries == 2 assert query_names == ['queryA', 'queryB'] @@ -911,27 +911,27 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='c')), query_ani_at_rank=None, total_weighted_hashes=0)]} @@ -1006,27 +1006,27 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'),), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), + lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), + lca_utils.LineagePair(rank='phylum', name='c')), query_ani_at_rank=None, total_weighted_hashes=0)]} @@ -1231,3 +1231,13 @@ def test_lineage_db_sql_load(runtmp): # file does not exist with pytest.raises(ValueError): LineageDB_Sqlite.load(runtmp.output('no-such-file')) + + +def test_LineagePair(): + lin = LineagePair(rank="rank1", name='name1') + print(lin) + + +def test_LineagePair_1(): + lin1 = LineagePair(rank="rank1", name='name1', taxid=1) + print(lin1) From 5aa5714e4a0020178d79dbc0da1be0c5135897e9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 16:09:57 -0800 Subject: [PATCH 02/21] test new LineagePair,BaseLineageInfo,RankLineageInfo --- src/sourmash/tax/tax_utils.py | 36 ++-- tests/test_tax_utils.py | 382 +++++++++++++++++++++++++++++++++- 2 files changed, 396 insertions(+), 22 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 2b47e672de..ec731ede5e 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -43,7 +43,7 @@ class LineagePair(NamedTuple): rank: str - name: str = "" + name: str = None taxid: int = None @dataclass(frozen=True, order=True) @@ -129,24 +129,23 @@ def _init_from_lineage_tuples(self): 'initialize from tuple/list of LineagePairs, allowing empty ranks and reordering if necessary' new_lineage = [] # check this is a list or tuple of lineage tuples: - for lin_tup in self.lineage: - if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): - raise ValueError(f"{lin_tup} is not LineagePair or LineagePair") for rank in self.ranks: new_lineage.append(LineagePair(rank=rank)) - # now add input tuples in correct spots. This corrects for order and allows empty values. - for lin_tup in self.lineage: + for lin_tup in self.lineage: + # now add input tuples in correct spots. This corrects for order and allows empty values. + if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): + raise ValueError(f"{lin_tup} is not LineagePair.") # find index for this rank - if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) - try: - rank_idx = self.rank_index(lin_tup.rank) - except ValueError as e: - raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e - # make sure we're adding LineagePairs, not lineagePairs for consistency - if not isinstance(lin_tup, LineagePair): - new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) - else: - new_lineage[rank_idx] = lin_tup + if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) + try: + rank_idx = self.rank_index(lin_tup.rank) + except ValueError as e: + raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e + # make sure we're adding LineagePairs, not lineagePairs for consistency + if isinstance(lin_tup, lca_utils.LineagePair): + new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) + else: + new_lineage[rank_idx] = lin_tup # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] # set lineage and filled_ranks @@ -300,11 +299,6 @@ def __post_init__(self): elif self.ranks: self._init_empty() - def __eq__(self, other): - if other == (): # sometimes we compare to null lineage.. this just helps with that - return False - return all([self.ranks == other.ranks, self.lineage==other.lineage]) - @dataclass(frozen=True, order=True) class LINSLineageInfo(BaseLineageInfo): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a48692161d..65334e82aa 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -22,7 +22,7 @@ # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.tax.tax_utils import LineagePair +from sourmash.tax.tax_utils import LineagePair, BaseLineageInfo, RankLineageInfo # utility functions for testing def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): @@ -1241,3 +1241,383 @@ def test_LineagePair(): def test_LineagePair_1(): lin1 = LineagePair(rank="rank1", name='name1', taxid=1) print(lin1) + + +def test_BaseLineageInfo_init_lineage_str(): + x = "a;b;c" + ranks=["A", "B", "C"] + taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', 'c'] + + +def test_BaseLineageInfo_init_lineage_tups(): + ranks=["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', '', 'b'] + + +def test_BaseLineageInfo_init_lineage_dict(): + x = {'rank1': 'name1', 'rank2': 'name2'} + taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + + +def test_BaseLineageInfo_init_lineage_dict_withtaxid(): + x = {'rank1': {'name': 'name1', 'taxid': 1}, 'rank2': {'name':'name2', 'taxid': 2}} + taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + assert taxinf.zip_taxid()== ['1', '2'] + + +def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq(): + x = "a;b;c" + ranks=["A", "B", "C"] + rankD = {"A": "a", "B": "b", "C": "c"} + lin1 = BaseLineageInfo(lineage_str=x, ranks=ranks) + lin2 = BaseLineageInfo(lineage_dict=rankD, ranks=ranks) + assert lin1 == lin2 + + +def test_BaseLineageInfo_init_no_ranks(): + x = "a;b;c" + rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} + lin_tups = (LineagePair(rank="rank2", name='name1'), LineagePair(rank="rank1", name='name1')) + with pytest.raises(TypeError) as exc: + BaseLineageInfo(lineage_str=x) + print(exc) + assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) + with pytest.raises(TypeError) as exc: + BaseLineageInfo(lineage_dict=rankD) + print(exc) + assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) + with pytest.raises(TypeError) as exc: + BaseLineageInfo(lineage=lin_tups) + print(exc) + assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) + + +def test_BaseLineageInfo_init_with_wrong_ranks(): + ranks=["A", "B", "C"] + lin_tups = (LineagePair(rank="rank1", name='name1')) + linD = {"rank1": "a"} + with pytest.raises(ValueError) as exc: + BaseLineageInfo(lineage=lin_tups, ranks=ranks) + print(str(exc)) + assert "Rank 'rank1' not present in A, B, C" in str(exc) + with pytest.raises(ValueError) as exc: + BaseLineageInfo(lineage=linD, ranks=ranks) + print(str(exc)) + assert "Rank 'rank1' not present in A, B, C" in str(exc) + + +def test_BaseLineageInfo_init_not_lineagepair(): + ranks=["A", "B", "C"] + lin_tups = (("rank1", "name1"),) + with pytest.raises(ValueError) as exc: + BaseLineageInfo(lineage=lin_tups, ranks=ranks) + print(str(exc)) + assert "is not LineagePair" in str(exc) + + +def test_RankLineageInfo_taxlist(): + taxinf = RankLineageInfo() + taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + assert taxinf.taxlist == taxranks + assert taxinf.ascending_taxlist == taxranks[::-1] + + +def test_RankLineageInfo_init_lineage_str(): + x = "a;b;c" + taxinf = RankLineageInfo(lineage_str=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_tups(): + x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b')) + taxinf = RankLineageInfo(lineage=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_dict(): + x = {"superkingdom":'a',"phylum":'b'} + taxinf = RankLineageInfo(lineage_dict=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_dict_missing_rank(): + x = {'superkingdom': 'name1', 'class': 'name2'} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', ''] + assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2'] + + +def test_RankLineageInfo_init_lineage_dict_missing_rank_withtaxid(): + x = {'superkingdom': {'name': 'name1', 'taxid': 1}, 'class': {'name':'name2', 'taxid': 2}} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', '', 'name2', '', '', '', '', ''] + assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): + x = "a;b;c" + rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_dict=rankD) + print("lin1: ", lin1) + print("lin2: ", lin2) + assert lin1 == lin2 + +def test_RankLineageInfo_init_lineage_str_1_truncate(): + x = "a;b;c" + taxinf = RankLineageInfo(lineage_str=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', 'c'] + + +def test_RankLineageInfo_init_lineage_str_2(): + x = "a;b;;c" + taxinf = RankLineageInfo(lineage_str=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', '', 'c' '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_str_2_truncate(): + x = "a;b;;c" + taxinf = RankLineageInfo(lineage_str=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage(truncate_empty=True)== ['a', 'b', '', 'c'] + + +def test_RankLineageInfo_init_lineage_with_incorrect_rank(): + x = [ LineagePair('superkingdom', 'a'), LineagePair("NotARank", ''), LineagePair('class', 'c') ] + with pytest.raises(ValueError) as exc: + RankLineageInfo(lineage=x) + print(str(exc)) + assert f"Rank 'NotARank' not present in " in str(exc) + + +def test_zip_lineage_1(): + x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + taxinf = RankLineageInfo(lineage=x) + print("ranks: ", taxinf.ranks) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage() == ['a', 'b', '', '', '', '', '', ''] + + +def test_zip_lineage_2(): + x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + taxinf = RankLineageInfo(lineage=x) + print("ranks: ", taxinf.ranks) + print("zipped lineage: ", taxinf.zip_lineage(truncate_empty=True)) + assert taxinf.zip_lineage(truncate_empty=True) == ['a', 'b'] + + +def test_zip_lineage_3(): + x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + taxinf = RankLineageInfo(lineage=x) + assert taxinf.zip_lineage() == ['a', '', 'c', '', '', '', '', ''] + + +def test_zip_lineage_3_truncate(): + x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + taxinf = RankLineageInfo(lineage=x) + assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + + +def test_zip_lineage_4(): + x = [ LineagePair('superkingdom', 'a'), LineagePair('class', 'c') ] + taxinf = RankLineageInfo(lineage=x) + assert taxinf.zip_lineage(truncate_empty=True) == ['a', '', 'c'] + + +def test_display_lineage_1(): + x = [ LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b') ] + taxinf = RankLineageInfo(lineage=x) + assert taxinf.display_lineage() == "a;b" + + +def test_display_lineage_2(): + x = [ LineagePair('superkingdom', 'a'), LineagePair(None, ''), LineagePair('class', 'c') ] + taxinf = RankLineageInfo(lineage=x) + assert taxinf.display_lineage() == "a;;c" + + +def test_display_taxid_1(): + x = [ LineagePair('superkingdom', 'a', 1), LineagePair('phylum', 'b', 2) ] + taxinf = RankLineageInfo(lineage=x) + print(taxinf) + assert taxinf.display_taxid() == "1;2" + +def test_display_taxid_2(): + x = [ LineagePair('superkingdom', 'name1', 1), LineagePair(None, ''), LineagePair ('class', 'name2',2) ] + taxinf = RankLineageInfo(lineage=x) + print(taxinf) + assert taxinf.display_taxid() == "1;;2" + + +def test_is_lineage_match_1(): + # basic behavior: match at order and above, but not at family or below. + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e') + lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + print(lin1.lineage) + assert lin1.is_lineage_match(lin2, 'superkingdom') + assert lin2.is_lineage_match(lin1, 'superkingdom') + assert lin1.is_lineage_match(lin2, 'phylum') + assert lin2.is_lineage_match(lin1, 'phylum') + assert lin1.is_lineage_match(lin2, 'class') + assert lin2.is_lineage_match(lin1, 'class') + assert lin1.is_lineage_match(lin2, 'order') + assert lin2.is_lineage_match(lin1, 'order') + + assert not lin1.is_lineage_match(lin2, 'family') + assert not lin2.is_lineage_match(lin1, 'family') + assert not lin1.is_lineage_match(lin2, 'genus') + assert not lin2.is_lineage_match(lin1, 'genus') + assert not lin1.is_lineage_match(lin2, 'species') + assert not lin2.is_lineage_match(lin1, 'species') + + +def test_is_lineage_match_2(): + # match at family, and above, levels; no genus or species to match + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + assert lin1.is_lineage_match(lin2, 'superkingdom') + assert lin2.is_lineage_match(lin1, 'superkingdom') + assert lin1.is_lineage_match(lin2, 'phylum') + assert lin2.is_lineage_match(lin1, 'phylum') + assert lin1.is_lineage_match(lin2, 'class') + assert lin2.is_lineage_match(lin1, 'class') + assert lin1.is_lineage_match(lin2, 'order') + assert lin2.is_lineage_match(lin1, 'order') + assert lin1.is_lineage_match(lin2, 'family') + assert lin2.is_lineage_match(lin1, 'family') + + assert not lin1.is_lineage_match(lin2, 'genus') + assert not lin2.is_lineage_match(lin1, 'genus') + assert not lin1.is_lineage_match(lin2, 'species') + assert not lin2.is_lineage_match(lin1, 'species') + + +def test_is_lineage_match_3(): + # one lineage is empty + lin1 = RankLineageInfo() + lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + + assert not lin1.is_lineage_match(lin2, 'superkingdom') + assert not lin2.is_lineage_match(lin1, 'superkingdom') + assert not lin1.is_lineage_match(lin2, 'phylum') + assert not lin2.is_lineage_match(lin1, 'phylum') + assert not lin1.is_lineage_match(lin2, 'class') + assert not lin2.is_lineage_match(lin1, 'class') + assert not lin1.is_lineage_match(lin2, 'order') + assert not lin2.is_lineage_match(lin1, 'order') + assert not lin1.is_lineage_match(lin2, 'family') + assert not lin2.is_lineage_match(lin1, 'family') + assert not lin1.is_lineage_match(lin2, 'genus') + assert not lin2.is_lineage_match(lin1, 'genus') + assert not lin1.is_lineage_match(lin2, 'species') + assert not lin2.is_lineage_match(lin1, 'species') + + +def test_is_lineage_match_incorrect_ranks(): + #test comparison with incompatible ranks + taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e', ranks=taxranks[::-1]) + lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + print(lin1.lineage) + with pytest.raises(ValueError) as exc: + lin1.is_lineage_match(lin2, 'superkingdom') + print(str(exc)) + assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc) + + +def test_pop_to_rank_1(): + # basic behavior - pop to order? + lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d') + lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f') + + print(lin1) + popped = lin2.pop_to_rank('order') + print(popped) + assert popped == lin1 + + +def test_pop_to_rank_2(): + # what if we're already above rank? + lin2 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d;f__f') + print(lin2.pop_to_rank('species')) + assert lin2.pop_to_rank('species') == lin2 + + +def test_pop_to_rank_rank_not_avail(): + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + with pytest.raises(ValueError) as exc: + lin1.pop_to_rank("NotARank") + print(str(exc)) + assert "Desired Rank 'NotARank' not available for this lineage" in str(exc) + + +def test_lineage_at_rank_norank(): + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + with pytest.raises(TypeError) as exc: + lin1.lineage_at_rank() + print(str(exc)) + assert "lineage_at_rank() missing 1 required positional argument: 'rank'" in str(exc) + + +def test_lineage_at_rank_rank_not_avail(): + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + with pytest.raises(ValueError) as exc: + lin1.lineage_at_rank("NotARank") + print(str(exc)) + assert "Desired Rank 'NotARank' not available for this lineage" in str(exc) + + +def test_lineage_at_rank_1(): + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + print(lin1.lineage_at_rank('superkingdom')) + + assert lin1.lineage_at_rank('superkingdom') == (LineagePair(rank='superkingdom', name='d__a', taxid=None),) + print(lin1.lineage_at_rank('class')) + assert lin1.lineage_at_rank('class') == (LineagePair(rank='superkingdom', name='d__a', taxid=None), + LineagePair(rank='phylum', name='p__b', taxid=None), + LineagePair(rank='class', name='c__c', taxid=None)) + + +def test_lineage_at_rank_below_rank(): + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + print(lin1.lineage_at_rank('superkingdom')) + # if rank is not provided, we only return the filled lineage, to follow original pop_to_rank behavior. + + print(lin1.lineage_at_rank('genus')) + assert lin1.lineage_at_rank('genus') == (LineagePair(rank='superkingdom', name='d__a', taxid=None), + LineagePair(rank='phylum', name='p__b', taxid=None), + LineagePair(rank='class', name='c__c', taxid=None), + LineagePair(rank='order', name='o__d', taxid=None), + LineagePair(rank='family', name='f__f', taxid=None)) From ea7a16392810f92029d6afb85d0c2442f18f0e3c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 17:00:01 -0800 Subject: [PATCH 03/21] fix --- tests/test_tax_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 65334e82aa..d2aceb224c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1309,14 +1309,14 @@ def test_BaseLineageInfo_init_no_ranks(): def test_BaseLineageInfo_init_with_wrong_ranks(): ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="rank1", name='name1')) + lin_tups = [LineagePair(rank="rank1", name='name1')] linD = {"rank1": "a"} with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) assert "Rank 'rank1' not present in A, B, C" in str(exc) with pytest.raises(ValueError) as exc: - BaseLineageInfo(lineage=linD, ranks=ranks) + BaseLineageInfo(lineage_dict=linD, ranks=ranks) print(str(exc)) assert "Rank 'rank1' not present in A, B, C" in str(exc) From 23e2480876800e471bbdfb92cadf42793e959c8c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 17:00:23 -0800 Subject: [PATCH 04/21] add v4.4 columns that will be required --- tests/test-data/tax/test1.gather.csv | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv index 80388e8d9d..a349fd2774 100644 --- a/tests/test-data/tax/test1.gather.csv +++ b/tests/test-data/tax/test1.gather.csv @@ -1,5 +1,5 @@ -intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename -442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig -390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig -138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig -338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,1024000,31,1000,2507 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,1024000,31,1000,2507 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,1024000,31,1000,2507 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,1024000,31,1000,2507 From f458dc1dc90739c6019eb4ddd57ff5aca8ec1a12 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 17:54:35 -0800 Subject: [PATCH 05/21] make the newly added query_bp info in test1.gather.csv work with existing tests --- tests/test-data/tax/test1.gather.csv | 8 ++++---- tests/test-data/tax/test1.gather_ani.csv | 10 +++++----- tests/test_tax.py | 7 +++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv index a349fd2774..8d00401dbb 100644 --- a/tests/test-data/tax/test1.gather.csv +++ b/tests/test-data/tax/test1.gather.csv @@ -1,5 +1,5 @@ intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,query_bp,ksize,scaled,query_n_hashes -442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,1024000,31,1000,2507 -390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,1024000,31,1000,2507 -138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,1024000,31,1000,2507 -338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,1024000,31,1000,2507 +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,5014000,31,1000,2507 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,50140000,31,1000,2507 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,5014000,31,1000,2507 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,5014000,31,1000,2507 diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv index 48a09eb199..80388e8d9d 100644 --- a/tests/test-data/tax/test1.gather_ani.csv +++ b/tests/test-data/tax/test1.gather_ani.csv @@ -1,5 +1,5 @@ -intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes -442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970 -390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970 -138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970 -338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970 +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig diff --git a/tests/test_tax.py b/tests/test_tax.py index b1b3c1b8f8..f44bbfe2a3 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1023,7 +1023,7 @@ def test_genome_rank_human_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') csv_base = "out" csvout = runtmp.output(csv_base + '.human.txt') @@ -1878,7 +1878,7 @@ def test_genome_ani_threshold_input_errors(runtmp): def test_genome_ani_threshold(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, @@ -1889,7 +1889,6 @@ def test_genome_ani_threshold(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "WARNING: Please run gather with sourmash >= 4.4 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out @@ -1917,7 +1916,7 @@ def test_genome_ani_threshold(runtmp): def test_genome_ani_oldgather(runtmp): # Ignore ANI if we don't have the information we need to estimate it c = runtmp - g_csv = utils.get_test_data('tax/test1.gather.csv') + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) From 592993d2c19ec444d88fe733dab176b726cca274 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 5 Jan 2023 17:58:33 -0800 Subject: [PATCH 06/21] rename test1.gather_ani.csv to test1.gather_old.csv to reflect its new usage --- .../tax/{test1.gather_ani.csv => test1.gather_old.csv} | 0 tests/test_tax.py | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename tests/test-data/tax/{test1.gather_ani.csv => test1.gather_old.csv} (100%) diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_old.csv similarity index 100% rename from tests/test-data/tax/test1.gather_ani.csv rename to tests/test-data/tax/test1.gather_old.csv diff --git a/tests/test_tax.py b/tests/test_tax.py index f44bbfe2a3..ba88ecfe00 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1023,7 +1023,7 @@ def test_genome_rank_human_output(runtmp): # test basic genome - output csv c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + g_csv = utils.get_test_data('tax/test1.gather_old.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') csv_base = "out" csvout = runtmp.output(csv_base + '.human.txt') @@ -1841,7 +1841,7 @@ def test_genome_over100percent_error(runtmp): def test_genome_ani_threshold_input_errors(runtmp): c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + g_csv = utils.get_test_data('tax/test1.gather_old.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') below_threshold = "-1" @@ -1916,7 +1916,7 @@ def test_genome_ani_threshold(runtmp): def test_genome_ani_oldgather(runtmp): # Ignore ANI if we don't have the information we need to estimate it c = runtmp - g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + g_csv = utils.get_test_data('tax/test1.gather_old.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) From 06fb8489cc1a4ba53ae9b2574300ff9276e38560 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 07:49:09 -0800 Subject: [PATCH 07/21] moar lineage tests --- src/sourmash/tax/tax_utils.py | 2 ++ tests/test_tax_utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index ec731ede5e..02a55f3f8b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -85,6 +85,8 @@ def ascending_taxlist(self): @property def lowest_rank(self): + if not self.filled_ranks: + return None return self.filled_ranks[-1] def rank_index(self, rank): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d2aceb224c..beef8f214f 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1243,6 +1243,19 @@ def test_LineagePair_1(): print(lin1) +def test_BaseLineageInfo_init_empty(): + ranks=["A", "B", "C"] + taxinf = BaseLineageInfo(ranks=ranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['', '', ''] # this is a bit odd, but it's what preserves empty ranks... + print(taxinf.filled_lineage) + assert taxinf.filled_lineage == () + assert taxinf.lowest_lineage_name == "" + assert taxinf.filled_ranks == () + assert taxinf.lowest_rank == None + + def test_BaseLineageInfo_init_lineage_str(): x = "a;b;c" ranks=["A", "B", "C"] @@ -1250,6 +1263,11 @@ def test_BaseLineageInfo_init_lineage_str(): print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.zip_lineage()== ['a', 'b', 'c'] + print(taxinf.filled_lineage) + assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None), + LineagePair(rank='B', name='b', taxid=None), + LineagePair(rank='C', name='c', taxid=None)) + assert taxinf.lowest_lineage_name == "C" def test_BaseLineageInfo_init_lineage_tups(): @@ -1278,6 +1296,8 @@ def test_BaseLineageInfo_init_lineage_dict_withtaxid(): print("zipped lineage: ", taxinf.zip_lineage()) assert taxinf.zip_lineage()== ['name1', 'name2'] assert taxinf.zip_taxid()== ['1', '2'] + assert taxinf.lowest_lineage_taxid == "2" + assert taxinf.lowest_lineage_name == "name2" def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq(): From ed055b6d348ae5077a6a010be89162698e159ab8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 09:21:49 -0800 Subject: [PATCH 08/21] test remaining codecov misses --- src/sourmash/tax/tax_utils.py | 51 +++++++++++++----------------- tests/test_tax_utils.py | 59 ++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 34 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 02a55f3f8b..835da2cf3b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -105,9 +105,9 @@ def filled_lineage(self): def lowest_lineage_name(self, null_as_unclassified = False): """Return the name of the lowest filled lineage""" if not self.filled_ranks: - #if null_as_unclassified: # todo: enable me - # return "unclassified" - #else: + if null_as_unclassified: # todo: enable me + return "unclassified" + else: return "" return self.filled_lineage[-1].name @@ -143,7 +143,7 @@ def _init_from_lineage_tuples(self): rank_idx = self.rank_index(lin_tup.rank) except ValueError as e: raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e - # make sure we're adding LineagePairs, not lineagePairs for consistency + # make sure we're adding tax_utils.LineagePairs, not lca_utils.LineagePairs for consistency if isinstance(lin_tup, lca_utils.LineagePair): new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) else: @@ -238,16 +238,28 @@ def display_taxid(self, truncate_empty=True): "Return lineage taxids as ';'-separated list" return ";".join(self.zip_taxid(truncate_empty=truncate_empty)) + def check_rank_availability(self, rank): + if rank in self.ranks: # rank is available + return True + raise ValueError(f"Desired Rank '{rank}' not available for this lineage.") + + def rank_is_filled(self, rank, other=None): + self.check_rank_availability(rank) + if other is not None: + if rank in self.filled_ranks and rank in other.filled_ranks: + return True + elif rank in self.filled_ranks: + return True + return False + def is_lineage_match(self, other, rank): """ check to see if two lineages are a match down to given rank. """ if not other.ranks == self.ranks: # check same ranks raise ValueError("Cannot compare lineages from taxonomies with different ranks.") - if rank not in self.ranks: # rank is available - raise ValueError(f"Desired Rank {rank} not available for this lineage") # always return false if rank is not filled in either of the two lineages - if rank in self.filled_ranks and rank in other.filled_ranks: + if self.rank_is_filled(rank, other=other): rank_idx = self.rank_index(rank) a_lin = self.lineage[:rank_idx+1] b_lin = other.lineage[:rank_idx+1] @@ -257,10 +269,8 @@ def is_lineage_match(self, other, rank): def pop_to_rank(self, rank): "Return new LineageInfo with ranks only filled to desired rank" - if rank not in self.ranks: - raise ValueError(f"Desired Rank '{rank}' not available for this lineage") # are we already above rank? - if rank not in self.filled_ranks: + if not self.rank_is_filled(rank): return replace(self) # if not, make filled_lineage at this rank + use to generate new LineageInfo new_lineage = self.lineage_at_rank(rank) @@ -272,10 +282,8 @@ def pop_to_rank(self, rank): def lineage_at_rank(self, rank): # non-descructive pop_to_rank. Returns tuple of LineagePairs "Returns tuple of LineagePairs at given rank." - if rank not in self.ranks: - raise ValueError(f"Desired Rank '{rank}' not available for this lineage") # are we already above rank? - if rank not in self.filled_ranks: + if not self.rank_is_filled(rank): return self.filled_lineage # if not, return lineage tuples down to desired rank rank_idx = self.rank_index(rank) @@ -302,23 +310,6 @@ def __post_init__(self): self._init_empty() -@dataclass(frozen=True, order=True) -class LINSLineageInfo(BaseLineageInfo): - """Class for storing multi-rank lineage information""" - num_positions: int = None - ## WHAT special considerations do we have here? - def __post_init__(self): - "Initialize according to passed values" - if self.lineage != [LineagePair()]: - self._init_from_lineage_tuples() - elif self.lineage_str is not None and self.ranks: - self._init_from_lineage_str() - elif self.ranks: - self._init_empty() - else: - raise ValueError("Cannot initialize LINSLineageInfo. Please provide lineage or rank info.") - - def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): # split identifiers = split on whitespace diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index beef8f214f..8580fb8ee9 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1252,8 +1252,11 @@ def test_BaseLineageInfo_init_empty(): print(taxinf.filled_lineage) assert taxinf.filled_lineage == () assert taxinf.lowest_lineage_name == "" + assert taxinf.lowest_lineage_taxid == "" assert taxinf.filled_ranks == () - assert taxinf.lowest_rank == None + assert taxinf.lowest_rank == None + assert taxinf.display_lineage() == "" + assert taxinf.display_lineage(null_as_unclassified=True) == "unclassified" def test_BaseLineageInfo_init_lineage_str(): @@ -1267,7 +1270,17 @@ def test_BaseLineageInfo_init_lineage_str(): assert taxinf.filled_lineage == (LineagePair(rank='A', name='a', taxid=None), LineagePair(rank='B', name='b', taxid=None), LineagePair(rank='C', name='c', taxid=None)) - assert taxinf.lowest_lineage_name == "C" + assert taxinf.lowest_lineage_name == "c" + +def test_BaseLineageInfo_init_lineage_str_comma_sep(): + x = "a,b,c" + ranks=["A", "B", "C"] + taxinf = BaseLineageInfo(lineage_str=x, ranks=ranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', 'c'] + print(taxinf.filled_lineage) + assert taxinf.lowest_lineage_name == "c" def test_BaseLineageInfo_init_lineage_tups(): @@ -1279,7 +1292,26 @@ def test_BaseLineageInfo_init_lineage_tups(): assert taxinf.zip_lineage()== ['a', '', 'b'] -def test_BaseLineageInfo_init_lineage_dict(): +def test_BaseLineageInfo_init_lca_lineage_tups(): + ranks=["A", "B", "C"] + lin_tups = (lca_utils.LineagePair(rank="A", name='a'), lca_utils.LineagePair(rank="C", name='b')) + taxinf = BaseLineageInfo(lineage=lin_tups, ranks=ranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', '', 'b'] + + +def test_BaseLineageInfo_init_lineage_dict_fail(): + ranks=["A", "B", "C"] + lin_tups = (lca_utils.LineagePair(rank="A", name='a'), lca_utils.LineagePair(rank="C", name='b')) + with pytest.raises(ValueError) as exc: + taxinf = BaseLineageInfo(ranks=ranks, lineage_dict=lin_tups) + print(str(exc)) + + assert "is not dictionary" in str(exc) + + +def test_BaseLineageInfo_init_lineage_dict (): x = {'rank1': 'name1', 'rank2': 'name2'} taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) print("ranks: ", taxinf.ranks) @@ -1296,7 +1328,7 @@ def test_BaseLineageInfo_init_lineage_dict_withtaxid(): print("zipped lineage: ", taxinf.zip_lineage()) assert taxinf.zip_lineage()== ['name1', 'name2'] assert taxinf.zip_taxid()== ['1', '2'] - assert taxinf.lowest_lineage_taxid == "2" + assert taxinf.lowest_lineage_taxid == 2 assert taxinf.lowest_lineage_name == "name2" @@ -1364,6 +1396,14 @@ def test_RankLineageInfo_init_lineage_str(): print(taxinf.lineage_str) assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): + x = "a;b;c" + taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] + taxinf = RankLineageInfo(lineage_str=x, ranks=taxranks) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', ''] + def test_RankLineageInfo_init_lineage_tups(): x = (LineagePair(rank="superkingdom", name='a'), LineagePair(rank="phylum", name='b')) @@ -1577,6 +1617,17 @@ def test_is_lineage_match_incorrect_ranks(): assert 'Cannot compare lineages from taxonomies with different ranks.' in str(exc) +def test_is_lineage_match_improper_rank(): + #test comparison with incompatible ranks + lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e') + lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + print(lin1.lineage) + with pytest.raises(ValueError) as exc: + lin1.is_lineage_match(lin2, 'NotARank') + print(str(exc)) + assert "Desired Rank 'NotARank' not available for this lineage" in str(exc) + + def test_pop_to_rank_1(): # basic behavior - pop to order? lin1 = RankLineageInfo(lineage_str='d__a;p__b;c__c;o__d') From c15ff78bc82a56f97388e0c48b32ad45d6fa175c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 09:58:19 -0800 Subject: [PATCH 09/21] whoops, one last codecov miss --- src/sourmash/tax/tax_utils.py | 7 ++----- tests/test_tax_utils.py | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 835da2cf3b..875d8e40d9 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -102,13 +102,10 @@ def filled_lineage(self): return self.lineage[:lowest_filled_rank_idx+1] @property - def lowest_lineage_name(self, null_as_unclassified = False): + def lowest_lineage_name(self): """Return the name of the lowest filled lineage""" if not self.filled_ranks: - if null_as_unclassified: # todo: enable me - return "unclassified" - else: - return "" + return "" return self.filled_lineage[-1].name @property diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 8580fb8ee9..d61d7596a1 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1271,6 +1271,7 @@ def test_BaseLineageInfo_init_lineage_str(): LineagePair(rank='B', name='b', taxid=None), LineagePair(rank='C', name='c', taxid=None)) assert taxinf.lowest_lineage_name == "c" + assert taxinf.lowest_rank == "C" def test_BaseLineageInfo_init_lineage_str_comma_sep(): x = "a,b,c" From 766a8b9ddfcd26405aed3e41634c1a5b353325ee Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 10:31:18 -0800 Subject: [PATCH 10/21] add tax summarization classes; rename old NamedTuples to avoid breakage --- src/sourmash/tax/tax_utils.py | 513 +++++++++++++++++++++++++++++++++- tests/test_tax_utils.py | 128 ++++++++- 2 files changed, 613 insertions(+), 28 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 875d8e40d9..db1bb5b593 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -7,7 +7,7 @@ from collections import abc from itertools import zip_longest from typing import NamedTuple -from dataclasses import dataclass, field, replace +from dataclasses import dataclass, field, replace, asdict import gzip from sourmash import sqlite_utils, sourmash_args @@ -24,15 +24,15 @@ 'aggregate_by_lineage_at_rank', 'format_for_krona', 'write_krona', 'write_summary', 'write_classifications', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'MultiLineageDB'] + 'MultiLineageDB', 'RankLineageInfo'] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file # CTB: these could probably usefully be converted into dataclasses. -QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes, total_weighted_hashes") -SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank, total_weighted_hashes") -ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") +QInfo = namedtuple("QInfo", "query_md5, query_filename, query_bp, query_hashes, total_weighted_hashes") +SumGathInf = namedtuple("SumGathInf", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank, total_weighted_hashes") +ClassInf = namedtuple("ClassInf", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") # Essential Gather column names that must be in gather_csv to allow `tax` summarization EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') @@ -494,7 +494,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], query_bp = unique_intersect_bp + int(row['remaining_bp']) # store query info - query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes=query_nhashes, total_weighted_hashes=total_weighted_hashes) + query_info[query_name] = QInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes=query_nhashes, total_weighted_hashes=total_weighted_hashes) if estimate_query_ani and (not ksize or not scaled): if not set_ksize: @@ -528,7 +528,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_weighted[query_name][lineage] += f_uniq_weighted sum_uniq_bp[query_name][lineage] += unique_intersect_bp - # sort and store each as SummarizedGatherResult + # sort and store each as SumGathInf sum_uniq_to_query_sorted = [] for query_name, lineage_weights in sum_uniq_to_query.items(): qInfo = query_info[query_name] @@ -546,7 +546,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], if estimate_query_ani: query_ani = containment_to_distance(fraction, ksize, scaled, n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp).ani - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, + sres = SumGathInf(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled) sum_uniq_to_query_sorted.append(sres) else: @@ -567,7 +567,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], if estimate_query_ani: query_ani = containment_to_distance(fraction, ksize, scaled, n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp).ani - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + sres = SumGathInf(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes * scaled) sum_uniq_to_query_sorted.append(sres) @@ -578,7 +578,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], if fraction > 0: f_weighted_at_rank = 1.0 - total_f_weighted bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, + sres = SumGathInf(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani, qInfo.total_weighted_hashes*scaled) sum_uniq_to_query_sorted.append(sres) @@ -616,7 +616,7 @@ def make_krona_header(min_rank, *, include_strain=False): def aggregate_by_lineage_at_rank(rank_results, *, by_query=False): ''' - Aggregate list of rank SummarizedGatherResults, + Aggregate list of rank SumGathInfs, keeping query info or aggregating across queries. ''' lineage_summary = defaultdict(float) @@ -635,7 +635,7 @@ def aggregate_by_lineage_at_rank(rank_results, *, by_query=False): def format_for_krona(rank, summarized_gather): ''' - Aggregate list of SummarizedGatherResults and format for krona output + Aggregate list of SumGathInfs and format for krona output ''' num_queries=0 for res_rank, rank_results in summarized_gather.items(): @@ -685,7 +685,7 @@ def write_summary(summarized_gather, csv_fp, *, sep=',', limit_float_decimals=Fa ''' Write taxonomy-summarized gather results for each rank. ''' - header = SummarizedGatherResult._fields + header = SumGathInf._fields w = csv.DictWriter(csv_fp, header, delimiter=sep) w.writeheader() for rank, rank_results in summarized_gather.items(): @@ -840,7 +840,7 @@ def write_classifications(classifications, csv_fp, *, sep=',', limit_float_decim ''' Write taxonomy-classifed gather results. ''' - header = ClassificationResult._fields + header = ClassInf._fields w = csv.DictWriter(csv_fp, header, delimiter=sep) w.writeheader() for rank, rank_results in classifications.items(): @@ -1448,3 +1448,488 @@ def load(cls, locations, **kwargs): tax_assign.add(this_tax_assign) return tax_assign + + +# strategy from: https://subscription.packtpub.com/book/programming/9781800207455/10/ch10lvl1sec01/using-dataclasses-to-simplify-working-with-csv-files +@dataclass +class GatherRow(): # all cols should match "gather_write_cols" in `search.py` + # essential columns + query_name: str + name: str # match_name + f_unique_weighted: float + f_unique_to_query: float + unique_intersect_bp: int + remaining_bp: int + query_md5: str + query_filename: str + # new essential cols: requires 4.4x + query_bp: int + ksize: int + scaled: int + + # non-essential + intersect_bp: int = None + f_orig_query: float = None + f_match: float = None + average_abund: float = None + median_abund: float = None + std_abund: float = None + filename: str = None + md5: str = None + f_match_orig: float = None + gather_result_rank: str = None + moltype: str = None + query_n_hashes: int = None + query_abundance: int = None + query_containment_ani: float = None + match_containment_ani: float = None + average_containment_ani: float = None + max_containment_ani: float = None + potential_false_negative: bool = None + n_unique_weighted_found: int = None + sum_weighted_found: int = None + total_weighted_hashes: int = None + + +@dataclass(frozen=True) +class QueryInfo(): + """Class for storing per-rank lineage information""" + query_name: str + query_md5: str + query_filename: str + query_bp: int + query_n_hashes: int + ksize: int + scaled: int + total_weighted_hashes: int = 0 + + @property + def total_weighted_bp(self): + return self.total_weighted_hashes * self.scaled + +@dataclass +class TaxResult(): + raw: GatherRow + # can we get rid of these / just choose default ident hacking/slashing for future? + keep_full_identifiers: bool = False + keep_identifier_versions: bool = False + + query_name: str = field(init=False) + query_info: QueryInfo = field(init=False) + match_ident: str = field(init=False) + lineageInfo: RankLineageInfo = RankLineageInfo() + skipped_ident: bool = False + missed_ident: bool = False + match_lineage_attempted: bool = False + + def __post_init__(self): + self.get_ident() + self.query_name = self.raw.query_name + if self.raw.query_n_hashes: + self.raw.query_n_hashes = int(self.raw.query_n_hashes) + if self.raw.total_weighted_hashes: + self.raw.total_weighted_hashes = int(self.raw.total_weighted_hashes) + else: + self.raw.total_weighted_hashes = 0 + + self.query_info = QueryInfo(query_name = self.raw.query_name, + query_md5=self.raw.query_md5, + query_filename = self.raw.query_filename, + query_bp = int(self.raw.query_bp), + query_n_hashes = self.raw.query_n_hashes, + total_weighted_hashes = self.raw.total_weighted_hashes, + ksize = int(self.raw.ksize), + scaled = int(self.raw.scaled) + ) + # cast and store the imp bits + self.f_unique_to_query = float(self.raw.f_unique_to_query) + self.f_unique_weighted = float(self.raw.f_unique_weighted) + self.unique_intersect_bp = int(self.raw.unique_intersect_bp) + + def get_ident(self): + # split identifiers = split on whitespace + # keep identifiers = don't split .[12] from assembly accessions + "Hack and slash identifiers." + self.match_ident = self.raw.name + if not self.keep_full_identifiers: + self.match_ident = self.raw.name.split(' ')[0] + else: + #overrides version bc can't keep full without keeping version + self.keep_identifier_versions = True + if not self.keep_identifier_versions: + self.match_ident = self.match_ident.split('.')[0] + + + def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): + if skip_idents and self.match_ident in skip_idents: + self.skipped_ident = True + else: + lin = tax_assignments.get(self.match_ident) + if lin: + self.lineageInfo = RankLineageInfo(lineage=lin) + else: + self.missed_ident=True + self.match_lineage_attempted = True + if self.missed_ident and fail_on_missing_taxonomy: + raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") +# raise ValueError('Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + +@dataclass +class SummarizedGatherResult(): +# """Class for storing summarized lineage information""" + rank: str + fraction: float + lineage: RankLineageInfo + f_weighted_at_rank: float + bp_match_at_rank: int + query_ani_at_rank: float + + def as_summary_dict(self, query_info=None, limit_float=False): + sD = asdict(self) + if self.lineage == (): # to do -- get rid of me by using null RankLineageInfo() instead of () for empties + sD['lineage'] = 'unclassified' + else: + sD['lineage'] = self.lineage.display_lineage() + if query_info is not None: + sD['query_name'] = query_info.query_name + sD['query_md5'] = query_info.query_md5 + sD['query_filename'] = query_info.query_filename + sD['total_weighted_hashes'] = query_info.total_weighted_hashes + if limit_float: + sD['fraction'] = f'{self.fraction:.3f}' + sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}' + + return(sD) + + def as_human_friendly_dict(self, query_info = None): + sD = self.as_summary_dict(query_info=query_info, limit_float=True) + sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%" + if sD['query_ani_at_rank'] is not None: + sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%" + else: + sD['query_ani_at_rank'] = '- ' + return sD + + def as_kreport_dict(self, rankCode, total_weighted_hashes): + sD = asdict(self) + this_rank = self.lineage.lowest_rank + this_sciname = self.lineage.lowest_lineage_name + sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid + sD['sci_name'] = this_sciname + #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) + sD['rank_code'] = rankCode[this_rank] + sD['num_bp_assigned'] = 0 + # total percent containment, weighted to include abundance info + sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' + sD["num_bp_contained"] = int(self.f_weighted_at_rank * total_weighted_hashes) + # the number of bp actually 'assigned' at this rank. Sourmash assigns everything + # at genome level, but since kreport traditionally doesn't include 'strain' or genome, + # it is reasonable to state that sourmash assigns at 'species' level for this. + # can be modified later. + lowest_assignment_rank = 'species' + if this_rank == lowest_assignment_rank or this_sciname == "unclassified": + sD["num_bp_assigned"] = sD["num_bp_contained"] + +@dataclass +class ClassificationResult(SummarizedGatherResult): +# """Class for storing summarized lineage information""" + status: str + + +@dataclass +class QueryTaxResult(): + """Store all TaxResults for a query. Enable summarization.""" + query_info: QueryInfo # initialize with QueryInfo dataclass + + def __post_init__(self): + self.query_name = self.query_info.query_name # for convenience + self._init_taxresult_vars() + self._init_summarization_vars() + + def _init_taxresult_vars(self): + self.ranks = [] + self.raw_taxresults = [] + self.skipped_idents= set() + self.missed_idents = set() + self.n_missed = 0 + self.n_skipped = 0 + self.perfect_match = set() + + def _init_summarization_vars(self): + self.sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) + self.sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) + self.sum_uniq_bp = defaultdict(lambda: defaultdict(int)) + self.summarized_ranks = [] + self._init_summarization_results() + + def _init_summarization_results(self): + self.total_f_weighted = defaultdict(float) #0.0 + self.total_f_classified = defaultdict(float)#0.0 + self.total_bp_classified = defaultdict(int) #0 + self.summarized_lineage_results = defaultdict(list) + + def _init_classification_results(self): + self.status = 'nomatch' + self.classified_ranks = [] + self.classification_result = None + self.krona_classification_result = None + self.krona_unclassified_result = None + self.krona_classification_header = [] + + def is_compatible(self, taxresult): + return taxresult.query_info == self.query_info + + @property + def ascending_ranks(self): + if not self.ranks: + return [] + else: + return self.ranks[::-1] + + def add_taxresult(self, taxresult): + # check that all query parameters match + if self.is_compatible(taxresult=taxresult): + if not taxresult.match_lineage_attempted: + raise ValueError("Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first.") + if not self.ranks: + self.ranks = taxresult.lineageInfo.ranks + if taxresult.skipped_ident: + self.n_skipped +=1 + self.skipped_idents.add(taxresult.match_ident) + elif taxresult.missed_ident: + self.n_missed +=1 + self.missed_idents.add(taxresult.match_ident) + self.raw_taxresults.append(taxresult) + else: + raise ValueError("Error: Cannot add TaxResult: query information does not match.") + + def summarize_up_ranks(self, single_rank=None, force_resummarize=False): + if self.summarized_ranks: # has already been summarized + if force_resummarize: + self._init_summarization_vars() + else: + raise ValueError("Error: already summarized using rank(s): '{', '.join(self.summarized_ranks)}'. Use 'force_resummarize=True' to reset and resummarize") + # set ranks levels to summarize + self.summarized_ranks = self.ascending_ranks + if single_rank: + if single_rank not in self.summarized_ranks: + raise ValueError(f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})") + self.summarized_ranks = [single_rank] + notify(f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} ") + for taxres in self.raw_taxresults: + lininfo = taxres.lineageInfo + if lininfo and lininfo.filled_lineage: # won't always have lineage to summarize (skipped idents, missed idents) + # notify + track perfect matches + if taxres.f_unique_to_query >= 1.0: + if taxres.match_ident not in self.perfect_match: + notify(f"WARNING: 100% match! Is query '{self.query_name}' identical to its database match, '{taxres.match_ident}'?") + self.perfect_match.add(taxres.match_ident) + # add this taxresult to summary + for rank in self.summarized_ranks: + if rank in lininfo.filled_ranks: # only store if this rank is filled. + lin_at_rank = lininfo.pop_to_rank(rank) + self.sum_uniq_weighted[rank][lin_at_rank] += taxres.f_unique_weighted + self.sum_uniq_to_query[rank][lin_at_rank] += taxres.f_unique_to_query + self.sum_uniq_bp[rank][lin_at_rank] += taxres.unique_intersect_bp + # reset ranks levels to the ones that were actually summarized + that we can access for summarized result + self.summarized_ranks = [x for x in self.summarized_ranks if x in self.sum_uniq_bp.keys()] + if single_rank and single_rank not in self.summarized_ranks: + raise ValueError(f"Error: rank '{single_rank}' was not available for any matching lineages.") + + def build_summarized_result(self, single_rank=None, force_resummarize=False): + # just reset if we've already built summarized result (avoid adding to existing)? Or write in an error/force option? + self._init_summarization_results() + # if taxresults haven't been summarized, do that first + if not self.summarized_ranks or force_resummarize: + self.summarize_up_ranks(single_rank=single_rank, force_resummarize=force_resummarize) + # catch potential error from running summarize_up_ranks separately and passing in different single_rank + if single_rank and single_rank not in self.summarized_ranks: + raise ValueError(f"Error: rank '{single_rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + # rank loop is currently done in __main__ + for rank in self.summarized_ranks[::-1]: # reverse so that results are in descending order + sum_uniq_to_query = self.sum_uniq_to_query[rank] #should be lineage: value + # first, sort + sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) + sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + for lineage, f_unique in sorted_sum_uniq_to_query: + query_ani = None + if f_unique > 1: + raise ValueError(f"The tax summary of query '{self.query_name}' is {f_unique}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") + elif f_unique == 0: #no annotated results for this query. do we need to handle this differently now? + continue + f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage] + bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage] + + # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank) + # now that I pulled best_only into separate fn, these don't need to be dicts... + self.total_f_classified[rank] += f_unique + self.total_f_weighted[rank] += f_weighted_at_rank + self.total_bp_classified[rank] += bp_intersect_at_rank + + query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled, + n_unique_kmers=self.query_info.query_hashes, + sequence_len_bp=self.query_info.query_bp).ani + sres = SummarizedGatherResult(lineage=lineage, rank=rank, + f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, + bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) + self.summarized_lineage_results[rank].append(sres) + + # record unclassified + lineage = () + query_ani = None + f_unique = 1.0 - self.total_f_classified[rank] + if f_unique > 0: + f_weighted_at_rank = 1.0 - self.total_f_weighted[rank] + bp_intersect_at_rank = self.query_info.query_bp - self.total_bp_classified[rank] + sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, + fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) + self.summarized_lineage_results[rank].append(sres) + + def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False): + if containment_threshold and not 0 <= containment_threshold <= 1: + raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).") + if ani_threshold and not 0 <= ani_threshold <= 1: + raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).") + self._init_classification_results() # init some fields + #for sg in self.summarized_lineage_results: + if not self.summarized_ranks or force_resummarize: + self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize) + # catch potential error from running summarize_up_ranks separately and passing in different single_rank + self.classified_ranks = self.summarized_ranks + # if a rank is provided, we need to classify ONLY using that rank + if rank: + if rank not in self.summarized_ranks: + raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") + else: + self.classified_ranks = [rank] + # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold + for this_rank in self.classified_ranks: # ascending order or just single rank + # reset for this rank + f_weighted=0.0 + f_unique_at_rank=0.0 + bp_intersect_at_rank=0 + #sum_uniq_to_query = self.sum_uniq_to_query[this_rank] + sum_uniq_weighted = self.sum_uniq_weighted[this_rank] ##SHOULD WE BE USING WEIGHTED HERE? I THINK YES, but this is a change from before. + # sort the results and grab best + sorted_sum_uniq_weighted = list(sum_uniq_weighted.items()) + sorted_sum_uniq_weighted.sort(key = lambda x: -x[1]) + # best only + this_lineage, f_weighted = sorted_sum_uniq_weighted[0] + if f_weighted > 1: + raise ValueError(f"The tax classification of query '{self.query_name}' is {f_weighted}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") + elif f_weighted == 0: #no annotated results for this query. do we need to handle this differently? + continue + else: + status="below_threshold" + f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage] + query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled, + n_unique_kmers=self.query_info.query_hashes, + sequence_len_bp=self.query_info.query_bp).ani + # set classification status based on thresholds + if ani_threshold: # if provided, just use ani thresh + if query_ani and query_ani >= ani_threshold: + status = 'match' + else: + status = 'below_threshold' + elif f_weighted >= containment_threshold: + status = 'match' + # determine whether to move on to a higher tax rank (if avail) + if status == 'match': + break + + if this_rank == "superkingdom" and status == "nomatch": + status="below_threshold" + + # what happens when we don't have a gather match at all? + bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] + + classif = ClassificationResult(status=status, rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, + f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank, + query_ani_at_rank= query_ani) + self.classification_result = classif + + if rank is not None: + lin_as_list = this_lineage.display_lineage().split(';') + krona_classification = (f_weighted, *lin_as_list) + self.krona_classification_result = (krona_classification) + # handle unclassified - do we want/need this? + unclassified_fraction= 1.0-f_weighted + len_unclassified_lin = len(lin_as_list) + unclassifed_lin = ["unclassified"]*(len_unclassified_lin) + self.krona_unclassified_result = (unclassified_fraction, *unclassifed_lin) + self.krona_classification_header=self.make_krona_header(min_rank=rank) + + def make_krona_header(self, min_rank): + "make header for krona output" + if min_rank not in self.summarized_ranks: + raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.") + else: + rank_index = self.ranks.index(min_rank) + return ["fraction"] + list(self.ranks[:rank_index+1]) + + def make_human_summary(self, display_rank, classification=False): + results = [] + if classification: + if not self.classification_result: + raise ValueError("query not classified yet.") + display_rank_results = [self.classification_result] + else: + if not self.summarized_lineage_results: + raise ValueError("lineages not summarized yet.") + display_rank_results = self.summarized_lineage_results[display_rank] + display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank) + + for res in display_rank_results: + results.append(res.as_human_friendly_dict()) + return results + + def make_full_summary(self, classification=False, limit_float=False): + results = [] + if classification: + header= ["query_name", "status", "rank", "fraction", "lineage", + "query_md5", "query_filename", "f_weighted_at_rank", + "bp_match_at_rank", "query_ani_at_rank"] + if not self.classification_result: + raise ValueError("query not classified yet.") + rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float) + results.append(rD) + else: + header= ["query_name", "rank", "fraction", "lineage", "query_md5", + "query_filename", "f_weighted_at_rank", "bp_match_at_rank", + "query_ani_at_rank", "total_weighted_hashes"] + if not self.summarized_lineage_results: + raise ValueError("lineages not summarized yet.") + + for rank in self.summarized_ranks[::-1]: + rank_results = self.summarized_lineage_results[rank] + rank_results.sort(key=lambda res: -res.f_weighted_at_rank) + for res in rank_results: + results.append(res.as_summary_dict(query_info=self.query_info, limit_float=limit_float)) + return header, results + + def make_kreport_results(self): + rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", + "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} + if self.query_info.total_weighted_hashes == 0: + raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0") + required_ranks = set(rankCode.keys).pop('unclassified') + if not set(required_ranks).issubset(set(self.ranks)): + raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}") + kreport_results = [] + unclassified_recorded=False + # want to order results descending by rank + for rank in self.ranks: + if rank == 'strain': # no code for strain, can't include in this output afaik + continue + rank_results = self.summarized_lineage_results[rank] + for res in rank_results: + kresD = res.as_kreport_dict(rankCode, self.query_info.total_weighted_hashes) + if kresD['sci_name'] == "unclassified": + # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. + # Here, we only need to report it once, since it will be the same fraction for all ranks + if unclassified_recorded: + continue + else: + unclassified_recorded = True + kreport_results.append(kresD) + return(kreport_results) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d61d7596a1..20e53092f4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -14,15 +14,17 @@ write_summary, MultiLineageDB, collect_gather_csvs, check_and_load_gather_csvs, SummarizedGatherResult, ClassificationResult, + QueryInfo, GatherRow, TaxResult, QueryTaxResult, + BaseLineageInfo, RankLineageInfo, LineagePair, write_classifications, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac, - LineageDB, LineageDB_Sqlite) + LineageDB, LineageDB_Sqlite, + SumGathInf, ClassInf, QInfo) # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.tax.tax_utils import LineagePair, BaseLineageInfo, RankLineageInfo # utility functions for testing def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): @@ -45,6 +47,68 @@ def make_mini_taxonomy(tax_info): return taxD +def make_GatherRow(gather_dict=None, exclude_cols=[]): + """Load artificial gather row (dict) into GatherRow class""" + # default contains just the essential cols + gatherD = {'query_name': 'q1', + 'query_md5': 'md5', + 'query_filename': 'query_fn', + 'name': 'gA', + 'f_unique_weighted': 0.2, + 'f_unique_to_query': 0.1, + 'query_bp':100, + 'unique_intersect_bp': 20, + 'remaining_bp': 1, + 'ksize': 31, + 'scaled': 1} + if gather_dict is not None: + gatherD.update(gather_dict) + for col in exclude_cols: + gatherD.pop(col) + gatherRaw = GatherRow(**gatherD) + return gatherRaw + + +def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None): + """Make TaxResult from artificial gather row (dict)""" + gRow = make_GatherRow(gather_dict) + taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, keep_identifier_versions=keep_ident_version) + if taxD is not None: + taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents) + return taxres + + +def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_ident=False, keep_ident_version=False, + skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None): + """Make QueryTaxResult(s) from artificial gather information, formatted as list of gather rows (dicts)""" + gather_results = {} + this_querytaxres = None + for gather_infoD in gather_info: + taxres = make_TaxResult(gather_infoD, taxD=taxD, keep_full_ident=keep_full_ident, + keep_ident_version=keep_ident_version, skip_idents=skip_idents) + query_name = taxres.query_name + # add to matching QueryTaxResult or create new one + if not this_querytaxres or not this_querytaxres.is_compatible(taxres): + # get existing or initialize new + this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info)) + this_querytaxres.add_taxresult(taxres) +# print('missed_ident?', taxres.missed_ident) + gather_results[query_name] = this_querytaxres + if summarize: + for query_name, qres in gather_results.items(): + qres.build_summarized_result() + if classify: + for query_name, qres in gather_results.items(): + qres.build_classification_result(rank=classify_rank, containment_threshold=c_thresh, ani_threshold=ani_thresh) + # for convenience: If working with single query, just return that QueryTaxResult. + if single_query: + if len(gather_results.keys()) > 1: + raise ValueError("You passed in results for more than one query") + else: + return next(iter(gather_results.values())) + return gather_results + + ## tests def test_ascending_taxlist_1(): assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] @@ -54,24 +118,60 @@ def test_ascending_taxlist_2(): assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] +def test_GatherRow_old_gather(): + # gather does not contain query_name column + gA = {"name": "gA.1 name"} + with pytest.raises(TypeError) as exc: + make_GatherRow(gA, exclude_cols=['query_bp']) + print(str(exc)) + assert "__init__() missing 1 required positional argument: 'query_bp'" in str(exc) + + def test_get_ident_default(): ident = "GCF_001881345.1" n_id = get_ident(ident) assert n_id == "GCF_001881345" +def test_TaxResult_get_ident_default(): + gA = {"name": "GCF_001881345.1"} # gather result with match name as GCF_001881345.1 + taxres = make_TaxResult(gA) + print(taxres.match_ident) + assert taxres.match_ident == "GCF_001881345" + + def test_get_ident_split_but_keep_version(): - ident = "GCF_001881345.1" + ident = "GCF_001881345.1 secondname" n_id = get_ident(ident, keep_identifier_versions=True) assert n_id == "GCF_001881345.1" +def test_TaxResult_get_ident_split_but_keep_version(): + gA = {"name": "GCF_001881345.1 secondname"} + taxres = make_TaxResult(gA, keep_ident_version=True) + print("raw ident: ", taxres.raw.name) + print("keep_full?: ", taxres.keep_full_identifiers) + print("keep_version?: ",taxres.keep_identifier_versions) + print("final ident: ", taxres.match_ident) + assert taxres.match_ident == "GCF_001881345.1" + + def test_get_ident_no_split(): ident = "GCF_001881345.1 secondname" n_id = get_ident(ident, keep_full_identifiers=True) assert n_id == "GCF_001881345.1 secondname" +def test_TaxResult_get_ident_keep_full(): + gA = {"name": "GCF_001881345.1 secondname"} + taxres = make_TaxResult(gA, keep_full_ident=True) + print("raw ident: ", taxres.raw.name) + print("keep_full?: ", taxres.keep_full_identifiers) + print("keep_version?: ",taxres.keep_identifier_versions) + print("final ident: ", taxres.match_ident) + assert taxres.match_ident == "GCF_001881345.1 secondname" + + def test_collect_gather_csvs(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') from_file = runtmp.output("tmp-from-file.txt") @@ -684,13 +784,13 @@ def test_summarize_gather_at_best_only_equal_choose_first(): def test_write_summary_csv(runtmp): """test summary csv write function""" - sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, + sum_gather = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], - 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, + 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), @@ -711,7 +811,7 @@ def test_write_summary_csv(runtmp): def test_write_classification(runtmp): """test classification csv write function""" - classif = ClassificationResult('queryA', 'match', 'phylum', 1.0, + classif = ClassInf('queryA', 'match', 'phylum', 1.0, (lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')), 'queryA_md5', 'queryA.sig', 1.0, 100, @@ -908,26 +1008,26 @@ def test_write_krona(runtmp): def test_combine_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, + sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], - 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, + 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} - sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, + sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], - 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, + 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), @@ -1003,26 +1103,26 @@ def test_write_lineage_sample_frac_format_lineage(runtmp): def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, + sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], - 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, + 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} - sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, + sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], - 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, + 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), From 808e6befb55e4c603ccdc02f9116dab4307bf1bd Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 10:42:14 -0800 Subject: [PATCH 11/21] finish renaming --- src/sourmash/tax/__main__.py | 8 ++++---- src/sourmash/tax/tax_utils.py | 7 ++----- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 6544e5d610..905db68afd 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -13,7 +13,7 @@ from sourmash.lca.lca_utils import display_lineage, zip_lineage from . import tax_utils -from .tax_utils import ClassificationResult, MultiLineageDB +from .tax_utils import ClassInf, MultiLineageDB usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. @@ -222,7 +222,7 @@ def genome(args): notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") else: status="match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank) + classif = ClassInf(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank) classifications[args.rank].append(classif) matched_queries.add(sg.query_name) if "krona" in args.output_format: @@ -251,7 +251,7 @@ def genome(args): elif sg.fraction >= args.containment_threshold: status = "match" if status == "match": - classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank, + classif = ClassInf(query_name=sg.query_name, status=status, rank=sg.rank, fraction=sg.fraction, lineage=sg.lineage, query_md5=sg.query_md5, query_filename=sg.query_filename, f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, @@ -261,7 +261,7 @@ def genome(args): continue elif rank == "superkingdom" and status == "nomatch": status="below_threshold" - classif = ClassificationResult(query_name=sg.query_name, status=status, + classif = ClassInf(query_name=sg.query_name, status=status, rank="", fraction=0, lineage="", query_md5=sg.query_md5, query_filename=sg.query_filename, f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index db1bb5b593..36fb9475f7 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -777,7 +777,7 @@ def write_human_summary(summarized_gather, out_fp, display_rank): ''' Write human-readable taxonomy-summarized gather results for a specific rank. ''' - header = SummarizedGatherResult._fields + header = SumGathInf._fields found_ANI = False results = [] @@ -1586,10 +1586,7 @@ class SummarizedGatherResult(): def as_summary_dict(self, query_info=None, limit_float=False): sD = asdict(self) - if self.lineage == (): # to do -- get rid of me by using null RankLineageInfo() instead of () for empties - sD['lineage'] = 'unclassified' - else: - sD['lineage'] = self.lineage.display_lineage() + sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) if query_info is not None: sD['query_name'] = query_info.query_name sD['query_md5'] = query_info.query_md5 From 7c9fdb4450cdc1c004fdeb609701a0a2ddf7dcee Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 6 Jan 2023 11:00:51 -0800 Subject: [PATCH 12/21] add tests for new classes --- src/sourmash/tax/tax_utils.py | 4 +- tests/test_tax_utils.py | 755 ++++++++++++++++++++++++++++++++++ 2 files changed, 757 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 36fb9475f7..a2503a3e4e 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1764,7 +1764,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): self.total_bp_classified[rank] += bp_intersect_at_rank query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled, - n_unique_kmers=self.query_info.query_hashes, + n_unique_kmers=self.query_info.query_n_hashes, sequence_len_bp=self.query_info.query_bp).ani sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, @@ -1820,7 +1820,7 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment status="below_threshold" f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage] query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled, - n_unique_kmers=self.query_info.query_hashes, + n_unique_kmers=self.query_info.query_n_hashes, sequence_len_bp=self.query_info.query_bp).ani # set classification status based on thresholds if ani_threshold: # if provided, just use ani thresh diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 20e53092f4..2f620627e4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -3,6 +3,7 @@ """ import pytest +from pytest import approx import os from os.path import basename import gzip @@ -1793,3 +1794,757 @@ def test_lineage_at_rank_below_rank(): LineagePair(rank='class', name='c__c', taxid=None), LineagePair(rank='order', name='o__d', taxid=None), LineagePair(rank='family', name='f__f', taxid=None)) + + +def test_TaxResult_get_match_lineage_1(): + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + gA = {"name": "gA.1 name"} + taxres = make_TaxResult(gA) + taxres.get_match_lineage(tax_assignments=taxD) + assert taxres.lineageInfo.display_lineage() == "a;b;c" + + +def test_TaxResult_get_match_lineage_skip_ident(): + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + gA = {"name": "gA.1 name"} + taxres = make_TaxResult(gA) + taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA']) + print("skipped_ident?: ", taxres.skipped_ident) + print("missed_ident?: ", taxres.missed_ident) + assert taxres.skipped_ident == True + assert taxres.lineageInfo.display_lineage() == "" + + +def test_TaxResult_get_match_lineage_missed_ident(): + gA_tax = ("gA.1", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + gA = {"name": "gA.1 name"} + taxres = make_TaxResult(gA) + taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB']) + print("skipped_ident?: ", taxres.skipped_ident) + print("missed_ident?: ", taxres.missed_ident) + assert taxres.skipped_ident == False + assert taxres.missed_ident == True + assert taxres.lineageInfo.display_lineage() == "" + + +def test_TaxResult_get_match_lineage_missed_ident_fail_on_missing(): + gA_tax = ("gA.1", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + gA = {"name": "gA.1 name"} + taxres = make_TaxResult(gA) + with pytest.raises(ValueError) as exc: + taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gB'], fail_on_missing_taxonomy=True) + print(str(exc)) + assert "Error: ident 'gA' is not in the taxonomy database." in str(exc) + + +def test_QueryTaxResult(): + "basic functionality: initialize and add a taxresult" + tax_info = [("gA", "a;b;c")] + taxD = make_mini_taxonomy(tax_info=tax_info) + taxres = make_TaxResult(taxD=taxD) + # initialize + q_res = QueryTaxResult(taxres.query_info) + q_res.add_taxresult(taxres) + # check that new querytaxres is compatible with taxres + assert q_res.is_compatible(taxres) + # check that a few thngs were set properly and/or are not yet set. + assert q_res.query_name == "q1" + assert q_res.query_info.query_bp == 100 + assert len(q_res.raw_taxresults) == 1 + assert q_res.skipped_idents == set() + assert q_res.missed_idents == set() + assert q_res.summarized_lineage_results == {} + + +def test_QueryTaxResult_add_incompatible(): + "initialize and try to add incompatible taxresult" + tax_info = [("gA", "a;b;c")] + taxD = make_mini_taxonomy(tax_info=tax_info) + taxres = make_TaxResult(taxD=taxD) + taxres2 = make_TaxResult({'query_name': 'q2'}, taxD=taxD) + # initialize + q_res = QueryTaxResult(taxres.query_info) + # check that new querytaxres is compatible with taxres and not taxres2 + assert q_res.is_compatible(taxres) + assert not q_res.is_compatible(taxres2) + q_res.add_taxresult(taxres) + with pytest.raises(ValueError) as exc: + q_res.add_taxresult(taxres2) + print(str(exc)) + assert "Error: Cannot add TaxResult: query information does not match." in str(exc) + + +def test_QueryTaxResult_add_without_tax_info(): + "initialize and add a taxresult with missed ident" + taxres = make_TaxResult() # do not add taxonomic info + # initialize + q_res = QueryTaxResult(taxres.query_info) + print("attempted to add lineage info?: ", taxres.match_lineage_attempted) + with pytest.raises(ValueError) as exc: + q_res.add_taxresult(taxres) + print(str(exc)) + assert "Error: Cannot add TaxResult. Please use get_match_lineage() to add taxonomic lineage information first." in str(exc) + + +def test_QueryTaxResult_add_skipped_ident(): + "initialize and add a taxresult with skipped ident" + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + taxres = make_TaxResult(taxD=taxD, skip_idents = ['gA']) +# taxres.get_match_lineage(tax_assignments=taxD, skip_idents=['gA']) + # initialize + q_res = QueryTaxResult(taxres.query_info) + q_res.add_taxresult(taxres) + assert len(q_res.skipped_idents) == 1 + assert len(q_res.raw_taxresults) == 1 + assert q_res.missed_idents == set() + assert q_res.summarized_lineage_results == {} + + +def test_QueryTaxResult_add_missed_ident(): + "initialize and add a taxresult with missed ident" + gA_tax = ("gB", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + taxres = make_TaxResult(taxD=taxD) + # initialize + q_res = QueryTaxResult(taxres.query_info) + # add taxonomic info to taxres + q_res.add_taxresult(taxres) + assert len(q_res.missed_idents) == 1 + assert len(q_res.raw_taxresults) == 1 + assert q_res.skipped_idents == set() + assert q_res.summarized_lineage_results == {} + + +def test_QueryTaxResult_track_missed_and_skipped(): + "make sure missed and skipped idents are being tracked" + # make taxonomy + tax_info = [("gA", "a;b;c"), ("gB", "a;b;d")] + taxD = make_mini_taxonomy(tax_info=tax_info) + # make results + taxres = make_TaxResult() + taxres2 = make_TaxResult({"name": 'gB'}) # skipped + taxres3 = make_TaxResult({"name": 'gB'}) # skipped + taxres4 = make_TaxResult({"name": 'gC'}) # skipped + taxres5 = make_TaxResult({"name": 'gD'}) # missed + taxres6 = make_TaxResult({"name": 'gE'}) # missed + # initialize + q_res = QueryTaxResult(taxres.query_info) + # add taxonomic info to taxres, add to q_res + for n, tr in enumerate([taxres, taxres2, taxres3, taxres4, taxres5, taxres6]): + tr.get_match_lineage(tax_assignments=taxD, skip_idents=['gB', 'gC']) + print("num: ", n) + print("skipped?: ", tr.skipped_ident) + print("missed?: ", tr.missed_ident) + q_res.add_taxresult(tr) + assert len(q_res.raw_taxresults) == 6 + print(q_res.n_skipped) + print(q_res.n_missed) + assert q_res.n_missed == 2 + assert q_res.n_skipped == 3 + assert 'gB' in q_res.skipped_idents + assert len(q_res.skipped_idents) == 2 + assert 'gD' in q_res.missed_idents + assert q_res.summarized_lineage_results == {} + + +def test_QueryTaxResult_track_missed_and_skipped_using_fn(): + "make sure missed and skipped idents are being tracked. Same as above but use helper fn." + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}, {"name": 'gB'}, {"name": 'gC'}, {"name": 'gD'}, {"name": 'gE'}] + gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB', 'gC']) + # should have 6 results for default query 'q1' + print(gres.keys()) + q_res = next(iter(gres.values())) + assert len(q_res.raw_taxresults) == 6 + print(q_res.n_skipped) + print(q_res.n_missed) + assert q_res.n_missed == 2 + assert q_res.n_skipped == 3 + assert 'gB' in q_res.skipped_idents + assert len(q_res.skipped_idents) == 2 + assert 'gD' in q_res.missed_idents + assert q_res.summarized_lineage_results == {} + + +def test_QueryTaxResult_summarize_up_ranks_1(): + "basic functionality: summarize up ranks" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD) + assert len(gres.keys()) == 1 + q_res = next(iter(gres.values())) + # now summarize up the ranks + q_res.summarize_up_ranks() + assert len(q_res.raw_taxresults) == 2 + #print(q_res.sum_uniq_weighted.values()) + #print(q_res.sum_uniq_weighted['superkingdom']) + assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom'] + assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.4)} + assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)} + assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 40} + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.4)} + assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)} + assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 40} + assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2), + RankLineageInfo(lineage_str="a;b;d"): approx(0.2)} + assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1), + RankLineageInfo(lineage_str="a;b;d"): approx(0.1)} + assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20, + RankLineageInfo(lineage_str="a;b;d"): 20} + + +def test_QueryTaxResult_summarize_up_ranks_2(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + q_res.summarize_up_ranks() + assert len(q_res.raw_taxresults) == 2 + print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_weighted['superkingdom']) + assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)} + assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.15)} + assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 30} + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)} + assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)} + assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30} + assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2), + RankLineageInfo(lineage_str="a;b;d"): approx(0.1)} + assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1), + RankLineageInfo(lineage_str="a;b;d"): approx(0.05)} + assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20, + RankLineageInfo(lineage_str="a;b;d"): 10} + + + +def test_QueryTaxResult_summarize_up_ranks_missing_lineage(): + "basic functionality: summarize up ranks" + taxD = make_mini_taxonomy([("gA", "a;b;c")]) + gather_results = [{}, {"name": 'gB'}] + gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD) + assert len(gres.keys()) == 1 + q_res = next(iter(gres.values())) + # now summarize up the ranks + q_res.summarize_up_ranks() + assert len(q_res.raw_taxresults) == 2 + #print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_weighted['superkingdom']) + assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)} + assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)} + assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20} + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)} + assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)} + assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20} + assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)} + assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)} + assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20} + + +def test_QueryTaxResult_summarize_up_ranks_skipped_lineage(): + "basic functionality: summarize up ranks" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, skip_idents=['gB']) + assert len(gres.keys()) == 1 + q_res = next(iter(gres.values())) + # now summarize up the ranks + q_res.summarize_up_ranks() + assert len(q_res.raw_taxresults) == 2 + assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom'] + #print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_weighted['superkingdom']) + assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.2)} + assert q_res.sum_uniq_to_query['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.1)} + assert q_res.sum_uniq_bp['superkingdom'] == {RankLineageInfo(lineage_str="a"): 20} + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.2)} + assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.1)} + assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 20} + assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2)} + assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1)} + assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20} + + +def test_QueryTaxResult_summarize_up_ranks_perfect_match(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{'f_unique_to_query': 1.0}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + q_res.summarize_up_ranks() + assert len(q_res.raw_taxresults) == 1 + print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_to_query['superkingdom']) + assert list(q_res.sum_uniq_to_query['superkingdom'].values()) == [1.0] + assert 'gA' in q_res.perfect_match + + +def test_QueryTaxResult_summarize_up_ranks_already_summarized(): + "summarize up ranks: error, already summarized" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{'f_unique_to_query': 1.0}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + q_res.summarize_up_ranks() + with pytest.raises(ValueError) as exc: + q_res.summarize_up_ranks() + print(str(exc)) + assert "Error: already summarized" in str(exc) + + +def test_QueryTaxResult_summarize_up_ranks_already_summarized_force(): + "summarize up ranks: already summarized but force" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + q_res.summarize_up_ranks() + q_res.summarize_up_ranks(force_resummarize=True) + assert list(q_res.sum_uniq_weighted.keys()) == ['class', 'phylum', 'superkingdom'] + + #check that all results are still good + assert len(q_res.raw_taxresults) == 2 + assert q_res.sum_uniq_weighted['superkingdom'] == {RankLineageInfo(lineage_str="a"): approx(0.3)} + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)} + assert q_res.sum_uniq_to_query['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.15)} + assert q_res.sum_uniq_bp['phylum'] == {RankLineageInfo(lineage_str="a;b"): 30} + assert q_res.sum_uniq_to_query['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.1), + RankLineageInfo(lineage_str="a;b;d"): approx(0.05)} + assert q_res.sum_uniq_weighted['class'] == {RankLineageInfo(lineage_str="a;b;c"): approx(0.2), + RankLineageInfo(lineage_str="a;b;d"): approx(0.1)} + assert q_res.sum_uniq_bp['class'] == {RankLineageInfo(lineage_str="a;b;c"): 20, + RankLineageInfo(lineage_str="a;b;d"): 10} + + +def test_QueryTaxResult_summarize_up_ranks_single_rank(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + q_res.summarize_up_ranks(single_rank='phylum') + assert len(q_res.raw_taxresults) == 2 + assert list(q_res.sum_uniq_weighted.keys()) == ['phylum'] + print(q_res.sum_uniq_weighted.keys()) + print(q_res.sum_uniq_weighted.values()) + print(q_res.sum_uniq_weighted['phylum']) + assert q_res.sum_uniq_weighted['phylum'] == {RankLineageInfo(lineage_str="a;b"): approx(0.3)} + assert list(q_res.sum_uniq_to_query['phylum'].values()) == [approx(0.15)] + assert list(q_res.sum_uniq_bp['phylum'].values()) == [30] + assert q_res.summarized_ranks == ['phylum'] + +def test_QueryTaxResult_summarize_up_ranks_single_rank_not_available(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + with pytest.raises(ValueError) as exc: + q_res.summarize_up_ranks(single_rank='NotARank') + print(str(exc)) + assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc) + + +def test_QueryTaxResult_summarize_up_ranks_single_rank_not_filled(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_weighted': 0.1,'f_unique_to_query': 0.05,'unique_intersect_bp': 10,}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + with pytest.raises(ValueError) as exc: + q_res.summarize_up_ranks(single_rank='species') + print(str(exc)) + assert "Error: rank 'species' was not available for any matching lineages." in str(exc) + + +def test_QueryTaxResult_build_summarized_result_1(): + "basic functionality: build summarized_result" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_summarized_result() + print(q_res.summarized_lineage_results.keys()) + sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.2, f_weighted_at_rank=0.4, + lineage=RankLineageInfo(lineage_str='a'), + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)), + SummarizedGatherResult(rank='superkingdom', fraction=0.8, f_weighted_at_rank=0.6, + lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)] + print(q_res.summarized_lineage_results['superkingdom']) + assert q_res.summarized_lineage_results['superkingdom'] == sk + print(q_res.summarized_lineage_results['phylum']) + phy = [SummarizedGatherResult(rank='phylum', fraction=0.2, f_weighted_at_rank=0.4, + lineage=RankLineageInfo(lineage_str='a;b'), + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)), + SummarizedGatherResult(rank='phylum', fraction=0.8, f_weighted_at_rank=0.6, + lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['phylum'] == phy + print(q_res.summarized_lineage_results['class']) + cl = [SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str='a;b;c'), + bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)), + SummarizedGatherResult(rank='class', fraction=0.1, f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str='a;b;d'), + bp_match_at_rank=20, query_ani_at_rank=approx(0.93, rel=1e-2)), + SummarizedGatherResult(rank='class', fraction=0.8, f_weighted_at_rank=0.6, + lineage=(), bp_match_at_rank=60, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['class'] == cl + + assert q_res.total_f_weighted['phylum'] == approx(0.4) + assert q_res.total_f_classified['class'] == approx(0.2) + assert q_res.total_bp_classified['superkingdom'] == 40 + + +def test_QueryTaxResult_build_summarized_result_2(): + """test two queries, build summarized result for each""" + # make mini taxonomy + gA_tax = ("gA", "a;b") + gB_tax = ("gB", "a;c") + taxD = make_mini_taxonomy([gA_tax, gB_tax]) + # make gather results + gather_results = [{'query_name': 'queryA', 'name': 'gA', 'f_unique_weighted': 0.5,'f_unique_to_query': 0.5,'unique_intersect_bp': 50}, + {'query_name': 'queryA', "name": 'gB', 'f_unique_weighted': 0.4,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}, + {'query_name': 'queryB', "name": 'gB', 'f_unique_weighted': 0.3,'f_unique_to_query': 0.3,'unique_intersect_bp': 30}] + gres = make_QueryTaxResults(gather_info=gather_results, taxD=taxD) + + for query_name, q_res in gres.items(): + q_res.build_summarized_result() # summarize and build result + sk = q_res.summarized_lineage_results['superkingdom'] + phy = q_res.summarized_lineage_results['phylum'] + assert len(sk) == 2 + assert sk[0].lineage == RankLineageInfo(lineage_str="a") + print(phy) + if query_name == 'queryA': + # check superkingdom results + assert sk[0].fraction == approx(0.8) + assert sk[0].f_weighted_at_rank == approx(0.9) + assert sk[0].bp_match_at_rank == 80 + assert sk[1].fraction == approx(0.2) + assert sk[1].f_weighted_at_rank == approx(0.1) + assert sk[1].bp_match_at_rank == 20 + assert sk[1].lineage == () + # check phylum results + assert len(phy) == 3 + assert phy[0].fraction == approx(0.5) + assert phy[0].f_weighted_at_rank == approx(0.5) + assert phy[0].bp_match_at_rank == 50 + assert phy[0].lineage == RankLineageInfo(lineage_str="a;b") + assert phy[1].fraction == approx(0.3) + assert phy[1].f_weighted_at_rank == approx(0.4) + assert phy[1].bp_match_at_rank == 30 + assert phy[1].lineage == RankLineageInfo(lineage_str="a;c") + assert phy[2].fraction == approx(0.2) + assert phy[2].f_weighted_at_rank == approx(0.1) + assert phy[2].bp_match_at_rank == 20 + assert phy[2].lineage == () + if query_name == 'queryB': + # check superkingdom results + assert sk[0].fraction == approx(0.3) + assert sk[0].f_weighted_at_rank == approx(0.3) + assert sk[0].bp_match_at_rank == 30 + assert sk[1].fraction == approx(0.7) + assert sk[1].f_weighted_at_rank == approx(0.7) + assert sk[1].bp_match_at_rank == 70 + assert sk[1].lineage == () + # check phylum results + assert len(phy) == 2 + assert phy[0].fraction == approx(0.3) + assert phy[0].f_weighted_at_rank == approx(0.3) + assert phy[0].bp_match_at_rank == 30 + assert phy[0].lineage == RankLineageInfo(lineage_str="a;c") + assert phy[1].fraction == approx(0.7) + assert phy[1].f_weighted_at_rank == approx(0.7) + assert phy[1].bp_match_at_rank == 70 + assert phy[1].lineage == () + + +def test_QueryTaxResult_build_summarized_result_missing_lineage(): + "build summarized_result with missing lineage" + taxD = make_mini_taxonomy([("gA", "a;b;c")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_summarized_result() + print(q_res.summarized_lineage_results.keys()) + print(q_res.summarized_lineage_results['superkingdom']) + + sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a"), + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=(),f_weighted_at_rank=0.8, + bp_match_at_rank=80, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['superkingdom'] == sk + print(q_res.summarized_lineage_results['phylum']) + phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=(),f_weighted_at_rank=0.8, + bp_match_at_rank=80, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['phylum'] == phy + print(q_res.summarized_lineage_results['class']) + cl = [SummarizedGatherResult(rank='class', fraction=0.1, lineage= RankLineageInfo(lineage_str="a;b;c"), + f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='class', fraction=0.9, lineage=(), f_weighted_at_rank=0.8, + bp_match_at_rank=80, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['class'] == cl + + assert q_res.total_f_weighted['phylum'] == approx(0.2) + assert q_res.total_f_classified['class'] == approx(0.1) + assert q_res.total_bp_classified['superkingdom'] == 20 + + +def test_QueryTaxResult_build_summarized_result_skipped_lineage(): + "build summarized_result with skipped lineage" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, skip_idents=['gB']) + q_res.build_summarized_result() + print(q_res.summarized_lineage_results.keys()) + print(q_res.summarized_lineage_results['superkingdom']) + + sk = [SummarizedGatherResult(rank='superkingdom', fraction=0.1, f_weighted_at_rank=0.2, + lineage=RankLineageInfo(lineage_str="a"), + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='superkingdom', fraction=0.9, lineage=(),f_weighted_at_rank=0.8, + bp_match_at_rank=80, query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['superkingdom'] == sk + print(q_res.summarized_lineage_results['phylum']) + phy = [SummarizedGatherResult(rank='phylum', fraction=0.1, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='phylum', fraction=0.9, lineage=(), f_weighted_at_rank=0.8, bp_match_at_rank=80, + query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['phylum'] == phy + print(q_res.summarized_lineage_results['class']) + cl = [SummarizedGatherResult(rank='class', fraction=0.1,lineage=RankLineageInfo(lineage_str="a;b;c"), + f_weighted_at_rank=0.2, bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)), + SummarizedGatherResult(rank='class', fraction=0.9, lineage=(), f_weighted_at_rank=0.8, bp_match_at_rank=80, + query_ani_at_rank=None)] + assert q_res.summarized_lineage_results['class'] == cl + + assert q_res.total_f_weighted['phylum'] == approx(0.2) + assert q_res.total_f_classified['class'] == approx(0.1) + assert q_res.total_bp_classified['superkingdom'] == 20 + + +def test_QueryTaxResult_build_summarized_result_over100percent(): + "summarize up ranks: different values" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB','f_unique_to_query': 0.95}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + # now summarize up the ranks + assert len(q_res.raw_taxresults) == 2 + with pytest.raises(ValueError) as exc: + q_res.build_summarized_result() + print(str(exc)) + assert "The tax summary of query 'q1' is 1.05, which is > 100% of the query!! This should not be possible." in str(exc) + + +def test_build_summarized_result_rank_fail_not_available_resummarize(): + "build classification result" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.summarize_up_ranks('superkingdom') + with pytest.raises(ValueError) as exc: + q_res.build_summarized_result(single_rank='order') + print(str(exc)) + assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) + + +def test_build_classification_result_containment_threshold(): + "basic functionality: build classification result using containment threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(containment_threshold=0.1) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + q_res.build_classification_result(containment_threshold=0.4) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4, + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + q_res.build_classification_result(containment_threshold=1.0) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, + lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4, + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + + + +def test_build_classification_result_containment_threshold_fail(): + "classification result: improper containment threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(containment_threshold=1.2) + print(str(exc)) + assert "Containment threshold must be between 0 and 1 (input value: 1.2)." in str(exc) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(containment_threshold=-.1) + print(str(exc)) + assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc) + + +def test_build_classification_result_ani_threshold(): + "basic functionality: build classification result" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(ani_threshold=.92) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + q_res.build_classification_result(ani_threshold=0.94) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, + lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4, + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + q_res.build_classification_result(ani_threshold=0.96) + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, + lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4, + bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + + +def test_build_classification_result_ani_threshold_fail(): + "classification result: improper ANI threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(ani_threshold=1.2) + print(str(exc)) + assert "ANI threshold must be between 0 and 1 (input value: 1.2)." in str(exc) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(ani_threshold=-.1) + print(str(exc)) + assert "ANI threshold must be between 0 and 1 (input value: -0.1)." in str(exc) + + +def test_build_classification_result_rank_fail_not_filled(): + "classification result: rank not available (wasn't filled in tax lineage matches)" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(rank='order') + print(str(exc)) + assert "Error: rank 'order' was not available for any matching lineages." in str(exc) + +def test_build_classification_result_rank_fail_not_available_resummarize(): + "classification result: rank not available (wasn't summarized)" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.summarize_up_ranks('superkingdom') + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(rank='order') + print(str(exc)) + assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) + +def test_build_classification_result_rank_fail_not_available(): + "classification result: rank not available" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + with pytest.raises(ValueError) as exc: + q_res.build_classification_result(rank='NotARank') + print(str(exc)) + assert "Error: rank 'NotARank' not in available ranks (strain, species, genus, family, order, class, phylum, superkingdom)" in str(exc) + + +def test_build_classification_result_rank_containment_threshold(): + "classification result - rank and containment threshold (default)" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='class') + print("classif: ", q_res.classification_result) + assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + q_res.build_classification_result(rank='class', containment_threshold=0.4) + assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + +def test_build_classification_result_rank_ani_threshold(): + "classification result with rank and ANI threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='class', ani_threshold=0.92) + assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + q_res.build_classification_result(rank='class', ani_threshold=0.95) + assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, + lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, + bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + + +def test_krona_classification_result(): + "basic functionality: build classification result using containment threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result() + assert q_res.krona_classification_result == None + q_res.build_classification_result(rank='phylum')#, force_resummarize=True) + print(q_res.krona_classification_result) + assert q_res.krona_classification_result == (0.4, 'a', 'b') + q_res.build_classification_result(rank='superkingdom') + print(q_res.krona_classification_result) + assert q_res.krona_classification_result == (0.4, 'a') + q_res.build_classification_result() + assert q_res.krona_classification_result == None + +def test_make_krona_header_0(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + phy_header = ["fraction", "superkingdom", "phylum"] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='phylum') + print(q_res.krona_classification_result) + print(q_res.krona_classification_header) + assert q_res.krona_classification_header == phy_header + hd = q_res.make_krona_header('phylum') + print("header: ", hd) + assert hd == phy_header + + +def test_make_krona_header_1(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + class_header = ["fraction", "superkingdom", "phylum", "class"] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='class') + assert q_res.krona_classification_header == class_header + hd = q_res.make_krona_header(min_rank='class') + print("header: ", hd) + assert hd == class_header + + +def test_make_krona_header_fail(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + with pytest.raises(ValueError) as exc: + q_res.make_krona_header("order") + assert "Rank 'order' not present in summarized ranks." in str(exc.value) + with pytest.raises(ValueError) as exc: + q_res.make_krona_header("NotARank") + assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value) + From 8c2c546d79bb19fbbe994d6bacbdb09da5bb73b2 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Sat, 7 Jan 2023 10:29:03 -0800 Subject: [PATCH 13/21] upd --- src/sourmash/tax/tax_utils.py | 5 +++- tests/test_tax_utils.py | 54 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index a2503a3e4e..c7b4e7e66d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1586,7 +1586,10 @@ class SummarizedGatherResult(): def as_summary_dict(self, query_info=None, limit_float=False): sD = asdict(self) - sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) + if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini? + sD['lineage'] = "unclassified" + else: + sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) if query_info is not None: sD['query_name'] = query_info.query_name sD['query_md5'] = query_info.query_md5 diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 2f620627e4..bde1dbf158 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2548,3 +2548,57 @@ def test_make_krona_header_fail(): q_res.make_krona_header("NotARank") assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value) + +# def test_make_human_summary(): +# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) +# gather_results = [{}, {"name": 'gB'}] +# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) +# hs = q_res.make_human_summary(display_rank = "superkingdom") +# print(hs) +# assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', +# 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', +# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', +# 'total_weighted_hashes': 0}, +# {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", +# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', +# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +# def test_make_human_summary_2(): +# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) +# gather_results = [{}, {"name": 'gB'}] +# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) +# hs = q_res.make_human_summary(display_rank = "phylum") +# print(hs) +# assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', +# 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', +# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', +# 'total_weighted_hashes': 0}, +# {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', +# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', +# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +# def test_make_human_summary_classification(): +# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) +# gather_results = [{}, {"name": 'gB'}] +# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom") +# hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) +# print(hs) +# assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', +# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, +# 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', +# 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +# def test_make_human_summary_classification_2(): +# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) +# gather_results = [{}, {"name": 'gB'}] +# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum") +# hs = q_res.make_human_summary(display_rank = "phylum", classification=True) +# print(hs) +# assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', +# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, +# 'query_ani_at_rank': '94.9%', 'status': 'match', +# 'query_name': 'q1', 'query_md5': 'md5', +# 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] From 00f9a64c3517109f14d7afc80271c000a1f7130a Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 13:48:32 -0800 Subject: [PATCH 14/21] more tests; move status checking into ClassificationResult --- src/sourmash/tax/tax_utils.py | 157 ++++++++++++------------ tests/test_tax_utils.py | 220 ++++++++++++++++++++++++---------- 2 files changed, 242 insertions(+), 135 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index c7b4e7e66d..1f255e427e 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1491,21 +1491,29 @@ class GatherRow(): # all cols should match "gather_write_cols" in `search.py` total_weighted_hashes: int = None -@dataclass(frozen=True) +@dataclass() class QueryInfo(): - """Class for storing per-rank lineage information""" - query_name: str - query_md5: str - query_filename: str - query_bp: int - query_n_hashes: int - ksize: int - scaled: int - total_weighted_hashes: int = 0 + """Class for storing query information""" + query_name: str + query_md5: str + query_filename: str + query_bp: int + ksize: int + scaled: int + query_n_hashes: int = None + total_weighted_hashes: int = 0 + + def __post_init__(self): + "Initialize and cast types" + self.query_bp = int(self.query_bp) + self.ksize = int(self.ksize) + self.scaled = int(self.scaled) + self.query_n_hashes = int(self.query_n_hashes) if self.query_n_hashes else 0 + self.total_weighted_hashes = int(self.total_weighted_hashes) if self.total_weighted_hashes else 0 - @property - def total_weighted_bp(self): - return self.total_weighted_hashes * self.scaled + @property + def total_weighted_bp(self): + return self.total_weighted_hashes * self.scaled @dataclass class TaxResult(): @@ -1524,22 +1532,15 @@ class TaxResult(): def __post_init__(self): self.get_ident() - self.query_name = self.raw.query_name - if self.raw.query_n_hashes: - self.raw.query_n_hashes = int(self.raw.query_n_hashes) - if self.raw.total_weighted_hashes: - self.raw.total_weighted_hashes = int(self.raw.total_weighted_hashes) - else: - self.raw.total_weighted_hashes = 0 - + self.query_name = self.raw.query_name # convenience self.query_info = QueryInfo(query_name = self.raw.query_name, query_md5=self.raw.query_md5, query_filename = self.raw.query_filename, - query_bp = int(self.raw.query_bp), + query_bp = self.raw.query_bp, query_n_hashes = self.raw.query_n_hashes, total_weighted_hashes = self.raw.total_weighted_hashes, - ksize = int(self.raw.ksize), - scaled = int(self.raw.scaled) + ksize = self.raw.ksize, + scaled = self.raw.scaled ) # cast and store the imp bits self.f_unique_to_query = float(self.raw.f_unique_to_query) @@ -1582,7 +1583,12 @@ class SummarizedGatherResult(): lineage: RankLineageInfo f_weighted_at_rank: float bp_match_at_rank: int - query_ani_at_rank: float + query_ani_at_rank: float = None + + def set_query_ani(self, query_info): + self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled, + n_unique_kmers=query_info.query_n_hashes, + sequence_len_bp=query_info.query_bp).ani def as_summary_dict(self, query_info=None, limit_float=False): sD = asdict(self) @@ -1633,8 +1639,41 @@ def as_kreport_dict(self, rankCode, total_weighted_hashes): @dataclass class ClassificationResult(SummarizedGatherResult): # """Class for storing summarized lineage information""" - status: str + status: str = field(init=False) + def __post_init__(self): + self.status = None + + def set_status(self, query_info, containment_threshold=None, ani_threshold=None): + # check for out of bounds values, set "nomatch" if no match at all + if self.f_weighted_at_rank > 1: + raise ValueError(f"The tax classification of query '{query_info.query_name}' is {self.f_weighted_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") + elif self.f_weighted_at_rank == 0: #no annotated results for this query. This also shouldn't happen, right? + self.status="nomatch" + else: + # if any matches, use 'below_threshold' as default; set 'match' if meets threshold + self.status="below_threshold" + self.set_query_ani(query_info=query_info) + if ani_threshold: # if provided, just use ani thresh, don't use containment threshold + if self.query_ani_at_rank >= ani_threshold: + self.status = 'match' + elif containment_threshold and self.f_weighted_at_rank >= containment_threshold: + self.status = 'match' + + def build_krona_result(self, rank=None): + krona_classified, krona_unclassified = None, None + if rank is not None and rank == self.rank: + lin_as_list = self.lineage.display_lineage().split(';') + krona_classification = (self.f_weighted_at_rank, *lin_as_list) + krona_classified = (krona_classification) + # handle unclassified - do we want/need this? + unclassified_fraction= 1.0-self.f_weighted_at_rank + len_unclassified_lin = len(lin_as_list) + unclassifed_lin = ["unclassified"]*(len_unclassified_lin) + krona_unclassified = (unclassified_fraction, *unclassifed_lin) + return krona_classified, krona_unclassified + + @dataclass class QueryTaxResult(): @@ -1672,9 +1711,9 @@ def _init_classification_results(self): self.status = 'nomatch' self.classified_ranks = [] self.classification_result = None - self.krona_classification_result = None - self.krona_unclassified_result = None - self.krona_classification_header = [] + self.krona_classified = None + self.krona_unclassified = None + self.krona_header = [] def is_compatible(self, taxresult): return taxresult.query_info == self.query_info @@ -1759,19 +1798,15 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): continue f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage] bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage] - # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank) # now that I pulled best_only into separate fn, these don't need to be dicts... self.total_f_classified[rank] += f_unique self.total_f_weighted[rank] += f_weighted_at_rank self.total_bp_classified[rank] += bp_intersect_at_rank - - query_ani = containment_to_distance(f_unique, self.query_info.ksize, self.query_info.scaled, - n_unique_kmers=self.query_info.query_n_hashes, - sequence_len_bp=self.query_info.query_bp).ani sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, - bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) + bp_match_at_rank=bp_intersect_at_rank) + sres.set_query_ani(query_info=self.query_info) self.summarized_lineage_results[rank].append(sres) # record unclassified @@ -1791,7 +1826,6 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment if ani_threshold and not 0 <= ani_threshold <= 1: raise ValueError(f"ANI threshold must be between 0 and 1 (input value: {ani_threshold}).") self._init_classification_results() # init some fields - #for sg in self.summarized_lineage_results: if not self.summarized_ranks or force_resummarize: self.summarize_up_ranks(single_rank=rank, force_resummarize=force_resummarize) # catch potential error from running summarize_up_ranks separately and passing in different single_rank @@ -1815,52 +1849,27 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment sorted_sum_uniq_weighted.sort(key = lambda x: -x[1]) # best only this_lineage, f_weighted = sorted_sum_uniq_weighted[0] - if f_weighted > 1: - raise ValueError(f"The tax classification of query '{self.query_name}' is {f_weighted}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif f_weighted == 0: #no annotated results for this query. do we need to handle this differently? - continue - else: - status="below_threshold" f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage] - query_ani = containment_to_distance(f_unique_at_rank, self.query_info.ksize, self.query_info.scaled, - n_unique_kmers=self.query_info.query_n_hashes, - sequence_len_bp=self.query_info.query_bp).ani - # set classification status based on thresholds - if ani_threshold: # if provided, just use ani thresh - if query_ani and query_ani >= ani_threshold: - status = 'match' - else: - status = 'below_threshold' - elif f_weighted >= containment_threshold: - status = 'match' - # determine whether to move on to a higher tax rank (if avail) - if status == 'match': - break + bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] - if this_rank == "superkingdom" and status == "nomatch": - status="below_threshold" + classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, + f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank) - # what happens when we don't have a gather match at all? - bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] + classif.set_status(self.query_info, containment_threshold=containment_threshold, ani_threshold=ani_threshold) + # determine whether to move on to a higher tax rank (if avail) + if classif.status == 'match' or classif.status == "nomatch": # not sure we want/need the `nomatch` part... + break - classif = ClassificationResult(status=status, rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, - f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank, - query_ani_at_rank= query_ani) + # store the final classification result self.classification_result = classif - - if rank is not None: - lin_as_list = this_lineage.display_lineage().split(';') - krona_classification = (f_weighted, *lin_as_list) - self.krona_classification_result = (krona_classification) - # handle unclassified - do we want/need this? - unclassified_fraction= 1.0-f_weighted - len_unclassified_lin = len(lin_as_list) - unclassifed_lin = ["unclassified"]*(len_unclassified_lin) - self.krona_unclassified_result = (unclassified_fraction, *unclassifed_lin) - self.krona_classification_header=self.make_krona_header(min_rank=rank) + # could do this later, in __main__.py, for example + self.krona_classified, self.krona_unclassified = self.classification_result.build_krona_result(rank=rank) + self.krona_header = self.make_krona_header(min_rank = rank) def make_krona_header(self, min_rank): "make header for krona output" + if min_rank is None: + return [] if min_rank not in self.summarized_ranks: raise ValueError(f"Rank '{min_rank}' not present in summarized ranks.") else: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index bde1dbf158..e3021d93ab 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -119,6 +119,40 @@ def test_ascending_taxlist_2(): assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] +def test_QueryInfo_basic(): + "basic functionality of QueryInfo dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + assert qInf.query_name == 'q1' + assert isinstance(qInf.query_n_hashes, int) + assert isinstance(qInf.ksize, int) + assert isinstance(qInf.scaled, int) + assert qInf.total_weighted_hashes == 200 + assert qInf.total_weighted_bp == 2000 + + +def test_QueryInfo_no_hash_info(): + "QueryInfo dataclass for older gather results without query_n_hashes or total_weighted_hashes" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100',ksize=31,scaled=10) + assert qInf.query_name == 'q1' + assert qInf.query_n_hashes == 0 + assert qInf.total_weighted_hashes == 0 + assert qInf.total_weighted_bp == 0 + + +def test_QueryInfo_missing(): + "check that required args" + with pytest.raises(TypeError) as exc: + QueryInfo(query_name='q1', query_filename='f1',query_bp='100',query_n_hashes='10',ksize=31,scaled=10, total_weighted_hashes=200) + print(str(exc)) + assert "missing 1 required positional argument: 'query_md5'" in str(exc) + + +def test_SummarizedGatherResult(): + "basic functionality of SummarizedGatherResult dataclass" + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + + + def test_GatherRow_old_gather(): # gather does not contain query_name column gA = {"name": "gA.1 name"} @@ -1337,11 +1371,17 @@ def test_lineage_db_sql_load(runtmp): def test_LineagePair(): lin = LineagePair(rank="rank1", name='name1') print(lin) + assert lin.rank=="rank1" + assert lin.name =="name1" + assert lin.taxid==None def test_LineagePair_1(): - lin1 = LineagePair(rank="rank1", name='name1', taxid=1) - print(lin1) + lin = LineagePair(rank="rank1", name='name1', taxid=1) + assert lin.rank=="rank1" + assert lin.name =="name1" + assert lin.taxid==1 + print(lin) def test_BaseLineageInfo_init_empty(): @@ -2358,29 +2398,6 @@ def test_build_summarized_result_rank_fail_not_available_resummarize(): assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) -def test_build_classification_result_containment_threshold(): - "basic functionality: build classification result using containment threshold" - taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) - q_res.build_classification_result(containment_threshold=0.1) - print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) - q_res.build_classification_result(containment_threshold=0.4) - print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, - lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4, - bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) - q_res.build_classification_result(containment_threshold=1.0) - print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, - lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4, - bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) - - - def test_build_classification_result_containment_threshold_fail(): "classification result: improper containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) @@ -2396,26 +2413,80 @@ def test_build_classification_result_containment_threshold_fail(): assert "Containment threshold must be between 0 and 1 (input value: -0.1)." in str(exc) + +def test_build_classification_result_containment_threshold(): + "basic functionality: build classification result using containment threshold" + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + + q_res.build_classification_result(containment_threshold=0.1) + print("classif: ", q_res.classification_result) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) + + q_res.build_classification_result(containment_threshold=0.4) + print("classif: ", q_res.classification_result) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'phylum' + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b") + assert q_res.classification_result.f_weighted_at_rank == 0.4 + assert q_res.classification_result.fraction == 0.2 + assert q_res.classification_result.bp_match_at_rank == 40 + assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2) + + q_res.build_classification_result(containment_threshold=1.0) + print("classif: ", q_res.classification_result) + assert q_res.classification_result.status == 'below_threshold' + assert q_res.classification_result.rank == 'superkingdom' + assert q_res.classification_result.fraction == 0.2 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a") + assert q_res.classification_result.f_weighted_at_rank == 0.4 + assert q_res.classification_result.bp_match_at_rank == 40 + assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2) + + def test_build_classification_result_ani_threshold(): "basic functionality: build classification result" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(ani_threshold=.92) print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) - q_res.build_classification_result(ani_threshold=0.94) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) + + q_res.build_classification_result(ani_threshold=0.94) # should classify at phylum print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='match', rank='phylum', fraction=0.2, - lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.4, - bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'phylum' + assert q_res.classification_result.fraction == 0.2 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b") + assert q_res.classification_result.f_weighted_at_rank == 0.4 + assert q_res.classification_result.bp_match_at_rank == 40 + assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2) + + # superk result, but doesn't meet ANI threshold q_res.build_classification_result(ani_threshold=0.96) print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='superkingdom', fraction=0.2, - lineage=RankLineageInfo(lineage_str="a"), f_weighted_at_rank=0.4, - bp_match_at_rank=40, query_ani_at_rank=approx(0.95, rel=1e-2)) + assert q_res.classification_result.status == 'below_threshold' + assert q_res.classification_result.rank == 'superkingdom' + assert q_res.classification_result.fraction == 0.2 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a") + assert q_res.classification_result.f_weighted_at_rank == 0.4 + assert q_res.classification_result.bp_match_at_rank == 40 + assert q_res.classification_result.query_ani_at_rank == approx(0.95, rel=1e-2) def test_build_classification_result_ani_threshold_fail(): @@ -2470,68 +2541,95 @@ def test_build_classification_result_rank_containment_threshold(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='class') print("classif: ", q_res.classification_result) - assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) + q_res.build_classification_result(rank='class', containment_threshold=0.4) - assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) - + assert q_res.classification_result.status == 'below_threshold' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) + + def test_build_classification_result_rank_ani_threshold(): "classification result with rank and ANI threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) + q_res.build_classification_result(rank='class', ani_threshold=0.92) - assert q_res.classification_result == ClassificationResult(status='match', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + assert q_res.classification_result.status == 'match' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) + q_res.build_classification_result(rank='class', ani_threshold=0.95) - assert q_res.classification_result == ClassificationResult(status='below_threshold', rank='class', fraction=0.1, - lineage=RankLineageInfo(lineage_str="a;b;c"), f_weighted_at_rank=0.2, - bp_match_at_rank=20, query_ani_at_rank=approx(0.928, rel=1e-2)) + assert q_res.classification_result.status == 'below_threshold' + assert q_res.classification_result.rank == 'class' + assert q_res.classification_result.fraction == 0.1 + assert q_res.classification_result.lineage == RankLineageInfo(lineage_str="a;b;c") + assert q_res.classification_result.f_weighted_at_rank == 0.2 + assert q_res.classification_result.bp_match_at_rank == 20 + assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) -def test_krona_classification_result(): +def test_krona_classified(): "basic functionality: build classification result using containment threshold" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) q_res.build_classification_result() - assert q_res.krona_classification_result == None + assert q_res.krona_classified == None q_res.build_classification_result(rank='phylum')#, force_resummarize=True) - print(q_res.krona_classification_result) - assert q_res.krona_classification_result == (0.4, 'a', 'b') + print(q_res.krona_classified) + assert q_res.krona_classified == (0.4, 'a', 'b') + assert q_res.krona_unclassified == (0.6, 'unclassified', 'unclassified') q_res.build_classification_result(rank='superkingdom') - print(q_res.krona_classification_result) - assert q_res.krona_classification_result == (0.4, 'a') + print(q_res.krona_classified) + assert q_res.krona_classified == (0.4, 'a') + assert q_res.krona_unclassified == (0.6, 'unclassified') + # make sure this goes back to None if we reclassify without rank q_res.build_classification_result() - assert q_res.krona_classification_result == None + assert q_res.krona_classified == None + assert q_res.krona_unclassified == None + assert q_res.krona_header == [] -def test_make_krona_header_0(): + +def test_make_krona_header_basic(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] phy_header = ["fraction", "superkingdom", "phylum"] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) q_res.build_classification_result(rank='phylum') - print(q_res.krona_classification_result) - print(q_res.krona_classification_header) - assert q_res.krona_classification_header == phy_header + print(q_res.krona_classified) + print(q_res.krona_header) + assert q_res.krona_header == phy_header hd = q_res.make_krona_header('phylum') print("header: ", hd) assert hd == phy_header -def test_make_krona_header_1(): +def test_make_krona_header_basic_1(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] class_header = ["fraction", "superkingdom", "phylum", "class"] q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True) q_res.build_classification_result(rank='class') - assert q_res.krona_classification_header == class_header + assert q_res.krona_header == class_header hd = q_res.make_krona_header(min_rank='class') print("header: ", hd) assert hd == class_header From cbb1e65e71e6868f312e8bceb17117ad06ce70bb Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 13:58:33 -0800 Subject: [PATCH 15/21] human summary dict tests --- src/sourmash/tax/tax_utils.py | 4 +- tests/test_tax_utils.py | 106 +++++++++++++++++----------------- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 1f255e427e..853f66f520 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1595,7 +1595,7 @@ def as_summary_dict(self, query_info=None, limit_float=False): if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini? sD['lineage'] = "unclassified" else: - sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) + sD['lineage'] = self.lineage.display_lineage() # null_as_unclassified=True if query_info is not None: sD['query_name'] = query_info.query_name sD['query_md5'] = query_info.query_md5 @@ -1889,7 +1889,7 @@ def make_human_summary(self, display_rank, classification=False): display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank) for res in display_rank_results: - results.append(res.as_human_friendly_dict()) + results.append(res.as_human_friendly_dict(query_info=self.query_info)) return results def make_full_summary(self, classification=False, limit_float=False): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e3021d93ab..2451a27575 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2647,56 +2647,56 @@ def test_make_krona_header_fail(): assert "Rank 'NotARank' not present in summarized ranks." in str(exc.value) -# def test_make_human_summary(): -# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) -# gather_results = [{}, {"name": 'gB'}] -# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) -# hs = q_res.make_human_summary(display_rank = "superkingdom") -# print(hs) -# assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', -# 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', -# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', -# 'total_weighted_hashes': 0}, -# {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", -# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', -# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] - - -# def test_make_human_summary_2(): -# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) -# gather_results = [{}, {"name": 'gB'}] -# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) -# hs = q_res.make_human_summary(display_rank = "phylum") -# print(hs) -# assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', -# 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', -# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', -# 'total_weighted_hashes': 0}, -# {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', -# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', -# 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] - - -# def test_make_human_summary_classification(): -# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) -# gather_results = [{}, {"name": 'gB'}] -# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom") -# hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) -# print(hs) -# assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', -# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, -# 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', -# 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] - - -# def test_make_human_summary_classification_2(): -# taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) -# gather_results = [{}, {"name": 'gB'}] -# q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum") -# hs = q_res.make_human_summary(display_rank = "phylum", classification=True) -# print(hs) -# assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', -# 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, -# 'query_ani_at_rank': '94.9%', 'status': 'match', -# 'query_name': 'q1', 'query_md5': 'md5', -# 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] +def test_make_human_summary(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + hs = q_res.make_human_summary(display_rank = "superkingdom") + print(hs) + assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', + 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': 0}, + {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +def test_make_human_summary_2(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + hs = q_res.make_human_summary(display_rank = "phylum") + print(hs) + assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', + 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': 0}, + {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +def test_make_human_summary_classification(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="superkingdom") + hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) + print(hs) + assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, + 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + + +def test_make_human_summary_classification_2(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True, classify_rank="phylum") + hs = q_res.make_human_summary(display_rank = "phylum", classification=True) + print(hs) + assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, + 'query_ani_at_rank': '94.9%', 'status': 'match', + 'query_name': 'q1', 'query_md5': 'md5', + 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] From d4c43fdcae653ba8811edc59ba63b989b0f0d638 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 15:40:17 -0800 Subject: [PATCH 16/21] add value checks and tests for SGR,CR classes --- src/sourmash/tax/tax_utils.py | 100 +++++++++-------- tests/test_tax_utils.py | 206 +++++++++++++++++++++++++++++++--- 2 files changed, 248 insertions(+), 58 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 853f66f520..879a486d37 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -37,6 +37,9 @@ # Essential Gather column names that must be in gather_csv to allow `tax` summarization EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') +RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", + "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} + # import lca utils as needed for now from sourmash.lca import lca_utils from sourmash.lca.lca_utils import (taxlist, display_lineage, pop_to_rank) @@ -737,9 +740,6 @@ def write_kreport(summarized_gather, csv_fp, *, sep='\t'): columns = ["percent_containment", "num_bp_contained", "num_bp_assigned", "rank_code", "ncbi_taxid", "sci_name"] w = csv.DictWriter(csv_fp, columns, delimiter=sep) - rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", - "order": "O", "family":"F", "genus": "G", "species": "S"} # , "": "U" - # check - are we using v4.5.0 or later gather CSVs? for rank, rank_results in summarized_gather.items(): for res in rank_results: @@ -748,7 +748,7 @@ def write_kreport(summarized_gather, csv_fp, *, sep='\t'): unclassified_written=False for rank, rank_results in summarized_gather.items(): - rcode = rankCode[rank] + rcode = RANKCODE[rank] for res in rank_results: # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. # Here, we only need to report it once, since it will be the same fraction for all ranks @@ -1585,29 +1585,42 @@ class SummarizedGatherResult(): bp_match_at_rank: int query_ani_at_rank: float = None + def __post_init__(self): + self.check_values() + + def check_values(self): + if any([self.fraction > 1, self.f_weighted_at_rank > 1]): + raise ValueError(f"Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query.") + # is this true for weighted too, or is that set to 0 when --ignore-abundance is used? + if any([self.fraction <=0, self.f_weighted_at_rank <= 0]): # this shouldn't actually happen, but it breaks ANI estimation, so let's check for it. + raise ValueError(f"Summarized fraction is <=0% of the query! This should not occur.") + def set_query_ani(self, query_info): self.query_ani_at_rank = containment_to_distance(self.fraction, query_info.ksize, query_info.scaled, n_unique_kmers=query_info.query_n_hashes, sequence_len_bp=query_info.query_bp).ani - def as_summary_dict(self, query_info=None, limit_float=False): + def as_summary_dict(self, query_info, limit_float=False): sD = asdict(self) if sD['lineage'] == (): # get rid of my by using blank RankLineageInfo() instead of () as empty lini? sD['lineage'] = "unclassified" else: sD['lineage'] = self.lineage.display_lineage() # null_as_unclassified=True - if query_info is not None: - sD['query_name'] = query_info.query_name - sD['query_md5'] = query_info.query_md5 - sD['query_filename'] = query_info.query_filename - sD['total_weighted_hashes'] = query_info.total_weighted_hashes + sD['query_name'] = query_info.query_name + sD['query_md5'] = query_info.query_md5 + sD['query_filename'] = query_info.query_filename + sD['total_weighted_hashes'] = str(query_info.total_weighted_hashes) + sD['bp_match_at_rank'] = str(self.bp_match_at_rank) if limit_float: sD['fraction'] = f'{self.fraction:.3f}' sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}' + else: + sD['fraction'] = str(self. fraction) + sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank) return(sD) - def as_human_friendly_dict(self, query_info = None): + def as_human_friendly_dict(self, query_info): sD = self.as_summary_dict(query_info=query_info, limit_float=True) sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%" if sD['query_ani_at_rank'] is not None: @@ -1616,25 +1629,26 @@ def as_human_friendly_dict(self, query_info = None): sD['query_ani_at_rank'] = '- ' return sD - def as_kreport_dict(self, rankCode, total_weighted_hashes): - sD = asdict(self) - this_rank = self.lineage.lowest_rank - this_sciname = self.lineage.lowest_lineage_name - sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid - sD['sci_name'] = this_sciname - #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) - sD['rank_code'] = rankCode[this_rank] - sD['num_bp_assigned'] = 0 - # total percent containment, weighted to include abundance info - sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - sD["num_bp_contained"] = int(self.f_weighted_at_rank * total_weighted_hashes) - # the number of bp actually 'assigned' at this rank. Sourmash assigns everything - # at genome level, but since kreport traditionally doesn't include 'strain' or genome, - # it is reasonable to state that sourmash assigns at 'species' level for this. - # can be modified later. - lowest_assignment_rank = 'species' - if this_rank == lowest_assignment_rank or this_sciname == "unclassified": - sD["num_bp_assigned"] = sD["num_bp_contained"] + def as_kreport_dict(self, query_info): + sD = {} + this_rank = self.lineage.lowest_rank + this_sciname = self.lineage.lowest_lineage_name + sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid + sD['sci_name'] = this_sciname + #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) + sD['rank_code'] = RANKCODE[this_rank] + sD['num_bp_assigned'] = str(0) + # total percent containment, weighted to include abundance info + sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' + sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_hashes)) + # the number of bp actually 'assigned' at this rank. Sourmash assigns everything + # at genome level, but since kreport traditionally doesn't include 'strain' or genome, + # it is reasonable to state that sourmash assigns at 'species' level for this. + # can be modified later. + lowest_assignment_rank = 'species' + if this_rank == lowest_assignment_rank or this_sciname == "unclassified": + sD["num_bp_assigned"] = sD["num_bp_contained"] + return sD @dataclass class ClassificationResult(SummarizedGatherResult): @@ -1642,23 +1656,20 @@ class ClassificationResult(SummarizedGatherResult): status: str = field(init=False) def __post_init__(self): - self.status = None + # check for out of bounds values, default "nomatch" if no match at all + self.check_values() + self.status = 'nomatch' #None? def set_status(self, query_info, containment_threshold=None, ani_threshold=None): - # check for out of bounds values, set "nomatch" if no match at all - if self.f_weighted_at_rank > 1: - raise ValueError(f"The tax classification of query '{query_info.query_name}' is {self.f_weighted_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif self.f_weighted_at_rank == 0: #no annotated results for this query. This also shouldn't happen, right? - self.status="nomatch" - else: - # if any matches, use 'below_threshold' as default; set 'match' if meets threshold + # if any matches, use 'below_threshold' as default; set 'match' if meets threshold + if any([containment_threshold, ani_threshold]): self.status="below_threshold" - self.set_query_ani(query_info=query_info) - if ani_threshold: # if provided, just use ani thresh, don't use containment threshold - if self.query_ani_at_rank >= ani_threshold: - self.status = 'match' - elif containment_threshold and self.f_weighted_at_rank >= containment_threshold: - self.status = 'match' + self.set_query_ani(query_info=query_info) + if ani_threshold: # if provided, just use ani thresh, don't use containment threshold + if self.query_ani_at_rank >= ani_threshold: + self.status = 'match' + elif containment_threshold and self.f_weighted_at_rank >= containment_threshold: + self.status = 'match' def build_krona_result(self, rank=None): krona_classified, krona_unclassified = None, None @@ -1672,7 +1683,6 @@ def build_krona_result(self, rank=None): unclassifed_lin = ["unclassified"]*(len_unclassified_lin) krona_unclassified = (unclassified_fraction, *unclassifed_lin) return krona_classified, krona_unclassified - @dataclass diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 2451a27575..4852d653b5 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -149,8 +149,185 @@ def test_QueryInfo_missing(): def test_SummarizedGatherResult(): "basic functionality of SummarizedGatherResult dataclass" - sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + print(sgr) + assert sgr.rank=='phylum' + sumD = sgr.as_summary_dict(query_info=qInf) + print(sumD) + assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3", + 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + hD = sgr.as_human_friendly_dict(query_info=qInf) + print(hD) + assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%', + 'bp_match_at_rank': "30", 'query_ani_at_rank': '- ', 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + krD = sgr.as_kreport_dict(query_info=qInf) + print(krD) + assert krD == {'ncbi_taxid': None, 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0", + 'percent_containment': '30.00', 'num_bp_contained': "60"} + + +def test_SummarizedGatherResult_set_query_ani(): + "Check ANI estimation within SummarizedGatherResult dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + sgr.set_query_ani(query_info=qInf) + print(sgr.query_ani_at_rank) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + # ANI can be calculated with query_bp OR query_n_hashes. Remove each and check the results are identical + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes=0,ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + sgr.set_query_ani(query_info=qInf) + print(sgr.query_ani_at_rank) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + # try without query_bp + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp=0, + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + sgr.set_query_ani(query_info=qInf) + print(sgr.query_ani_at_rank) + assert sgr.query_ani_at_rank == approx(0.949, rel=1e-3) + + +def test_SummarizedGatherResult_greater_than_1(): + "basic functionality of SummarizedGatherResult dataclass" + # fraction > 1 + with pytest.raises(ValueError) as exc: + SummarizedGatherResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=1.2, bp_match_at_rank=30) + print(str(exc)) + assert "> 100% of the query!" in str(exc) + # f_weighted > 1 + with pytest.raises(ValueError) as exc: + SummarizedGatherResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + print(str(exc)) + assert "> 100% of the query!" in str(exc) + +def test_SummarizedGatherResult_0_fraction(): + with pytest.raises(ValueError) as exc: + SummarizedGatherResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + err_msg = "Summarized fraction is <=0% of the query! This should not occur." + assert err_msg in str(exc) + #assert cr.status == 'nomatch' + + with pytest.raises(ValueError) as exc: + SummarizedGatherResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0, bp_match_at_rank=30) + print(str(exc)) + assert err_msg in str(exc) + + +def test_SummarizedGatherResult_species_kreport(): + "basic functionality of SummarizedGatherResult dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="species", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b;c;d;e;f;g"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + print(sgr) + assert sgr.rank=='species' + krD = sgr.as_kreport_dict(query_info=qInf) + print(krD) + assert krD == {'ncbi_taxid': None, 'sci_name': 'g', 'rank_code': 'S', 'num_bp_assigned': "60", + 'percent_containment': '30.00', 'num_bp_contained': "60"} + + +def test_SummarizedGatherResult_summary_dict_limit_float(): + "basic functionality of SummarizedGatherResult dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.123456, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.345678, bp_match_at_rank=30) + print(sgr) + assert sgr.rank=='phylum' + sumD = sgr.as_summary_dict(query_info=qInf) + print(sumD) + assert sumD == {'rank': 'phylum', 'fraction': "0.123456", 'lineage': 'a;b', 'f_weighted_at_rank': "0.345678", + 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + + sumD = sgr.as_summary_dict(query_info=qInf, limit_float=True) + print(sumD) + assert sumD == {'rank': 'phylum', 'fraction': "0.123", 'lineage': 'a;b', 'f_weighted_at_rank': "0.346", + 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + + +def test_ClassificationResult(): + "basic functionality of ClassificationResult dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + cr.set_status(query_info=qInf, containment_threshold=0.1) + assert cr.status == 'match' + print(cr.query_ani_at_rank) + assert cr.query_ani_at_rank == approx(0.949, rel=1e-3) + cr.set_status(query_info=qInf, containment_threshold=0.35) + assert cr.status == 'below_threshold' + + +def test_ClassificationResult_greater_than_1(): + "basic functionality of SummarizedGatherResult dataclass" + # fraction > 1 + with pytest.raises(ValueError) as exc: + ClassificationResult(rank="phylum", fraction=0.3, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=1.2, bp_match_at_rank=30) + print(str(exc)) + assert "> 100% of the query!" in str(exc) + # f_weighted > 1 + with pytest.raises(ValueError) as exc: + ClassificationResult(rank="phylum", fraction=1.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + print(str(exc)) + assert "> 100% of the query!" in str(exc) + + +def test_ClassificationResult_0_fraction(): + with pytest.raises(ValueError) as exc: + ClassificationResult(rank="phylum", fraction=-.1, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + err_msg = "Summarized fraction is <=0% of the query! This should not occur." + assert err_msg in str(exc) + #assert cr.status == 'nomatch' + + with pytest.raises(ValueError) as exc: + ClassificationResult(rank="phylum", fraction=.1, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0, bp_match_at_rank=30) + print(str(exc)) + assert err_msg in str(exc) + + +def test_ClassificationResult_build_krona_result(): + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + #cr.set_status(query_info=qInf, rank='phylum') + kr, ukr = cr.build_krona_result(rank='phylum') + print(kr) + assert kr == (0.3, 'a', 'b') + print(ukr) + assert ukr == (0.7, 'unclassified', 'unclassified') + + +def test_ClassificationResult_build_krona_result_no_rank(): + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + cr = ClassificationResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage_str="a;b"), + f_weighted_at_rank=0.3, bp_match_at_rank=30, query_ani_at_rank=0.97) + cr.set_status(query_info=qInf, containment_threshold=0.1) def test_GatherRow_old_gather(): @@ -1902,6 +2079,9 @@ def test_QueryTaxResult(): assert q_res.skipped_idents == set() assert q_res.missed_idents == set() assert q_res.summarized_lineage_results == {} + taxranks = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + assert q_res.ranks == taxranks + assert q_res.ascending_ranks == taxranks[::-1] def test_QueryTaxResult_add_incompatible(): @@ -2654,12 +2834,12 @@ def test_make_human_summary(): hs = q_res.make_human_summary(display_rank = "superkingdom") print(hs) assert hs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', + 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': 0}, + 'total_weighted_hashes': "0"}, {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': "a", - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] def test_make_human_summary_2(): @@ -2669,12 +2849,12 @@ def test_make_human_summary_2(): hs = q_res.make_human_summary(display_rank = "phylum") print(hs) assert hs == [{'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', - 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': 60, 'query_ani_at_rank': '- ', + 'f_weighted_at_rank': '60.0%', 'bp_match_at_rank': "60", 'query_ani_at_rank': '- ', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', - 'total_weighted_hashes': 0}, + 'total_weighted_hashes': "0"}, {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, 'query_ani_at_rank': '94.9%', - 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] def test_make_human_summary_classification(): @@ -2684,9 +2864,9 @@ def test_make_human_summary_classification(): hs = q_res.make_human_summary(display_rank = "superkingdom", classification=True) print(hs) assert hs == [{'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', - 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] def test_make_human_summary_classification_2(): @@ -2696,7 +2876,7 @@ def test_make_human_summary_classification_2(): hs = q_res.make_human_summary(display_rank = "phylum", classification=True) print(hs) assert hs == [{'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', - 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': 40, + 'f_weighted_at_rank': '40.0%', 'bp_match_at_rank': "40", 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', - 'query_filename': 'query_fn', 'total_weighted_hashes': 0}] + 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] From 724bf74157e90e49f80fcc7778ea86269c06c94c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 16:20:11 -0800 Subject: [PATCH 17/21] test make_full_summary --- src/sourmash/tax/tax_utils.py | 45 ++++++++------ tests/test_tax_utils.py | 114 +++++++++++++++++++++++++++++++++- 2 files changed, 139 insertions(+), 20 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 879a486d37..f15b954db8 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1614,8 +1614,10 @@ def as_summary_dict(self, query_info, limit_float=False): if limit_float: sD['fraction'] = f'{self.fraction:.3f}' sD['f_weighted_at_rank'] = f'{self.f_weighted_at_rank:.3f}' + if self.query_ani_at_rank: + sD['query_ani_at_rank'] = f'{self.query_ani_at_rank:.3f}'#f"{self.query_ani_at_rank*100:>3.1f}%" else: - sD['fraction'] = str(self. fraction) + sD['fraction'] = str(self.fraction) sD['f_weighted_at_rank'] = str(self.f_weighted_at_rank) return(sD) @@ -1623,7 +1625,7 @@ def as_summary_dict(self, query_info, limit_float=False): def as_human_friendly_dict(self, query_info): sD = self.as_summary_dict(query_info=query_info, limit_float=True) sD['f_weighted_at_rank'] = f"{self.f_weighted_at_rank*100:>4.1f}%" - if sD['query_ani_at_rank'] is not None: + if self.query_ani_at_rank is not None: sD['query_ani_at_rank'] = f"{self.query_ani_at_rank*100:>3.1f}%" else: sD['query_ani_at_rank'] = '- ' @@ -1694,6 +1696,7 @@ def __post_init__(self): self.query_name = self.query_info.query_name # for convenience self._init_taxresult_vars() self._init_summarization_vars() + self._init_classification_results() def _init_taxresult_vars(self): self.ranks = [] @@ -1801,24 +1804,23 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) for lineage, f_unique in sorted_sum_uniq_to_query: - query_ani = None - if f_unique > 1: - raise ValueError(f"The tax summary of query '{self.query_name}' is {f_unique}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif f_unique == 0: #no annotated results for this query. do we need to handle this differently now? + # does this ever happen? do we need it? + if f_unique == 0: #no annotated results for this query. do we need to handle this differently now? continue f_weighted_at_rank = self.sum_uniq_weighted[rank][lineage] bp_intersect_at_rank = self.sum_uniq_bp[rank][lineage] - # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank) - # now that I pulled best_only into separate fn, these don't need to be dicts... - self.total_f_classified[rank] += f_unique - self.total_f_weighted[rank] += f_weighted_at_rank - self.total_bp_classified[rank] += bp_intersect_at_rank sres = SummarizedGatherResult(lineage=lineage, rank=rank, f_weighted_at_rank=f_weighted_at_rank, fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank) sres.set_query_ani(query_info=self.query_info) self.summarized_lineage_results[rank].append(sres) + # NTP Note: These change by rank ONLY when doing best_only (selecting top hit at that particular rank) + # now that I pulled best_only into separate fn, these don't need to be dicts... + self.total_f_classified[rank] += f_unique + self.total_f_weighted[rank] += f_weighted_at_rank + self.total_bp_classified[rank] += bp_intersect_at_rank + # record unclassified lineage = () query_ani = None @@ -1886,15 +1888,21 @@ def make_krona_header(self, min_rank): rank_index = self.ranks.index(min_rank) return ["fraction"] + list(self.ranks[:rank_index+1]) + def check_classification(self): + if not self.classification_result: + raise ValueError("query not classified yet.") + + def check_summarization(self): + if not self.summarized_lineage_results: + raise ValueError("lineages not summarized yet.") + def make_human_summary(self, display_rank, classification=False): results = [] if classification: - if not self.classification_result: - raise ValueError("query not classified yet.") + self.check_classification() display_rank_results = [self.classification_result] else: - if not self.summarized_lineage_results: - raise ValueError("lineages not summarized yet.") + self.check_summarization() display_rank_results = self.summarized_lineage_results[display_rank] display_rank_results.sort(key=lambda res: -res.f_weighted_at_rank) @@ -1905,19 +1913,17 @@ def make_human_summary(self, display_rank, classification=False): def make_full_summary(self, classification=False, limit_float=False): results = [] if classification: + self.check_classification() header= ["query_name", "status", "rank", "fraction", "lineage", "query_md5", "query_filename", "f_weighted_at_rank", "bp_match_at_rank", "query_ani_at_rank"] - if not self.classification_result: - raise ValueError("query not classified yet.") rD = self.classification_result.as_summary_dict(query_info = self.query_info, limit_float=limit_float) results.append(rD) else: + self.check_summarization() header= ["query_name", "rank", "fraction", "lineage", "query_md5", "query_filename", "f_weighted_at_rank", "bp_match_at_rank", "query_ani_at_rank", "total_weighted_hashes"] - if not self.summarized_lineage_results: - raise ValueError("lineages not summarized yet.") for rank in self.summarized_ranks[::-1]: rank_results = self.summarized_lineage_results[rank] @@ -1927,6 +1933,7 @@ def make_full_summary(self, classification=False, limit_float=False): return header, results def make_kreport_results(self): + self.check_summarization() rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} if self.query_info.total_weighted_hashes == 0: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 4852d653b5..5a73060604 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2563,7 +2563,7 @@ def test_QueryTaxResult_build_summarized_result_over100percent(): with pytest.raises(ValueError) as exc: q_res.build_summarized_result() print(str(exc)) - assert "The tax summary of query 'q1' is 1.05, which is > 100% of the query!! This should not be possible." in str(exc) + assert "Summarized fraction is > 100% of the query! This should not be possible" in str(exc) def test_build_summarized_result_rank_fail_not_available_resummarize(): @@ -2694,6 +2694,7 @@ def test_build_classification_result_rank_fail_not_filled(): print(str(exc)) assert "Error: rank 'order' was not available for any matching lineages." in str(exc) + def test_build_classification_result_rank_fail_not_available_resummarize(): "classification result: rank not available (wasn't summarized)" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) @@ -2705,6 +2706,7 @@ def test_build_classification_result_rank_fail_not_available_resummarize(): print(str(exc)) assert "Error: rank 'order' not in summarized rank(s), superkingdom" in str(exc) + def test_build_classification_result_rank_fail_not_available(): "classification result: rank not available" taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) @@ -2880,3 +2882,113 @@ def test_make_human_summary_classification_2(): 'query_ani_at_rank': '94.9%', 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': "0"}] + + +def test_make_full_summary(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + header, fs = q_res.make_full_summary() + assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', + 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + print(fs) + assert fs == [{'rank': 'superkingdom', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': + '0.6', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': '0'}, + {'rank': 'superkingdom', 'fraction': '0.2', 'lineage': 'a', 'f_weighted_at_rank': '0.4', + 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'phylum', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', + 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', + 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'phylum', 'fraction': '0.2', 'lineage': 'a;b', 'f_weighted_at_rank': '0.4', + 'bp_match_at_rank': '40', 'query_ani_at_rank': approx(0.949,rel=1e-3), 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.8', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.6', + 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', + 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', + 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;d','f_weighted_at_rank': '0.2', + 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] + + header, fs = q_res.make_full_summary(limit_float=True) + assert header == ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', + 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank', 'total_weighted_hashes'] + print(fs) + assert fs == [{'rank': 'superkingdom', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': + '0.600', 'bp_match_at_rank': '60', 'query_ani_at_rank': None, + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': '0'}, + {'rank': 'superkingdom', 'fraction': '0.200', 'lineage': 'a', 'f_weighted_at_rank': '0.400', + 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'phylum', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', + 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', + 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '0.400', + 'bp_match_at_rank': '40', 'query_ani_at_rank': "0.949", 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.800', 'lineage': 'unclassified', 'f_weighted_at_rank': '0.600', + 'bp_match_at_rank': '60', 'query_ani_at_rank': None, 'query_name': 'q1', 'query_md5': 'md5', + 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', + 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", + 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}, + {'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;d','f_weighted_at_rank': '0.200', + 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'query_fn', 'total_weighted_hashes': '0'}] + + +def test_make_full_summary_summarization_fail(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + with pytest.raises(ValueError) as exc: + q_res.make_full_summary() + print(str(exc)) + assert 'not summarized yet' in str(exc) + + +def test_make_full_summary_classification(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + header, fs = q_res.make_full_summary(classification=True) + assert header == ["query_name", "status", "rank", "fraction", "lineage", + "query_md5", "query_filename", "f_weighted_at_rank", + "bp_match_at_rank", "query_ani_at_rank"] + print(fs) + assert fs == [{'rank': 'class', 'fraction': '0.1', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.2', + 'bp_match_at_rank': '20', 'query_ani_at_rank': approx(0.928, rel=1e-3), + 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': '0'}] + + +def test_make_full_summary_classification_limit_float(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, classify=True) + header, fs = q_res.make_full_summary(classification=True, limit_float=True) + assert header == ["query_name", "status", "rank", "fraction", "lineage", + "query_md5", "query_filename", "f_weighted_at_rank", + "bp_match_at_rank", "query_ani_at_rank"] + print(fs) + assert fs == [{'rank': 'class', 'fraction': '0.100', 'lineage': 'a;b;c', 'f_weighted_at_rank': '0.200', + 'bp_match_at_rank': '20', 'query_ani_at_rank': "0.928", + 'status': 'match', 'query_name': 'q1', 'query_md5': 'md5', 'query_filename': 'query_fn', + 'total_weighted_hashes': '0'}] + + +def test_make_full_summary_classification_fail(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + with pytest.raises(ValueError) as exc: + q_res.make_full_summary(classification=True) + print(str(exc)) + assert 'not classified yet' in str(exc) + From 9dd9adffb0c5357703f57515e8204066561b9c38 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 16:28:48 -0800 Subject: [PATCH 18/21] use f_unique, not f_weighted to preserve current functionality --- src/sourmash/tax/tax_utils.py | 19 ++++++++++--------- tests/test_tax_utils.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index f15b954db8..2e33c1cd68 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1670,8 +1670,10 @@ def set_status(self, query_info, containment_threshold=None, ani_threshold=None) if ani_threshold: # if provided, just use ani thresh, don't use containment threshold if self.query_ani_at_rank >= ani_threshold: self.status = 'match' - elif containment_threshold and self.f_weighted_at_rank >= containment_threshold: - self.status = 'match' + # should we switch to using weighted here? I think yes, but this would be behavior change + elif containment_threshold and self.fraction >= containment_threshold: + #elif containment_threshold and self.f_weighted_at_rank >= containment_threshold: + self.status = 'match' def build_krona_result(self, rank=None): krona_classified, krona_unclassified = None, None @@ -1854,15 +1856,14 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment f_weighted=0.0 f_unique_at_rank=0.0 bp_intersect_at_rank=0 - #sum_uniq_to_query = self.sum_uniq_to_query[this_rank] - sum_uniq_weighted = self.sum_uniq_weighted[this_rank] ##SHOULD WE BE USING WEIGHTED HERE? I THINK YES, but this is a change from before. + sum_uniq_to_query = self.sum_uniq_to_query[this_rank] # sort the results and grab best - sorted_sum_uniq_weighted = list(sum_uniq_weighted.items()) - sorted_sum_uniq_weighted.sort(key = lambda x: -x[1]) - # best only - this_lineage, f_weighted = sorted_sum_uniq_weighted[0] - f_unique_at_rank = self.sum_uniq_to_query[this_rank][this_lineage] + sorted_sum_uniq_to_query = list(sum_uniq_to_query.items()) + sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) + # select best-at-rank only + this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0] bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] + f_weighted = self.sum_uniq_weighted[this_rank][this_lineage] classif = ClassificationResult(rank=this_rank, fraction=f_unique_at_rank, lineage=this_lineage, f_weighted_at_rank=f_weighted, bp_match_at_rank=bp_intersect_at_rank) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 5a73060604..e1894a7ff5 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2610,7 +2610,7 @@ def test_build_classification_result_containment_threshold(): assert q_res.classification_result.bp_match_at_rank == 20 assert q_res.classification_result.query_ani_at_rank == approx(0.928, rel=1e-2) - q_res.build_classification_result(containment_threshold=0.4) + q_res.build_classification_result(containment_threshold=0.2) print("classif: ", q_res.classification_result) assert q_res.classification_result.status == 'match' assert q_res.classification_result.rank == 'phylum' From c33c1374418f46ab3b499a4bcd2129679666f01e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 16:31:02 -0800 Subject: [PATCH 19/21] test no ranks --- tests/test_tax_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e1894a7ff5..e99625c23d 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2069,6 +2069,8 @@ def test_QueryTaxResult(): taxres = make_TaxResult(taxD=taxD) # initialize q_res = QueryTaxResult(taxres.query_info) + assert q_res.ranks == [] + assert q_res.ascending_ranks == [] q_res.add_taxresult(taxres) # check that new querytaxres is compatible with taxres assert q_res.is_compatible(taxres) From 15031ef059274cf2c24c00d0997b02b8aa7b249f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 9 Jan 2023 17:07:31 -0800 Subject: [PATCH 20/21] test make_kreport_results --- src/sourmash/tax/tax_utils.py | 40 ++++++++++++++------------ tests/test_tax_utils.py | 54 +++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 18 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 2e33c1cd68..4673803f79 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1632,24 +1632,29 @@ def as_human_friendly_dict(self, query_info): return sD def as_kreport_dict(self, query_info): + lowest_assignment_rank = 'species' sD = {} - this_rank = self.lineage.lowest_rank - this_sciname = self.lineage.lowest_lineage_name - sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid - sD['sci_name'] = this_sciname - #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) - sD['rank_code'] = RANKCODE[this_rank] sD['num_bp_assigned'] = str(0) # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_hashes)) - # the number of bp actually 'assigned' at this rank. Sourmash assigns everything - # at genome level, but since kreport traditionally doesn't include 'strain' or genome, - # it is reasonable to state that sourmash assigns at 'species' level for this. - # can be modified later. - lowest_assignment_rank = 'species' - if this_rank == lowest_assignment_rank or this_sciname == "unclassified": - sD["num_bp_assigned"] = sD["num_bp_contained"] + # could make this cleaner if used empty RankLineageInfo() + #sD['lineage'] = self.lineage.display_lineage(null_as_unclassified=True) + if self.lineage != (): + this_rank = self.lineage.lowest_rank + sD['rank_code'] = RANKCODE[this_rank] + sD['sci_name'] = self.lineage.lowest_lineage_name + sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid + # the number of bp actually 'assigned' at this rank. Sourmash assigns everything + # at genome level, but since kreport traditionally doesn't include 'strain' or genome, + # it is reasonable to state that sourmash assigns at 'species' level for this. + # can be modified later. + if this_rank == lowest_assignment_rank: + sD["num_bp_assigned"] = sD["num_bp_contained"] + else: + sD['sci_name'] = 'unclassified' + sD['rank_code'] = RANKCODE['unclassified'] + sD["num_bp_assigned"] = sD["num_bp_contained"] return sD @dataclass @@ -1935,12 +1940,11 @@ def make_full_summary(self, classification=False, limit_float=False): def make_kreport_results(self): self.check_summarization() - rankCode = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", - "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0") - required_ranks = set(rankCode.keys).pop('unclassified') - if not set(required_ranks).issubset(set(self.ranks)): + required_ranks = set(RANKCODE.keys()) + acceptable_ranks = list(self.ranks) + ['unclassified', 'kingdom'] + if not required_ranks.issubset(set(acceptable_ranks)): raise ValueError("ERROR: cannot produce 'kreport' format from ranks {', '.join(self.ranks)}") kreport_results = [] unclassified_recorded=False @@ -1950,7 +1954,7 @@ def make_kreport_results(self): continue rank_results = self.summarized_lineage_results[rank] for res in rank_results: - kresD = res.as_kreport_dict(rankCode, self.query_info.total_weighted_hashes) + kresD = res.as_kreport_dict(self.query_info) if kresD['sci_name'] == "unclassified": # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. # Here, we only need to report it once, since it will be the same fraction for all ranks diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e99625c23d..55f2543e9d 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2994,3 +2994,57 @@ def test_make_full_summary_classification_fail(): print(str(exc)) assert 'not classified yet' in str(exc) + +def test_make_kreport_results(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;c;d;e;f;g")]) + #need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + krepD = q_res.make_kreport_results() + print(krepD) + assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': None}, + {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', + 'sci_name': 'unclassified', 'rank_code': 'U'}, + {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': None}, + {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': None}, + {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': None}, + {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': None}, + {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': None}, + {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': None}] + + +def test_make_kreport_results_fail(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=False) + with pytest.raises(ValueError) as exc: + q_res.make_kreport_results() + print(str(exc)) + assert 'not summarized yet' in str(exc) + + +def test_make_kreport_results_fail_pre_v450(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + with pytest.raises(ValueError) as exc: + q_res.make_kreport_results() + print(str(exc)) + assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) + + +def test_make_kreport_results_fail_pre_v450(): + taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + with pytest.raises(ValueError) as exc: + q_res.make_kreport_results() + print(str(exc)) + assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) From 826bc6b859109634f05e9b1ba24cd301912085f2 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 10 Jan 2023 16:46:02 -0800 Subject: [PATCH 21/21] fix lineagepair --- tests/test_tax_utils.py | 116 ++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 58 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 181c93623b..832395252f 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -691,7 +691,7 @@ def test_summarize_gather_at_0(): assert sk_sum[0].query_md5 == "queryA_md5" assert sk_sum[0].query_filename == "queryA.sig" assert sk_sum[0].rank == 'superkingdom' - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 1.0 assert sk_sum[0].f_weighted_at_rank == 1.0 assert sk_sum[0].bp_match_at_rank == 100 @@ -704,7 +704,7 @@ def test_summarize_gather_at_0(): assert phy_sum[0].query_md5 == "queryA_md5" assert phy_sum[0].query_filename == "queryA.sig" assert phy_sum[0].rank == 'phylum' - assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 1.0 assert phy_sum[0].f_weighted_at_rank == 1.0 assert phy_sum[0].bp_match_at_rank == 100 @@ -716,16 +716,16 @@ def test_summarize_gather_at_0(): assert cl_sum[0].query_md5 == "queryA_md5" assert cl_sum[0].query_filename == "queryA.sig" assert cl_sum[0].rank == 'class' - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 50 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='d')) + assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='d')) assert cl_sum[1].fraction == 0.5 assert cl_sum[1].f_weighted_at_rank == 0.5 assert cl_sum[1].bp_match_at_rank == 50 @@ -750,7 +750,7 @@ def test_summarize_gather_at_1(): # superkingdom assert len(sk_sum) == 2 print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 print("superkingdom summarized gather 1: ", sk_sum[1]) @@ -765,7 +765,7 @@ def test_summarize_gather_at_1(): print("phylum summarized gather 0: ", phy_sum[0]) print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].f_weighted_at_rank == 0.6 assert phy_sum[0].bp_match_at_rank == 70 @@ -778,18 +778,18 @@ def test_summarize_gather_at_1(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 60 assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='d')) + assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='d')) assert cl_sum[1].fraction == 0.1 assert cl_sum[1].f_weighted_at_rank == 0.1 assert cl_sum[1].bp_match_at_rank == 10 @@ -816,7 +816,7 @@ def test_summarize_gather_at_perfect_match(): # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 1.0 @@ -846,15 +846,15 @@ def test_summarize_gather_at_over100percent_f_unique_to_query(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='d')) + assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='d')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 assert cl_sum[1].rank == 'class' - assert cl_sum[1].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[1].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='c')) assert cl_sum[1].fraction == 0.5 assert cl_sum[1].bp_match_at_rank == 50 @@ -875,7 +875,7 @@ def test_summarize_gather_at_missing_ignore(): # superkingdom assert len(sk_sum) == 2 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.5 assert sk_sum[0].bp_match_at_rank == 50 assert sk_sum[1].lineage == () @@ -886,7 +886,7 @@ def test_summarize_gather_at_missing_ignore(): phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 2 - assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) assert phy_sum[0].fraction == 0.5 assert phy_sum[0].bp_match_at_rank == 50 assert phy_sum[1].lineage == () @@ -896,9 +896,9 @@ def test_summarize_gather_at_missing_ignore(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) assert len(cl_sum) == 2 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].bp_match_at_rank == 50 assert cl_sum[1].lineage == () @@ -941,7 +941,7 @@ def test_summarize_gather_at_best_only_0(): # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 print("superk ANI:",sk_sum[0].query_ani_at_rank) @@ -951,7 +951,7 @@ def test_summarize_gather_at_best_only_0(): phy_sum, _, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 - assert phy_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),lca_utils.LineagePair(rank='phylum', name='b')) + assert phy_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),LineagePair (rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].bp_match_at_rank == 70 print("phy ANI:",phy_sum[0].query_ani_at_rank) @@ -960,9 +960,9 @@ def test_summarize_gather_at_best_only_0(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 print("cl ANI:",cl_sum[0].query_ani_at_rank) @@ -986,9 +986,9 @@ def test_summarize_gather_at_best_only_equal_choose_first(): cl_sum, _, _ = summarize_gather_at("class", taxD, g_res, best_only=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) - assert cl_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b'), - lca_utils.LineagePair(rank='class', name='c')) + assert cl_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b'), + LineagePair (rank='class', name='c')) assert cl_sum[0].fraction == 0.5 assert cl_sum[0].bp_match_at_rank == 50 @@ -999,14 +999,14 @@ def test_write_summary_csv(runtmp): sum_gather = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), + lineage=(LineagePair (rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b')), + lineage=(LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} @@ -1024,8 +1024,8 @@ def test_write_summary_csv(runtmp): def test_write_classification(runtmp): """test classification csv write function""" classif = ClassInf('queryA', 'match', 'phylum', 1.0, - (lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b')), + (LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b')), 'queryA_md5', 'queryA.sig', 1.0, 100, query_ani_at_rank=None) @@ -1083,7 +1083,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): print("superkingdom summarized gather results:", sk_sum) assert len(sk_sum) ==4 assert sk_sum[0].query_name == "queryA" - assert sk_sum[0].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[0].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.9 assert sk_sum[0].bp_match_at_rank == 160 # check for unassigned for queryA @@ -1093,7 +1093,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert round(sk_sum[1].fraction,1) == 0.1 # queryB assert sk_sum[2].query_name == "queryB" - assert sk_sum[2].lineage == (lca_utils.LineagePair(rank='superkingdom', name='a'),) + assert sk_sum[2].lineage == (LineagePair (rank='superkingdom', name='a'),) assert sk_sum[2].fraction == 0.3 assert sk_sum[2].bp_match_at_rank == 60 # check for unassigned for queryA @@ -1103,7 +1103,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert sk_sum[3].bp_match_at_rank == 140 sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True) print("superkingdom lineage summary:", sk_lin_sum, '\n') - assert sk_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}, + assert sk_lin_sum == {(LineagePair (rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}, (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} assert num_queries == 2 assert query_names == ['queryA', 'queryB'] @@ -1112,8 +1112,8 @@ def test_aggregate_by_lineage_at_rank_by_query(): print("phylum summary:", phy_sum, ']\n') phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') - assert phy_lin_sum == {(lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='b')): {'queryA': 0.5}, - (lca_utils.LineagePair(rank='superkingdom', name='a'), lca_utils.LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}, + assert phy_lin_sum == {(LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='b')): {'queryA': 0.5}, + (LineagePair (rank='superkingdom', name='a'), LineagePair (rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}, (): {'queryA': 0.09999999999999998, 'queryB': 0.7}} assert num_queries == 2 assert query_names == ['queryA', 'queryB'] @@ -1223,27 +1223,27 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), + lineage=(LineagePair (rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b')), + lineage=(LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), + lineage=(LineagePair (rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='c')), + lineage=(LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='c')), query_ani_at_rank=None, total_weighted_hashes=0)]} @@ -1318,27 +1318,27 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): sum_gather1 = {'superkingdom': [SumGathInf(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), + lineage=(LineagePair (rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SumGathInf(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='b')), + lineage=(LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='b')), query_ani_at_rank=None, total_weighted_hashes=0)]} sum_gather2 = {'superkingdom': [SumGathInf(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'),), + lineage=(LineagePair (rank='superkingdom', name='a'),), query_ani_at_rank=None, total_weighted_hashes=0)], 'phylum': [SumGathInf(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(lca_utils.LineagePair(rank='superkingdom', name='a'), - lca_utils.LineagePair(rank='phylum', name='c')), + lineage=(LineagePair (rank='superkingdom', name='a'), + LineagePair (rank='phylum', name='c')), query_ani_at_rank=None, total_weighted_hashes=0)]}