From 7cc6e3f5687b33662630ad173ee059f3170cd8d8 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 7 Feb 2023 09:29:26 -0800
Subject: [PATCH 01/10] fix LineagePair usage?

---
 src/sourmash/tax/__main__.py  | 6 +++---
 src/sourmash/tax/tax_utils.py | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 8716a3843c..1d127cefb7 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -360,9 +360,9 @@ def grep(args):
 
     # determine if lineage matches.
     def find_pattern(lineage, select_rank):
-        for (rank, name) in lineage:
-            if select_rank is None or rank == select_rank:
-                if pattern.search(name):
+        for lp in lineage:
+            if select_rank is None or lp.rank == select_rank:
+                if pattern.search(lp.name):
                     return True
         return False
 
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index e010a3b4a5..3ef7a29173 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -778,7 +778,7 @@ def load(cls, filename, *, delimiter=',', force=False,
                 # read row into a lineage pair
                 for rank in lca_utils.taxlist(include_strain=include_strain):
                     lin = row[rank]
-                    lineage.append(lca_utils.LineagePair(rank, lin))
+                    lineage.append(LineagePair(rank, lin))
                 ident = row[identifier]
 
                 # fold, spindle, and mutilate ident?
@@ -787,8 +787,8 @@ def load(cls, filename, *, delimiter=',', force=False,
                                   keep_identifier_versions=keep_identifier_versions)
 
                 # clean lineage of null names, replace with 'unassigned'
-                lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ]
+                lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ]
+                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
 
                 # remove end nulls
                 while lineage and lineage[-1].name == 'unassigned':
@@ -942,7 +942,7 @@ def load(cls, location):
 
     def _make_tup(self, row):
         "build a tuple of LineagePairs for this sqlite row"
-        tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
+        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
         return tuple(tup)
 
     def __getitem__(self, ident):

From 95bcf8e600cfb6eacc3aae8d9ec2a6a4dbe6ad73 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 8 Feb 2023 11:38:21 -0800
Subject: [PATCH 02/10] read in taxids if avail and use for kreport

---
 src/sourmash/tax/tax_utils.py              | 18 +++++++---
 tests/test-data/tax/test.ncbi-taxonomy.csv |  7 ++++
 tests/test_tax.py                          | 41 +++++++++++++++++++++-
 3 files changed, 60 insertions(+), 6 deletions(-)
 create mode 100644 tests/test-data/tax/test.ncbi-taxonomy.csv

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 3ef7a29173..f108249cba 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -80,7 +80,7 @@ def __post_init__(self):
             self._init_empty()
 
     def __eq__(self, other):
-        if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False
+        if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False
             return False
         return all([self.ranks == other.ranks and self.lineage==other.lineage])
 
@@ -757,6 +757,9 @@ def load(cls, filename, *, delimiter=',', force=False,
             # is "strain" an available rank?
             if "strain" in header:
                 include_strain=True
+            load_taxids=False
+            if 'taxpath' in header:
+                load_taxids=True
 
             # check that all ranks are in header
             ranks = list(lca_utils.taxlist(include_strain=include_strain))
@@ -775,10 +778,15 @@ def load(cls, filename, *, delimiter=',', force=False,
             for n, row in enumerate(r):
                 num_rows += 1
                 lineage = []
+                taxid=None
                 # read row into a lineage pair
-                for rank in lca_utils.taxlist(include_strain=include_strain):
+                if load_taxids:
+                    taxpath = row['taxpath'].split('|')
+                for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)):
                     lin = row[rank]
-                    lineage.append(LineagePair(rank, lin))
+                    if load_taxids:
+                        taxid = taxpath[n]
+                    lineage.append(LineagePair(rank, name=lin, taxid=taxid))
                 ident = row[identifier]
 
                 # fold, spindle, and mutilate ident?
@@ -787,8 +795,8 @@ def load(cls, filename, *, delimiter=',', force=False,
                                   keep_identifier_versions=keep_identifier_versions)
 
                 # clean lineage of null names, replace with 'unassigned'
-                lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ]
-                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+                lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ]
+                lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ]
 
                 # remove end nulls
                 while lineage and lineage[-1].name == 'unassigned':
diff --git a/tests/test-data/tax/test.ncbi-taxonomy.csv b/tests/test-data/tax/test.ncbi-taxonomy.csv
new file mode 100644
index 0000000000..ec3bfa530d
--- /dev/null
+++ b/tests/test-data/tax/test.ncbi-taxonomy.csv
@@ -0,0 +1,7 @@
+ident,taxid,superkingdom,phylum,class,order,family,genus,species,strain,taxpath
+GCF_001881345.1,562,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,,2|1224|1236|91347|543|561|562|
+GCF_009494285.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179|
+GCF_013368705.1,821,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Phocaeicola,Phocaeicola vulgatus,,2|976|200643|171549|815|909656|821|
+GCF_003471795.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179|
+GCF_000017325.1,402882,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS185,2|1224|1236|135622|267890|22|62322|402882
+GCF_000021665.1,407976,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS223,2|1224|1236|135622|267890|22|62322|407976
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 173a663e7b..c1831a7b21 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -49,7 +49,6 @@ def test_metagenome_stdout_0(runtmp):
     assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out
     assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out
     assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
-    assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out
     assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out
     assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
     assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out
@@ -205,6 +204,46 @@ def test_metagenome_kreport_out(runtmp):
     assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15]
 
 
+def test_metagenome_kreport_ncbi_taxid_out(runtmp):
+    # test 'kreport' kraken output format
+    g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
+    tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
+    csv_base = "out"
+    sum_csv = csv_base + ".kreport.txt"
+    csvout = runtmp.output(sum_csv)
+    outdir = os.path.dirname(csvout)
+
+    runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport")
+
+    print(runtmp.last_result.status)
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert runtmp.last_result.status == 0
+    assert os.path.exists(csvout)
+
+    kreport_results = [x.rstrip().split('\t') for x in open(csvout)]
+    assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err
+    print(kreport_results)
+    assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0]
+    assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1]
+    assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2]
+    assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3]
+    assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4]
+    assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5]
+    assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6]
+    assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7]
+    assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8]
+    assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9]
+    assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10]
+    assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11]
+    assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12]
+    assert ['1.56', '192000', '0', 'G', '909656', 'Phocaeicola'] == kreport_results[13]
+    assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14]
+    assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15]
+    assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16]
+
+
 def test_metagenome_kreport_out_lemonade(runtmp):
     # test 'kreport' kraken output format against lemonade output
     g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv')

From 34185946c6c63438bdfff774c676acd99ca5c61b Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 8 Feb 2023 12:21:46 -0800
Subject: [PATCH 03/10] fix comment

---
 tests/test_tax.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index c1831a7b21..85d6970768 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -205,7 +205,7 @@ def test_metagenome_kreport_out(runtmp):
 
 
 def test_metagenome_kreport_ncbi_taxid_out(runtmp):
-    # test 'kreport' kraken output format
+    # test NCBI taxid output from kreport
     g_csv = utils.get_test_data('tax/test1.gather.v450.csv')
     tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv')
     csv_base = "out"

From 956c158e149886061674623e50c64dfe208491f2 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 8 Feb 2023 13:24:46 -0800
Subject: [PATCH 04/10] mod lineage_dict init for taxpath

---
 src/sourmash/tax/tax_utils.py | 75 +++++++++++++++-------------
 tests/test_tax_utils.py       | 93 ++++++++++++++++-------------------
 2 files changed, 83 insertions(+), 85 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index f108249cba..0fe9895847 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -51,7 +51,6 @@ class BaseLineageInfo:
         optional:
             lineage: tuple or list of LineagePair
             lineage_str: `;`- or `,`-separated string of names
-            lineage_dict: dictionary of {rank: name}
 
     If no lineage information is provided, result will be a BaseLineageInfo
     with provided ranks and no lineage names.
@@ -63,7 +62,6 @@ class BaseLineageInfo:
     ranks: tuple() # require ranks
     lineage: tuple = None # tuple of LineagePairs
     lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names
-    lineage_dict: dict = field(default=None, compare=False) # dict of rank: name
 
     def __post_init__(self):
         "Initialize according to passed values"
@@ -74,8 +72,6 @@ def __post_init__(self):
             self._init_from_lineage_tuples()
         elif self.lineage_str is not None:
             self._init_from_lineage_str()
-        elif self.lineage_dict is not None:
-            self._init_from_lineage_dict()
         else:
             self._init_empty()
 
@@ -170,37 +166,6 @@ def _init_from_lineage_tuples(self):
         object.__setattr__(self, "lineage", tuple(new_lineage))
         object.__setattr__(self, "filled_ranks", filled_ranks)
 
-    def _init_from_lineage_dict(self):
-        'initialize from lineage dict, e.g. from gather csv, allowing empty ranks and reordering if necessary'
-        if not isinstance(self.lineage_dict, (dict)):
-            raise ValueError(f"{self.lineage_dict} is not dictionary")
-        # first, initialize_empty
-        new_lineage = []
-        # build empty lineage
-        for rank in self.ranks:
-            new_lineage.append(LineagePair(rank=rank))
-        # now add input information in correct spots. This corrects for order and allows empty values.
-        for rank, info in self.lineage_dict.items():
-            try:
-                rank_idx = self.rank_index(rank)
-            except ValueError as e:
-                raise ValueError(f"Rank '{rank}' not present in {', '.join(self.ranks)}") from e
-
-            name, taxid = None, None
-            if isinstance(info, dict):
-                if 'name' in info.keys():
-                    name = info['name']
-                if 'taxid' in info.keys():
-                    taxid = info['taxid']
-            elif isinstance(info, str):
-                name = info
-            new_lineage[rank_idx] =  LineagePair(rank=rank, name=name, taxid=taxid)
-        # build list of filled ranks
-        filled_ranks = [a.rank for a in new_lineage if a.name]
-        # set lineage and filled_ranks
-        object.__setattr__(self, "lineage", tuple(new_lineage))
-        object.__setattr__(self, "filled_ranks", filled_ranks)
-
     def _init_from_lineage_str(self):
         """
         Turn a ; or ,-separated set of lineages into a list of LineagePair objs.
@@ -329,6 +294,7 @@ class RankLineageInfo(BaseLineageInfo):
     and will not be used or compared in any other class methods.
     """
     ranks: tuple = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain')
+    lineage_dict: dict = field(default=None, compare=False) # dict of rank: name
 
     def __post_init__(self):
         "Initialize according to passed values"
@@ -344,6 +310,45 @@ def __post_init__(self):
         elif self.ranks:
             self._init_empty()
 
+    def _init_from_lineage_dict(self):
+        'initialize from lineage dict, e.g. from lineages csv, allowing empty ranks/extra columns and reordering if necessary'
+        if not isinstance(self.lineage_dict, (dict)):
+            raise ValueError(f"{self.lineage_dict} is not dictionary")
+        new_lineage = []
+        taxpath=[]
+        # build empty lineage and taxpath
+        for rank in self.ranks:
+            new_lineage.append(LineagePair(rank=rank))
+
+        # check for NCBI taxpath information
+        taxpath_str = self.lineage_dict.get('taxpath', [])
+        if taxpath_str:
+            taxpath = taxpath_str.split('|')
+            if len(taxpath) > len(self.ranks):
+                raise ValueError(f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})")
+
+        # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys
+        for key, val in self.lineage_dict.items():
+            name, taxid = None, None
+            try:
+                rank, name = key, val
+                rank_idx = self.rank_index(rank)
+            except ValueError:
+                continue # ignore dictionary entries (columns) that don't match a rank
+
+            if taxpath:
+                try:
+                    taxid = taxpath[rank_idx]
+                except IndexError:
+                    taxid = None
+            new_lineage[rank_idx] =  LineagePair(rank=rank, name=name, taxid=taxid)
+
+        # build list of filled ranks
+        filled_ranks = [a.rank for a in new_lineage if a.name]
+        # set lineage and filled_ranks
+        object.__setattr__(self, "lineage", tuple(new_lineage))
+        object.__setattr__(self, "filled_ranks", filled_ranks)
+
 
 def get_ident(ident, *,
               keep_full_identifiers=False, keep_identifier_versions=False):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index fe0b00e1c3..05f23bec5a 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1074,46 +1074,6 @@ def test_BaseLineageInfo_init_lca_lineage_tups():
     assert taxinf.zip_lineage()== ['a', '', 'b']
 
 
-def test_BaseLineageInfo_init_lineage_dict_fail():
-    ranks=["A", "B", "C"]
-    lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b'))
-    with pytest.raises(ValueError) as exc:
-        taxinf = BaseLineageInfo(ranks=ranks, lineage_dict=lin_tups)
-    print(str(exc))
-
-    assert "is not dictionary" in str(exc)
-
-
-def test_BaseLineageInfo_init_lineage_dict  ():
-    x = {'rank1': 'name1', 'rank2': 'name2'}
-    taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
-    print("ranks: ", taxinf.ranks)
-    print("lineage: ", taxinf.lineage)
-    print("zipped lineage: ", taxinf.zip_lineage())
-    assert taxinf.zip_lineage()== ['name1', 'name2']
-
-
-def test_BaseLineageInfo_init_lineage_dict_withtaxid():
-    x = {'rank1': {'name': 'name1', 'taxid': 1}, 'rank2': {'name':'name2', 'taxid': 2}}
-    taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
-    print("ranks: ", taxinf.ranks)
-    print("lineage: ", taxinf.lineage)
-    print("zipped lineage: ", taxinf.zip_lineage())
-    assert taxinf.zip_lineage()== ['name1', 'name2']
-    assert taxinf.zip_taxid()== ['1', '2']
-    assert taxinf.lowest_lineage_taxid == 2
-    assert taxinf.lowest_lineage_name == "name2"
-
-
-def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq():
-    x = "a;b;c"
-    ranks=["A", "B", "C"]
-    rankD = {"A": "a", "B": "b", "C": "c"}
-    lin1 = BaseLineageInfo(lineage_str=x, ranks=ranks)
-    lin2 = BaseLineageInfo(lineage_dict=rankD, ranks=ranks)
-    assert lin1 == lin2
-
-
 def test_BaseLineageInfo_init_no_ranks():
     x = "a;b;c"
     rankD = {"superkingdom": "a", "phylum": "b", "class": "c"}
@@ -1122,10 +1082,6 @@ def test_BaseLineageInfo_init_no_ranks():
         BaseLineageInfo(lineage_str=x)
     print(exc)
     assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc)
-    with pytest.raises(TypeError) as exc:
-        BaseLineageInfo(lineage_dict=rankD)
-    print(exc)
-    assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc)
     with pytest.raises(TypeError) as exc:
         BaseLineageInfo(lineage=lin_tups)
     print(exc)
@@ -1140,10 +1096,6 @@ def test_BaseLineageInfo_init_with_wrong_ranks():
         BaseLineageInfo(lineage=lin_tups, ranks=ranks)
     print(str(exc))
     assert "Rank 'rank1' not present in A, B, C" in str(exc)
-    with pytest.raises(ValueError) as exc:
-        BaseLineageInfo(lineage_dict=linD, ranks=ranks)
-    print(str(exc))
-    assert "Rank 'rank1' not present in A, B, C" in str(exc)
 
 
 def test_BaseLineageInfo_init_not_lineagepair():
@@ -1187,7 +1139,26 @@ def test_RankLineageInfo_init_lineage_tups():
     assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
 
 
+def test_RankLineageInfo_init_lineage_dict_fail():
+    ranks=["A", "B", "C"]
+    lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b'))
+    with pytest.raises(ValueError) as exc:
+        taxinf = RankLineageInfo(ranks=ranks, lineage_dict=lin_tups)
+    print(str(exc))
+
+    assert "is not dictionary" in str(exc)
+
+
 def test_RankLineageInfo_init_lineage_dict():
+    x = {'rank1': 'name1', 'rank2': 'name2'}
+    taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', 'name2']
+
+
+def test_RankLineageInfo_init_lineage_dict_default_ranks():
     x = {"superkingdom":'a',"phylum":'b'}
     taxinf = RankLineageInfo(lineage_dict=x)
     print(taxinf.lineage)
@@ -1195,6 +1166,28 @@ def test_RankLineageInfo_init_lineage_dict():
     assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', '']
 
 
+def test_RankLineageInfo_init_lineage_dict_withtaxpath():
+    x = {'rank1': 'name1', 'rank2': 'name2', 'taxpath': "1|2"}
+    taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"])
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    print("zipped taxids: ", taxinf.zip_taxid())
+    assert taxinf.zip_lineage()== ['name1', 'name2']
+    assert taxinf.zip_taxid()== ['1', '2']
+    assert taxinf.lowest_lineage_taxid == "2"
+    assert taxinf.lowest_lineage_name == "name2"
+
+
+def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq():
+    x = "a;b;c"
+    ranks=["A", "B", "C"]
+    rankD = {"A": "a", "B": "b", "C": "c"}
+    lin1 = RankLineageInfo(lineage_str=x, ranks=ranks)
+    lin2 = RankLineageInfo(lineage_dict=rankD, ranks=ranks)
+    assert lin1 == lin2
+
+
 def test_RankLineageInfo_init_lineage_dict_missing_rank():
     x = {'superkingdom': 'name1', 'class': 'name2'}
     taxinf = RankLineageInfo(lineage_dict=x)
@@ -1205,8 +1198,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank():
     assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2']
 
 
-def test_RankLineageInfo_init_lineage_dict_missing_rank_withtaxid():
-    x = {'superkingdom': {'name': 'name1', 'taxid': 1}, 'class': {'name':'name2', 'taxid': 2}}
+def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath():
+    x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2'}
     taxinf = RankLineageInfo(lineage_dict=x)
     print("ranks: ", taxinf.ranks)
     print("lineage: ", taxinf.lineage)

From 50619cd3f00061aae117800832da8ad53976e344 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 8 Feb 2023 19:06:49 -0800
Subject: [PATCH 05/10] use RankLineageInfo to read and lineages csv

---
 src/sourmash/tax/tax_utils.py | 45 +++++++++++++++--------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 0fe9895847..844fd1ceef 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -311,7 +311,12 @@ def __post_init__(self):
             self._init_empty()
 
     def _init_from_lineage_dict(self):
-        'initialize from lineage dict, e.g. from lineages csv, allowing empty ranks/extra columns and reordering if necessary'
+        """
+        Initialize from lineage dict, e.g. from lineages csv.
+        Use NCBI taxids if available as '|'-separated 'taxpath' column.
+        Allows empty ranks/extra columns and reordering if necessary
+        """
+        null_names = set(['[Blank]', 'na', 'null', 'NA', ''])
         if not isinstance(self.lineage_dict, (dict)):
             raise ValueError(f"{self.lineage_dict} is not dictionary")
         new_lineage = []
@@ -341,6 +346,9 @@ def _init_from_lineage_dict(self):
                     taxid = taxpath[rank_idx]
                 except IndexError:
                     taxid = None
+            # filter null
+            if name is not None and name.strip() in null_names:
+                 name = None
             new_lineage[rank_idx] =  LineagePair(rank=rank, name=name, taxid=taxid)
 
         # build list of filled ranks
@@ -759,15 +767,14 @@ def load(cls, filename, *, delimiter=',', force=False,
                 else:
                     header_str = ",".join([repr(x) for x in header])
                     raise ValueError(f'No taxonomic identifiers found; headers are {header_str}')
+
             # is "strain" an available rank?
             if "strain" in header:
                 include_strain=True
-            load_taxids=False
-            if 'taxpath' in header:
-                load_taxids=True
-
             # check that all ranks are in header
-            ranks = list(lca_utils.taxlist(include_strain=include_strain))
+            ranks = list(RankLineageInfo().taxlist)
+            if not include_strain:
+                ranks.remove('strain')
             if not set(ranks).issubset(header):
                 # for now, just raise err if not all ranks are present.
                 # in future, we can define `ranks` differently if desired
@@ -782,16 +789,9 @@ def load(cls, filename, *, delimiter=',', force=False,
             # now parse and load lineages
             for n, row in enumerate(r):
                 num_rows += 1
-                lineage = []
-                taxid=None
-                # read row into a lineage pair
-                if load_taxids:
-                    taxpath = row['taxpath'].split('|')
-                for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)):
-                    lin = row[rank]
-                    if load_taxids:
-                        taxid = taxpath[n]
-                    lineage.append(LineagePair(rank, name=lin, taxid=taxid))
+                # read lineage from row dictionary
+                lineageInfo = RankLineageInfo(lineage_dict=row)
+                # get identifier
                 ident = row[identifier]
 
                 # fold, spindle, and mutilate ident?
@@ -799,23 +799,16 @@ def load(cls, filename, *, delimiter=',', force=False,
                                   keep_full_identifiers=keep_full_identifiers,
                                   keep_identifier_versions=keep_identifier_versions)
 
-                # clean lineage of null names, replace with 'unassigned'
-                lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ]
-                lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ]
-
-                # remove end nulls
-                while lineage and lineage[-1].name == 'unassigned':
-                    lineage = lineage[:-1]
-
                 # store lineage tuple
+                lineage = lineageInfo.filled_lineage
                 if lineage:
                     # check duplicates
                     if ident in assignments:
-                        if assignments[ident] != tuple(lineage):
+                        if assignments[ident] != lineage:
                             if not force:
                                 raise ValueError(f"multiple lineages for identifier {ident}")
                     else:
-                        assignments[ident] = tuple(lineage)
+                        assignments[ident] = lineage
 
                         if lineage[-1].rank == 'species':
                             n_species += 1

From 03cf9e378c9f73b3c242aaf8087f2e606fd99be9 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Wed, 8 Feb 2023 19:36:06 -0800
Subject: [PATCH 06/10] addl tests

---
 tests/test_tax_utils.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 05f23bec5a..52e553b622 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1208,6 +1208,25 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath():
     assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', '']
 
 
+def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch():
+    # if there's no name, we don't store the taxpath. Is this desired behavior?
+    x = {'superkingdom': 'name1', 'taxpath': '1||2'}
+    taxinf = RankLineageInfo(lineage_dict=x)
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    assert taxinf.zip_lineage()== ['name1', '', '', '', '', '', '', '']
+    assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', '']
+
+
+def test_RankLineageInfo_init_lineage_dict_taxpath_too_long():
+    x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'}
+    with pytest.raises(ValueError) as exc:
+        RankLineageInfo(lineage_dict=x)
+    print(str(exc))
+    assert f"Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc)
+
+
 def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq():
     x = "a;b;c"
     rankD = {"superkingdom": "a", "phylum": "b", "class": "c"}

From 0f882d7eb18b2a8a83e61b44f10b0a3741bf901d Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 9 Feb 2023 12:14:13 -0800
Subject: [PATCH 07/10] test for missing taxids; taxpath shorter than provided
 rank names

---
 tests/test_tax_utils.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 52e553b622..a254818415 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1219,6 +1219,18 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch():
     assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', '']
 
 
+def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids():
+    # if there's no name, we don't store the taxpath. Is this desired behavior?
+    x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'}
+    taxinf = RankLineageInfo(lineage_dict=x)
+    print("ranks: ", taxinf.ranks)
+    print("lineage: ", taxinf.lineage)
+    print("zipped lineage: ", taxinf.zip_lineage())
+    print("zipped taxids: ", taxinf.zip_taxid())
+    assert taxinf.zip_lineage()== ['name1', 'name2', 'name3', '', '', '', '', '']
+    assert taxinf.zip_taxid()== ['', '2', '', '', '', '', '', '']
+
+
 def test_RankLineageInfo_init_lineage_dict_taxpath_too_long():
     x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'}
     with pytest.raises(ValueError) as exc:

From 842ba397245ed9f6744ea2f08551d7f4383996ae Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 9 Feb 2023 13:01:56 -0800
Subject: [PATCH 08/10] clarify comment

---
 tests/test_tax_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index a254818415..11db0f4d5a 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1209,7 +1209,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath():
 
 
 def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch():
-    # if there's no name, we don't store the taxpath. Is this desired behavior?
+    # If there's no name, we don't report the taxpath, because lineage is not "filled".
+    # Is this desired behavior?
     x = {'superkingdom': 'name1', 'taxpath': '1||2'}
     taxinf = RankLineageInfo(lineage_dict=x)
     print("ranks: ", taxinf.ranks)

From ce3c9919bc91d8b9a0fade2f05c3080b9d14cc08 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Thu, 9 Feb 2023 13:15:01 -0800
Subject: [PATCH 09/10] clarify comment2

---
 tests/test_tax_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 11db0f4d5a..db4a277363 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1221,7 +1221,8 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch():
 
 
 def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids():
-    # if there's no name, we don't store the taxpath. Is this desired behavior?
+    # If there's no name, we don't report the taxpath, because lineage is not "filled".
+    # Is this desired behavior?
     x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'}
     taxinf = RankLineageInfo(lineage_dict=x)
     print("ranks: ", taxinf.ranks)

From 87f7e500a79c074e880451c8479e482fb506e0b7 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce-Ward" <ntpierce@gmail.com>
Date: Fri, 10 Feb 2023 08:12:28 -0800
Subject: [PATCH 10/10] undelete line

---
 tests/test_tax.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 85d6970768..f852825068 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -49,6 +49,7 @@ def test_metagenome_stdout_0(runtmp):
     assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out
     assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out
     assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
+    assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out
     assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out
     assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out
     assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out