From 7cc6e3f5687b33662630ad173ee059f3170cd8d8 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 7 Feb 2023 09:29:26 -0800 Subject: [PATCH 01/10] fix LineagePair usage? --- src/sourmash/tax/__main__.py | 6 +++--- src/sourmash/tax/tax_utils.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8716a3843c..1d127cefb7 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -360,9 +360,9 @@ def grep(args): # determine if lineage matches. def find_pattern(lineage, select_rank): - for (rank, name) in lineage: - if select_rank is None or rank == select_rank: - if pattern.search(name): + for lp in lineage: + if select_rank is None or lp.rank == select_rank: + if pattern.search(lp.name): return True return False diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e010a3b4a5..3ef7a29173 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -778,7 +778,7 @@ def load(cls, filename, *, delimiter=',', force=False, # read row into a lineage pair for rank in lca_utils.taxlist(include_strain=include_strain): lin = row[rank] - lineage.append(lca_utils.LineagePair(rank, lin)) + lineage.append(LineagePair(rank, lin)) ident = row[identifier] # fold, spindle, and mutilate ident? @@ -787,8 +787,8 @@ def load(cls, filename, *, delimiter=',', force=False, keep_identifier_versions=keep_identifier_versions) # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ] + lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] # remove end nulls while lineage and lineage[-1].name == 'unassigned': @@ -942,7 +942,7 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] + tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] return tuple(tup) def __getitem__(self, ident): From 95bcf8e600cfb6eacc3aae8d9ec2a6a4dbe6ad73 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 11:38:21 -0800 Subject: [PATCH 02/10] read in taxids if avail and use for kreport --- src/sourmash/tax/tax_utils.py | 18 +++++++--- tests/test-data/tax/test.ncbi-taxonomy.csv | 7 ++++ tests/test_tax.py | 41 +++++++++++++++++++++- 3 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 tests/test-data/tax/test.ncbi-taxonomy.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 3ef7a29173..f108249cba 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -80,7 +80,7 @@ def __post_init__(self): self._init_empty() def __eq__(self, other): - if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False + if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False return False return all([self.ranks == other.ranks and self.lineage==other.lineage]) @@ -757,6 +757,9 @@ def load(cls, filename, *, delimiter=',', force=False, # is "strain" an available rank? if "strain" in header: include_strain=True + load_taxids=False + if 'taxpath' in header: + load_taxids=True # check that all ranks are in header ranks = list(lca_utils.taxlist(include_strain=include_strain)) @@ -775,10 +778,15 @@ def load(cls, filename, *, delimiter=',', force=False, for n, row in enumerate(r): num_rows += 1 lineage = [] + taxid=None # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): + if load_taxids: + taxpath = row['taxpath'].split('|') + for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)): lin = row[rank] - lineage.append(LineagePair(rank, lin)) + if load_taxids: + taxid = taxpath[n] + lineage.append(LineagePair(rank, name=lin, taxid=taxid)) ident = row[identifier] # fold, spindle, and mutilate ident? @@ -787,8 +795,8 @@ def load(cls, filename, *, delimiter=',', force=False, keep_identifier_versions=keep_identifier_versions) # clean lineage of null names, replace with 'unassigned' - lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] + lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ] + lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ] # remove end nulls while lineage and lineage[-1].name == 'unassigned': diff --git a/tests/test-data/tax/test.ncbi-taxonomy.csv b/tests/test-data/tax/test.ncbi-taxonomy.csv new file mode 100644 index 0000000000..ec3bfa530d --- /dev/null +++ b/tests/test-data/tax/test.ncbi-taxonomy.csv @@ -0,0 +1,7 @@ +ident,taxid,superkingdom,phylum,class,order,family,genus,species,strain,taxpath +GCF_001881345.1,562,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,,2|1224|1236|91347|543|561|562| +GCF_009494285.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179| +GCF_013368705.1,821,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Phocaeicola,Phocaeicola vulgatus,,2|976|200643|171549|815|909656|821| +GCF_003471795.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179| +GCF_000017325.1,402882,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS185,2|1224|1236|135622|267890|22|62322|402882 +GCF_000021665.1,407976,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS223,2|1224|1236|135622|267890|22|62322|407976 diff --git a/tests/test_tax.py b/tests/test_tax.py index 173a663e7b..c1831a7b21 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -49,7 +49,6 @@ def test_metagenome_stdout_0(runtmp): assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out @@ -205,6 +204,46 @@ def test_metagenome_kreport_out(runtmp): assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == kreport_results[15] +def test_metagenome_kreport_ncbi_taxid_out(runtmp): + # test 'kreport' kraken output format + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + csv_base = "out" + sum_csv = csv_base + ".kreport.txt" + csvout = runtmp.output(sum_csv) + outdir = os.path.dirname(csvout) + + runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + kreport_results = [x.rstrip().split('\t') for x in open(csvout)] + assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err + print(kreport_results) + assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0] + assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1] + assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2] + assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3] + assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4] + assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5] + assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6] + assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7] + assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8] + assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9] + assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10] + assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11] + assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12] + assert ['1.56', '192000', '0', 'G', '909656', 'Phocaeicola'] == kreport_results[13] + assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14] + assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15] + assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16] + + def test_metagenome_kreport_out_lemonade(runtmp): # test 'kreport' kraken output format against lemonade output g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv') From 34185946c6c63438bdfff774c676acd99ca5c61b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 12:21:46 -0800 Subject: [PATCH 03/10] fix comment --- tests/test_tax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index c1831a7b21..85d6970768 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -205,7 +205,7 @@ def test_metagenome_kreport_out(runtmp): def test_metagenome_kreport_ncbi_taxid_out(runtmp): - # test 'kreport' kraken output format + # test NCBI taxid output from kreport g_csv = utils.get_test_data('tax/test1.gather.v450.csv') tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') csv_base = "out" From 956c158e149886061674623e50c64dfe208491f2 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 13:24:46 -0800 Subject: [PATCH 04/10] mod lineage_dict init for taxpath --- src/sourmash/tax/tax_utils.py | 75 +++++++++++++++------------- tests/test_tax_utils.py | 93 ++++++++++++++++------------------- 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index f108249cba..0fe9895847 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -51,7 +51,6 @@ class BaseLineageInfo: optional: lineage: tuple or list of LineagePair lineage_str: `;`- or `,`-separated string of names - lineage_dict: dictionary of {rank: name} If no lineage information is provided, result will be a BaseLineageInfo with provided ranks and no lineage names. @@ -63,7 +62,6 @@ class BaseLineageInfo: ranks: tuple() # require ranks lineage: tuple = None # tuple of LineagePairs lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names - lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -74,8 +72,6 @@ def __post_init__(self): self._init_from_lineage_tuples() elif self.lineage_str is not None: self._init_from_lineage_str() - elif self.lineage_dict is not None: - self._init_from_lineage_dict() else: self._init_empty() @@ -170,37 +166,6 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", filled_ranks) - def _init_from_lineage_dict(self): - 'initialize from lineage dict, e.g. from gather csv, allowing empty ranks and reordering if necessary' - if not isinstance(self.lineage_dict, (dict)): - raise ValueError(f"{self.lineage_dict} is not dictionary") - # first, initialize_empty - new_lineage = [] - # build empty lineage - for rank in self.ranks: - new_lineage.append(LineagePair(rank=rank)) - # now add input information in correct spots. This corrects for order and allows empty values. - for rank, info in self.lineage_dict.items(): - try: - rank_idx = self.rank_index(rank) - except ValueError as e: - raise ValueError(f"Rank '{rank}' not present in {', '.join(self.ranks)}") from e - - name, taxid = None, None - if isinstance(info, dict): - if 'name' in info.keys(): - name = info['name'] - if 'taxid' in info.keys(): - taxid = info['taxid'] - elif isinstance(info, str): - name = info - new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) - # build list of filled ranks - filled_ranks = [a.rank for a in new_lineage if a.name] - # set lineage and filled_ranks - object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) - def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. @@ -329,6 +294,7 @@ class RankLineageInfo(BaseLineageInfo): and will not be used or compared in any other class methods. """ ranks: tuple = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -344,6 +310,45 @@ def __post_init__(self): elif self.ranks: self._init_empty() + def _init_from_lineage_dict(self): + 'initialize from lineage dict, e.g. from lineages csv, allowing empty ranks/extra columns and reordering if necessary' + if not isinstance(self.lineage_dict, (dict)): + raise ValueError(f"{self.lineage_dict} is not dictionary") + new_lineage = [] + taxpath=[] + # build empty lineage and taxpath + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + + # check for NCBI taxpath information + taxpath_str = self.lineage_dict.get('taxpath', []) + if taxpath_str: + taxpath = taxpath_str.split('|') + if len(taxpath) > len(self.ranks): + raise ValueError(f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})") + + # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys + for key, val in self.lineage_dict.items(): + name, taxid = None, None + try: + rank, name = key, val + rank_idx = self.rank_index(rank) + except ValueError: + continue # ignore dictionary entries (columns) that don't match a rank + + if taxpath: + try: + taxid = taxpath[rank_idx] + except IndexError: + taxid = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index fe0b00e1c3..05f23bec5a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1074,46 +1074,6 @@ def test_BaseLineageInfo_init_lca_lineage_tups(): assert taxinf.zip_lineage()== ['a', '', 'b'] -def test_BaseLineageInfo_init_lineage_dict_fail(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) - with pytest.raises(ValueError) as exc: - taxinf = BaseLineageInfo(ranks=ranks, lineage_dict=lin_tups) - print(str(exc)) - - assert "is not dictionary" in str(exc) - - -def test_BaseLineageInfo_init_lineage_dict (): - x = {'rank1': 'name1', 'rank2': 'name2'} - taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) - print("ranks: ", taxinf.ranks) - print("lineage: ", taxinf.lineage) - print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', 'name2'] - - -def test_BaseLineageInfo_init_lineage_dict_withtaxid(): - x = {'rank1': {'name': 'name1', 'taxid': 1}, 'rank2': {'name':'name2', 'taxid': 2}} - taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) - print("ranks: ", taxinf.ranks) - print("lineage: ", taxinf.lineage) - print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', 'name2'] - assert taxinf.zip_taxid()== ['1', '2'] - assert taxinf.lowest_lineage_taxid == 2 - assert taxinf.lowest_lineage_name == "name2" - - -def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq(): - x = "a;b;c" - ranks=["A", "B", "C"] - rankD = {"A": "a", "B": "b", "C": "c"} - lin1 = BaseLineageInfo(lineage_str=x, ranks=ranks) - lin2 = BaseLineageInfo(lineage_dict=rankD, ranks=ranks) - assert lin1 == lin2 - - def test_BaseLineageInfo_init_no_ranks(): x = "a;b;c" rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} @@ -1122,10 +1082,6 @@ def test_BaseLineageInfo_init_no_ranks(): BaseLineageInfo(lineage_str=x) print(exc) assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) - with pytest.raises(TypeError) as exc: - BaseLineageInfo(lineage_dict=rankD) - print(exc) - assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) with pytest.raises(TypeError) as exc: BaseLineageInfo(lineage=lin_tups) print(exc) @@ -1140,10 +1096,6 @@ def test_BaseLineageInfo_init_with_wrong_ranks(): BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) assert "Rank 'rank1' not present in A, B, C" in str(exc) - with pytest.raises(ValueError) as exc: - BaseLineageInfo(lineage_dict=linD, ranks=ranks) - print(str(exc)) - assert "Rank 'rank1' not present in A, B, C" in str(exc) def test_BaseLineageInfo_init_not_lineagepair(): @@ -1187,7 +1139,26 @@ def test_RankLineageInfo_init_lineage_tups(): assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_fail(): + ranks=["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + with pytest.raises(ValueError) as exc: + taxinf = RankLineageInfo(ranks=ranks, lineage_dict=lin_tups) + print(str(exc)) + + assert "is not dictionary" in str(exc) + + def test_RankLineageInfo_init_lineage_dict(): + x = {'rank1': 'name1', 'rank2': 'name2'} + taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + + +def test_RankLineageInfo_init_lineage_dict_default_ranks(): x = {"superkingdom":'a',"phylum":'b'} taxinf = RankLineageInfo(lineage_dict=x) print(taxinf.lineage) @@ -1195,6 +1166,28 @@ def test_RankLineageInfo_init_lineage_dict(): assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_withtaxpath(): + x = {'rank1': 'name1', 'rank2': 'name2', 'taxpath': "1|2"} + taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + print("zipped taxids: ", taxinf.zip_taxid()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + assert taxinf.zip_taxid()== ['1', '2'] + assert taxinf.lowest_lineage_taxid == "2" + assert taxinf.lowest_lineage_name == "name2" + + +def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): + x = "a;b;c" + ranks=["A", "B", "C"] + rankD = {"A": "a", "B": "b", "C": "c"} + lin1 = RankLineageInfo(lineage_str=x, ranks=ranks) + lin2 = RankLineageInfo(lineage_dict=rankD, ranks=ranks) + assert lin1 == lin2 + + def test_RankLineageInfo_init_lineage_dict_missing_rank(): x = {'superkingdom': 'name1', 'class': 'name2'} taxinf = RankLineageInfo(lineage_dict=x) @@ -1205,8 +1198,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank(): assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2'] -def test_RankLineageInfo_init_lineage_dict_missing_rank_withtaxid(): - x = {'superkingdom': {'name': 'name1', 'taxid': 1}, 'class': {'name':'name2', 'taxid': 2}} +def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): + x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) From 50619cd3f00061aae117800832da8ad53976e344 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 19:06:49 -0800 Subject: [PATCH 05/10] use RankLineageInfo to read and lineages csv --- src/sourmash/tax/tax_utils.py | 45 +++++++++++++++-------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0fe9895847..844fd1ceef 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -311,7 +311,12 @@ def __post_init__(self): self._init_empty() def _init_from_lineage_dict(self): - 'initialize from lineage dict, e.g. from lineages csv, allowing empty ranks/extra columns and reordering if necessary' + """ + Initialize from lineage dict, e.g. from lineages csv. + Use NCBI taxids if available as '|'-separated 'taxpath' column. + Allows empty ranks/extra columns and reordering if necessary + """ + null_names = set(['[Blank]', 'na', 'null', 'NA', '']) if not isinstance(self.lineage_dict, (dict)): raise ValueError(f"{self.lineage_dict} is not dictionary") new_lineage = [] @@ -341,6 +346,9 @@ def _init_from_lineage_dict(self): taxid = taxpath[rank_idx] except IndexError: taxid = None + # filter null + if name is not None and name.strip() in null_names: + name = None new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) # build list of filled ranks @@ -759,15 +767,14 @@ def load(cls, filename, *, delimiter=',', force=False, else: header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') + # is "strain" an available rank? if "strain" in header: include_strain=True - load_taxids=False - if 'taxpath' in header: - load_taxids=True - # check that all ranks are in header - ranks = list(lca_utils.taxlist(include_strain=include_strain)) + ranks = list(RankLineageInfo().taxlist) + if not include_strain: + ranks.remove('strain') if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. # in future, we can define `ranks` differently if desired @@ -782,16 +789,9 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): num_rows += 1 - lineage = [] - taxid=None - # read row into a lineage pair - if load_taxids: - taxpath = row['taxpath'].split('|') - for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)): - lin = row[rank] - if load_taxids: - taxid = taxpath[n] - lineage.append(LineagePair(rank, name=lin, taxid=taxid)) + # read lineage from row dictionary + lineageInfo = RankLineageInfo(lineage_dict=row) + # get identifier ident = row[identifier] # fold, spindle, and mutilate ident? @@ -799,23 +799,16 @@ def load(cls, filename, *, delimiter=',', force=False, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions) - # clean lineage of null names, replace with 'unassigned' - lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ] - lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - # store lineage tuple + lineage = lineageInfo.filled_lineage if lineage: # check duplicates if ident in assignments: - if assignments[ident] != tuple(lineage): + if assignments[ident] != lineage: if not force: raise ValueError(f"multiple lineages for identifier {ident}") else: - assignments[ident] = tuple(lineage) + assignments[ident] = lineage if lineage[-1].rank == 'species': n_species += 1 From 03cf9e378c9f73b3c242aaf8087f2e606fd99be9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 19:36:06 -0800 Subject: [PATCH 06/10] addl tests --- tests/test_tax_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 05f23bec5a..52e553b622 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1208,6 +1208,25 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): + # if there's no name, we don't store the taxpath. Is this desired behavior? + x = {'superkingdom': 'name1', 'taxpath': '1||2'} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', '', '', '', '', '', '', ''] + assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): + x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} + with pytest.raises(ValueError) as exc: + RankLineageInfo(lineage_dict=x) + print(str(exc)) + assert f"Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) + + def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): x = "a;b;c" rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} From 0f882d7eb18b2a8a83e61b44f10b0a3741bf901d Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 12:14:13 -0800 Subject: [PATCH 07/10] test for missing taxids; taxpath shorter than provided rank names --- tests/test_tax_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 52e553b622..a254818415 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1219,6 +1219,18 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): + # if there's no name, we don't store the taxpath. Is this desired behavior? + x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + print("zipped taxids: ", taxinf.zip_taxid()) + assert taxinf.zip_lineage()== ['name1', 'name2', 'name3', '', '', '', '', ''] + assert taxinf.zip_taxid()== ['', '2', '', '', '', '', '', ''] + + def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} with pytest.raises(ValueError) as exc: From 842ba397245ed9f6744ea2f08551d7f4383996ae Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 13:01:56 -0800 Subject: [PATCH 08/10] clarify comment --- tests/test_tax_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a254818415..11db0f4d5a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1209,7 +1209,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): - # if there's no name, we don't store the taxpath. Is this desired behavior? + # If there's no name, we don't report the taxpath, because lineage is not "filled". + # Is this desired behavior? x = {'superkingdom': 'name1', 'taxpath': '1||2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) From ce3c9919bc91d8b9a0fade2f05c3080b9d14cc08 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 13:15:01 -0800 Subject: [PATCH 09/10] clarify comment2 --- tests/test_tax_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 11db0f4d5a..db4a277363 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1221,7 +1221,8 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): - # if there's no name, we don't store the taxpath. Is this desired behavior? + # If there's no name, we don't report the taxpath, because lineage is not "filled". + # Is this desired behavior? x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) From 87f7e500a79c074e880451c8479e482fb506e0b7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 08:12:28 -0800 Subject: [PATCH 10/10] undelete line --- tests/test_tax.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index 85d6970768..f852825068 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -49,6 +49,7 @@ def test_metagenome_stdout_0(runtmp): assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out + assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out