From 7cc6e3f5687b33662630ad173ee059f3170cd8d8 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 7 Feb 2023 09:29:26 -0800 Subject: [PATCH 01/78] fix LineagePair usage? --- src/sourmash/tax/__main__.py | 6 +++--- src/sourmash/tax/tax_utils.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8716a3843c..1d127cefb7 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -360,9 +360,9 @@ def grep(args): # determine if lineage matches. def find_pattern(lineage, select_rank): - for (rank, name) in lineage: - if select_rank is None or rank == select_rank: - if pattern.search(name): + for lp in lineage: + if select_rank is None or lp.rank == select_rank: + if pattern.search(lp.name): return True return False diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e010a3b4a5..3ef7a29173 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -778,7 +778,7 @@ def load(cls, filename, *, delimiter=',', force=False, # read row into a lineage pair for rank in lca_utils.taxlist(include_strain=include_strain): lin = row[rank] - lineage.append(lca_utils.LineagePair(rank, lin)) + lineage.append(LineagePair(rank, lin)) ident = row[identifier] # fold, spindle, and mutilate ident? 
@@ -787,8 +787,8 @@ def load(cls, filename, *, delimiter=',', force=False, keep_identifier_versions=keep_identifier_versions) # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ lca_utils.LineagePair(a, b) for (a, b) in lineage ] + lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] # remove end nulls while lineage and lineage[-1].name == 'unassigned': @@ -942,7 +942,7 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ lca_utils.LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] + tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] return tuple(tup) def __getitem__(self, ident): From 95bcf8e600cfb6eacc3aae8d9ec2a6a4dbe6ad73 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 11:38:21 -0800 Subject: [PATCH 02/78] read in taxids if avail and use for kreport --- src/sourmash/tax/tax_utils.py | 18 +++++++--- tests/test-data/tax/test.ncbi-taxonomy.csv | 7 ++++ tests/test_tax.py | 41 +++++++++++++++++++++- 3 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 tests/test-data/tax/test.ncbi-taxonomy.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 3ef7a29173..f108249cba 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -80,7 +80,7 @@ def __post_init__(self): self._init_empty() def __eq__(self, other): - if other == (): # just handy: if comparing to a null tuple, don't try to find it's lineage before returning False + if other == (): # just handy: if comparing to a null tuple, don't try to find its lineage before returning False return False return all([self.ranks == other.ranks and self.lineage==other.lineage]) @@ -757,6 +757,9 @@ def load(cls, filename, *, delimiter=',', force=False, # is "strain" an available rank? 
if "strain" in header: include_strain=True + load_taxids=False + if 'taxpath' in header: + load_taxids=True # check that all ranks are in header ranks = list(lca_utils.taxlist(include_strain=include_strain)) @@ -775,10 +778,15 @@ def load(cls, filename, *, delimiter=',', force=False, for n, row in enumerate(r): num_rows += 1 lineage = [] + taxid=None # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): + if load_taxids: + taxpath = row['taxpath'].split('|') + for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)): lin = row[rank] - lineage.append(LineagePair(rank, lin)) + if load_taxids: + taxid = taxpath[n] + lineage.append(LineagePair(rank, name=lin, taxid=taxid)) ident = row[identifier] # fold, spindle, and mutilate ident? @@ -787,8 +795,8 @@ def load(cls, filename, *, delimiter=',', force=False, keep_identifier_versions=keep_identifier_versions) # clean lineage of null names, replace with 'unassigned' - lineage = [ (lin.rank, lca_utils.filter_null(lin.name)) for lin in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] + lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ] + lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ] # remove end nulls while lineage and lineage[-1].name == 'unassigned': diff --git a/tests/test-data/tax/test.ncbi-taxonomy.csv b/tests/test-data/tax/test.ncbi-taxonomy.csv new file mode 100644 index 0000000000..ec3bfa530d --- /dev/null +++ b/tests/test-data/tax/test.ncbi-taxonomy.csv @@ -0,0 +1,7 @@ +ident,taxid,superkingdom,phylum,class,order,family,genus,species,strain,taxpath +GCF_001881345.1,562,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli,,2|1224|1236|91347|543|561|562| +GCF_009494285.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179| 
+GCF_013368705.1,821,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Bacteroidaceae,Phocaeicola,Phocaeicola vulgatus,,2|976|200643|171549|815|909656|821| +GCF_003471795.1,165179,Bacteria,Bacteroidota,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri,,2|976|200643|171549|171552|838|165179| +GCF_000017325.1,402882,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS185,2|1224|1236|135622|267890|22|62322|402882 +GCF_000021665.1,407976,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella baltica,Shewanella baltica OS223,2|1224|1236|135622|267890|22|62322|407976 diff --git a/tests/test_tax.py b/tests/test_tax.py index 173a663e7b..c1831a7b21 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -49,7 +49,6 @@ def test_metagenome_stdout_0(runtmp): assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out - assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out @@ -205,6 +204,46 @@ def test_metagenome_kreport_out(runtmp): assert ['1.56', '192000', '192000', 'S', '', 's__Phocaeicola vulgatus'] == 
kreport_results[15] +def test_metagenome_kreport_ncbi_taxid_out(runtmp): + # test 'kreport' kraken output format + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + csv_base = "out" + sum_csv = csv_base + ".kreport.txt" + csvout = runtmp.output(sum_csv) + outdir = os.path.dirname(csvout) + + runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-dir', outdir, '-F', "kreport") + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + kreport_results = [x.rstrip().split('\t') for x in open(csvout)] + assert f"saving 'kreport' output to '{csvout}'" in runtmp.last_result.err + print(kreport_results) + assert ['13.08', '1605999', '0', 'D', '2', 'Bacteria'] == kreport_results[0] + assert ['86.92', '10672000', '10672000', 'U', '', 'unclassified'] == kreport_results[1] + assert ['7.27', '892000', '0', 'P', '976', 'Bacteroidota'] == kreport_results[2] + assert ['5.82', '714000', '0', 'P', '1224', 'Pseudomonadota'] == kreport_results[3] + assert ['7.27', '892000', '0', 'C', '200643', 'Bacteroidia'] == kreport_results[4] + assert ['5.82', '714000', '0', 'C', '1236', 'Gammaproteobacteria'] == kreport_results[5] + assert ['7.27', '892000', '0', 'O', '171549', 'Bacteroidales'] == kreport_results[6] + assert ['5.82', '714000', '0', 'O', '91347', 'Enterobacterales'] == kreport_results[7] + assert ['5.70', '700000', '0', 'F', '171552', 'Prevotellaceae'] == kreport_results[8] + assert ['5.82', '714000', '0', 'F', '543', 'Enterobacteriaceae'] == kreport_results[9] + assert ['1.56', '192000', '0', 'F', '815', 'Bacteroidaceae'] == kreport_results[10] + assert ['5.70', '700000', '0', 'G', '838', 'Prevotella'] == kreport_results[11] + assert ['5.82', '714000', '0', 'G', '561', 'Escherichia'] == kreport_results[12] + assert ['1.56', '192000', '0', 
'G', '909656', 'Phocaeicola'] == kreport_results[13] + assert ['5.70', '700000', '700000', 'S', '165179', 'Prevotella copri'] == kreport_results[14] + assert ['5.82', '714000', '714000', 'S', '562', 'Escherichia coli'] == kreport_results[15] + assert ['1.56', '192000', '192000', 'S', '821', 'Phocaeicola vulgatus'] == kreport_results[16] + + def test_metagenome_kreport_out_lemonade(runtmp): # test 'kreport' kraken output format against lemonade output g_csv = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.csv') From 34185946c6c63438bdfff774c676acd99ca5c61b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 12:21:46 -0800 Subject: [PATCH 03/78] fix comment --- tests/test_tax.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index c1831a7b21..85d6970768 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -205,7 +205,7 @@ def test_metagenome_kreport_out(runtmp): def test_metagenome_kreport_ncbi_taxid_out(runtmp): - # test 'kreport' kraken output format + # test NCBI taxid output from kreport g_csv = utils.get_test_data('tax/test1.gather.v450.csv') tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') csv_base = "out" From 956c158e149886061674623e50c64dfe208491f2 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 13:24:46 -0800 Subject: [PATCH 04/78] mod lineage_dict init for taxpath --- src/sourmash/tax/tax_utils.py | 75 +++++++++++++++------------- tests/test_tax_utils.py | 93 ++++++++++++++++------------------- 2 files changed, 83 insertions(+), 85 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index f108249cba..0fe9895847 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -51,7 +51,6 @@ class BaseLineageInfo: optional: lineage: tuple or list of LineagePair lineage_str: `;`- or `,`-separated string of names - lineage_dict: dictionary of {rank: name} If no lineage information is provided, result will be a BaseLineageInfo with provided ranks and no lineage names. @@ -63,7 +62,6 @@ class BaseLineageInfo: ranks: tuple() # require ranks lineage: tuple = None # tuple of LineagePairs lineage_str: str = field(default=None, compare=False) # ';'- or ','-separated str of lineage names - lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -74,8 +72,6 @@ def __post_init__(self): self._init_from_lineage_tuples() elif self.lineage_str is not None: self._init_from_lineage_str() - elif self.lineage_dict is not None: - self._init_from_lineage_dict() else: self._init_empty() @@ -170,37 +166,6 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", filled_ranks) - def _init_from_lineage_dict(self): - 'initialize from lineage dict, e.g. from gather csv, allowing empty ranks and reordering if necessary' - if not isinstance(self.lineage_dict, (dict)): - raise ValueError(f"{self.lineage_dict} is not dictionary") - # first, initialize_empty - new_lineage = [] - # build empty lineage - for rank in self.ranks: - new_lineage.append(LineagePair(rank=rank)) - # now add input information in correct spots. 
This corrects for order and allows empty values. - for rank, info in self.lineage_dict.items(): - try: - rank_idx = self.rank_index(rank) - except ValueError as e: - raise ValueError(f"Rank '{rank}' not present in {', '.join(self.ranks)}") from e - - name, taxid = None, None - if isinstance(info, dict): - if 'name' in info.keys(): - name = info['name'] - if 'taxid' in info.keys(): - taxid = info['taxid'] - elif isinstance(info, str): - name = info - new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) - # build list of filled ranks - filled_ranks = [a.rank for a in new_lineage if a.name] - # set lineage and filled_ranks - object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) - def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. @@ -329,6 +294,7 @@ class RankLineageInfo(BaseLineageInfo): and will not be used or compared in any other class methods. """ ranks: tuple = ('superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'strain') + lineage_dict: dict = field(default=None, compare=False) # dict of rank: name def __post_init__(self): "Initialize according to passed values" @@ -344,6 +310,45 @@ def __post_init__(self): elif self.ranks: self._init_empty() + def _init_from_lineage_dict(self): + 'initialize from lineage dict, e.g. 
from lineages csv, allowing empty ranks/extra columns and reordering if necessary' + if not isinstance(self.lineage_dict, (dict)): + raise ValueError(f"{self.lineage_dict} is not dictionary") + new_lineage = [] + taxpath=[] + # build empty lineage and taxpath + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + + # check for NCBI taxpath information + taxpath_str = self.lineage_dict.get('taxpath', []) + if taxpath_str: + taxpath = taxpath_str.split('|') + if len(taxpath) > len(self.ranks): + raise ValueError(f"Number of NCBI taxids ({len(taxpath)}) exceeds number of ranks ({len(self.ranks)})") + + # now add rank information in correct spots. This corrects for order and allows empty ranks and extra dict keys + for key, val in self.lineage_dict.items(): + name, taxid = None, None + try: + rank, name = key, val + rank_idx = self.rank_index(rank) + except ValueError: + continue # ignore dictionary entries (columns) that don't match a rank + + if taxpath: + try: + taxid = taxpath[rank_idx] + except IndexError: + taxid = None + new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index fe0b00e1c3..05f23bec5a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1074,46 +1074,6 @@ def test_BaseLineageInfo_init_lca_lineage_tups(): assert taxinf.zip_lineage()== ['a', '', 'b'] -def test_BaseLineageInfo_init_lineage_dict_fail(): - ranks=["A", "B", "C"] - lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) - with pytest.raises(ValueError) as exc: - taxinf = BaseLineageInfo(ranks=ranks, lineage_dict=lin_tups) - 
print(str(exc)) - - assert "is not dictionary" in str(exc) - - -def test_BaseLineageInfo_init_lineage_dict (): - x = {'rank1': 'name1', 'rank2': 'name2'} - taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) - print("ranks: ", taxinf.ranks) - print("lineage: ", taxinf.lineage) - print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', 'name2'] - - -def test_BaseLineageInfo_init_lineage_dict_withtaxid(): - x = {'rank1': {'name': 'name1', 'taxid': 1}, 'rank2': {'name':'name2', 'taxid': 2}} - taxinf = BaseLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) - print("ranks: ", taxinf.ranks) - print("lineage: ", taxinf.lineage) - print("zipped lineage: ", taxinf.zip_lineage()) - assert taxinf.zip_lineage()== ['name1', 'name2'] - assert taxinf.zip_taxid()== ['1', '2'] - assert taxinf.lowest_lineage_taxid == 2 - assert taxinf.lowest_lineage_name == "name2" - - -def test_BaseLineageInfo_init_lineage_str_lineage_dict_test_eq(): - x = "a;b;c" - ranks=["A", "B", "C"] - rankD = {"A": "a", "B": "b", "C": "c"} - lin1 = BaseLineageInfo(lineage_str=x, ranks=ranks) - lin2 = BaseLineageInfo(lineage_dict=rankD, ranks=ranks) - assert lin1 == lin2 - - def test_BaseLineageInfo_init_no_ranks(): x = "a;b;c" rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} @@ -1122,10 +1082,6 @@ def test_BaseLineageInfo_init_no_ranks(): BaseLineageInfo(lineage_str=x) print(exc) assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) - with pytest.raises(TypeError) as exc: - BaseLineageInfo(lineage_dict=rankD) - print(exc) - assert "__init__() missing 1 required positional argument: 'ranks'" in str(exc) with pytest.raises(TypeError) as exc: BaseLineageInfo(lineage=lin_tups) print(exc) @@ -1140,10 +1096,6 @@ def test_BaseLineageInfo_init_with_wrong_ranks(): BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) assert "Rank 'rank1' not present in A, B, C" in str(exc) - with pytest.raises(ValueError) as exc: - 
BaseLineageInfo(lineage_dict=linD, ranks=ranks) - print(str(exc)) - assert "Rank 'rank1' not present in A, B, C" in str(exc) def test_BaseLineageInfo_init_not_lineagepair(): @@ -1187,7 +1139,26 @@ def test_RankLineageInfo_init_lineage_tups(): assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_fail(): + ranks=["A", "B", "C"] + lin_tups = (LineagePair(rank="A", name='a'), LineagePair(rank="C", name='b')) + with pytest.raises(ValueError) as exc: + taxinf = RankLineageInfo(ranks=ranks, lineage_dict=lin_tups) + print(str(exc)) + + assert "is not dictionary" in str(exc) + + def test_RankLineageInfo_init_lineage_dict(): + x = {'rank1': 'name1', 'rank2': 'name2'} + taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + + +def test_RankLineageInfo_init_lineage_dict_default_ranks(): x = {"superkingdom":'a',"phylum":'b'} taxinf = RankLineageInfo(lineage_dict=x) print(taxinf.lineage) @@ -1195,6 +1166,28 @@ def test_RankLineageInfo_init_lineage_dict(): assert taxinf.zip_lineage()== ['a', 'b', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_withtaxpath(): + x = {'rank1': 'name1', 'rank2': 'name2', 'taxpath': "1|2"} + taxinf = RankLineageInfo(lineage_dict=x, ranks=["rank1", "rank2"]) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + print("zipped taxids: ", taxinf.zip_taxid()) + assert taxinf.zip_lineage()== ['name1', 'name2'] + assert taxinf.zip_taxid()== ['1', '2'] + assert taxinf.lowest_lineage_taxid == "2" + assert taxinf.lowest_lineage_name == "name2" + + +def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): + x = "a;b;c" + ranks=["A", "B", "C"] + rankD = {"A": "a", "B": "b", "C": "c"} + lin1 = RankLineageInfo(lineage_str=x, 
ranks=ranks) + lin2 = RankLineageInfo(lineage_dict=rankD, ranks=ranks) + assert lin1 == lin2 + + def test_RankLineageInfo_init_lineage_dict_missing_rank(): x = {'superkingdom': 'name1', 'class': 'name2'} taxinf = RankLineageInfo(lineage_dict=x) @@ -1205,8 +1198,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank(): assert taxinf.zip_lineage(truncate_empty=True)== ['name1', '', 'name2'] -def test_RankLineageInfo_init_lineage_dict_missing_rank_withtaxid(): - x = {'superkingdom': {'name': 'name1', 'taxid': 1}, 'class': {'name':'name2', 'taxid': 2}} +def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): + x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) print("lineage: ", taxinf.lineage) From 50619cd3f00061aae117800832da8ad53976e344 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 19:06:49 -0800 Subject: [PATCH 05/78] use RankLineageInfo to read and lineages csv --- src/sourmash/tax/tax_utils.py | 45 +++++++++++++++-------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0fe9895847..844fd1ceef 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -311,7 +311,12 @@ def __post_init__(self): self._init_empty() def _init_from_lineage_dict(self): - 'initialize from lineage dict, e.g. from lineages csv, allowing empty ranks/extra columns and reordering if necessary' + """ + Initialize from lineage dict, e.g. from lineages csv. + Use NCBI taxids if available as '|'-separated 'taxpath' column. 
+ Allows empty ranks/extra columns and reordering if necessary + """ + null_names = set(['[Blank]', 'na', 'null', 'NA', '']) if not isinstance(self.lineage_dict, (dict)): raise ValueError(f"{self.lineage_dict} is not dictionary") new_lineage = [] @@ -341,6 +346,9 @@ def _init_from_lineage_dict(self): taxid = taxpath[rank_idx] except IndexError: taxid = None + # filter null + if name is not None and name.strip() in null_names: + name = None new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) # build list of filled ranks @@ -759,15 +767,14 @@ def load(cls, filename, *, delimiter=',', force=False, else: header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') + # is "strain" an available rank? if "strain" in header: include_strain=True - load_taxids=False - if 'taxpath' in header: - load_taxids=True - # check that all ranks are in header - ranks = list(lca_utils.taxlist(include_strain=include_strain)) + ranks = list(RankLineageInfo().taxlist) + if not include_strain: + ranks.remove('strain') if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. # in future, we can define `ranks` differently if desired @@ -782,16 +789,9 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): num_rows += 1 - lineage = [] - taxid=None - # read row into a lineage pair - if load_taxids: - taxpath = row['taxpath'].split('|') - for n, rank in enumerate(lca_utils.taxlist(include_strain=include_strain)): - lin = row[rank] - if load_taxids: - taxid = taxpath[n] - lineage.append(LineagePair(rank, name=lin, taxid=taxid)) + # read lineage from row dictionary + lineageInfo = RankLineageInfo(lineage_dict=row) + # get identifier ident = row[identifier] # fold, spindle, and mutilate ident? 
@@ -799,23 +799,16 @@ def load(cls, filename, *, delimiter=',', force=False, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions) - # clean lineage of null names, replace with 'unassigned' - lineage = [ (lin.rank, lca_utils.filter_null(lin.name), lin.taxid) for lin in lineage ] - lineage = [ LineagePair(a, b, c) for (a, b, c) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - # store lineage tuple + lineage = lineageInfo.filled_lineage if lineage: # check duplicates if ident in assignments: - if assignments[ident] != tuple(lineage): + if assignments[ident] != lineage: if not force: raise ValueError(f"multiple lineages for identifier {ident}") else: - assignments[ident] = tuple(lineage) + assignments[ident] = lineage if lineage[-1].rank == 'species': n_species += 1 From 03cf9e378c9f73b3c242aaf8087f2e606fd99be9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Feb 2023 19:36:06 -0800 Subject: [PATCH 06/78] addl tests --- tests/test_tax_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 05f23bec5a..52e553b622 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1208,6 +1208,25 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): assert taxinf.zip_taxid()== ['1', '', '2', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): + # if there's no name, we don't store the taxpath. Is this desired behavior? 
+ x = {'superkingdom': 'name1', 'taxpath': '1||2'} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + assert taxinf.zip_lineage()== ['name1', '', '', '', '', '', '', ''] + assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] + + +def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): + x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} + with pytest.raises(ValueError) as exc: + RankLineageInfo(lineage_dict=x) + print(str(exc)) + assert f"Number of NCBI taxids (13) exceeds number of ranks (8)" in str(exc) + + def test_RankLineageInfo_init_lineage_str_lineage_dict_test_eq(): x = "a;b;c" rankD = {"superkingdom": "a", "phylum": "b", "class": "c"} From 63d1d9dc4472a00ebeb11f2d4a53da5e64a8dbdf Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 17 Jan 2023 12:03:38 -0800 Subject: [PATCH 07/78] init cami output --- src/sourmash/tax/tax_utils.py | 84 +++++++++++++++++++++++++++++++++-- 1 file changed, 80 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 844fd1ceef..f911de4172 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -206,17 +206,17 @@ def zip_taxid(self, truncate_empty=False): return zipped - def display_lineage(self, truncate_empty=True, null_as_unclassified=False): + def display_lineage(self, truncate_empty=True, null_as_unclassified=False, sep = ';'): "Return lineage names as ';'-separated list" - lin = ";".join(self.zip_lineage(truncate_empty=truncate_empty)) + lin = sep.join(self.zip_lineage(truncate_empty=truncate_empty)) if null_as_unclassified and lin == "" or lin is None: return "unclassified" else: return lin - def display_taxid(self, truncate_empty=True): + def display_taxid(self, truncate_empty=True, sep = ";"): "Return lineage taxids as ';'-separated list" - return 
";".join(self.zip_taxid(truncate_empty=truncate_empty)) + return sep.join(self.zip_taxid(truncate_empty=truncate_empty)) def check_rank_availability(self, rank): if rank in self.ranks: # rank is available @@ -1461,6 +1461,22 @@ def as_kreport_dict(self, query_info): sD['rank_code'] = RANKCODE['unclassified'] sD["num_bp_assigned"] = sD["num_bp_contained"] return sD + + def as_cami_bioboxes(self): + ''' + Format taxonomy-summarized gather results + as CAMI profiling Bioboxes format. + + Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE + + ''' + # if this is filled (should always be true here, right? So don't actually need to check this?) + taxid = self.lineage.lowest_lineage_taxid + taxpath = self.lineage.display_taxid(sep="|") + taxpathsn = self.lineage.display_lineage(sep="|") + percentage = f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + return [taxid, self.rank, taxpath, taxpathsn, percentage] + @dataclass class ClassificationResult(SummarizedGatherResult): @@ -1840,3 +1856,63 @@ def make_kreport_results(self): unclassified_recorded = True kreport_results.append(kresD) return header, kreport_results + + def make_cami_bioboxes(self): + """ + info: https://github.com/CAMI-challenge/contest_information/blob/master/file_formats/CAMI_TP_specification.mkd + + columns: + TAXID - specifies a unique alphanumeric ID for a node in a reference tree such as the NCBI taxonomy + RANK - superkingdom --> strain + TAXPATH - the path from the root of the reference taxonomy to the respective taxon + TAXPATHSN - scientific names of taxpath + PERCENTAGE (0-100) - field specifies what percentage of the sample was assigned to the respective TAXID + + example: + + #CAMI Submission for Taxonomic Profiling + @Version:0.9.1 + @SampleID:SAMPLEID + @Ranks:superkingdom|phylum|class|order|family|genus|species|strain + + @@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE + 2 superkingdom 2 Bacteria 98.81211 + 2157 superkingdom 2157 Archaea 1.18789 + 1239 phylum 2|1239 
Bacteria|Firmicutes 59.75801 + 1224 phylum 2|1224 Bacteria|Proteobacteria 18.94674 + 28890 phylum 2157|28890 Archaea|Euryarchaeotes 1.18789 + 91061 class 2|1239|91061 Bacteria|Firmicutes|Bacilli 59.75801 + 28211 class 2|1224|28211 Bacteria|Proteobacteria|Alphaproteobacteria 18.94674 + 183925 class 2157|28890|183925 Archaea|Euryarchaeotes|Methanobacteria 1.18789 + 1385 order 2|1239|91061|1385 Bacteria|Firmicutes|Bacilli|Bacillales 59.75801 + 356 order 2|1224|28211|356 Bacteria|Proteobacteria|Alphaproteobacteria|Rhizobacteria 10.52311 + 204455 order 2|1224|28211|204455 Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales 8.42263 + 2158 order 2157|28890|183925|2158 Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales 1.18789 + """ + # see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py + + # starting from https://github.com/sourmash-bio/sourmash/pull/1606/files + cami_results = [] + # build CAMI header info + header_title = "# Taxonomic Profiling Output" + version_info = "@Version:0.10.0" + program = "@__program__:sourmash" + sample_info = f"@SampleID:{self.query_info.query_name}" + # taxonomy_id = "@TaxonomyID:2021-10-01" # store this with LineageDB, maybe? + ranks = list(self.ranks) + # if 'strain' in ranks: + # ranks.remove('strain') + rank_info = f"@Ranks:{'|'.join(ranks)}" + header_lines = [header_title, sample_info, version_info, rank_info, program] + + # now build results in CAMI format + # order results by rank (descending), then percentage + for rank in ranks: + rank_results = self.summarized_lineage_results[rank] + for res in rank_results: + cami_info = res.as_cami_bioboxes() + cami_results.append(cami_info) + + return header_lines, cami_results + + \ No newline at end of file From 8f722afdbea6b249d6a684f6cb7b0dfd8789ecbf Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 17 Jan 2023 13:36:30 -0800 Subject: [PATCH 08/78] err if n_positions insufficient for provided lineage_str --- src/sourmash/tax/tax_utils.py | 38 ++++++++++++++++++++++++++++++++- tests/test_tax_utils.py | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 844fd1ceef..284742fe7f 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -310,7 +310,7 @@ def __post_init__(self): elif self.ranks: self._init_empty() - def _init_from_lineage_dict(self): +def _init_from_lineage_dict(self): """ Initialize from lineage dict, e.g. from lineages csv. Use NCBI taxids if available as '|'-separated 'taxpath' column. @@ -351,6 +351,42 @@ def _init_from_lineage_dict(self): name = None new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) +@dataclass(frozen=True, order=True) +class LINSLineageInfo(BaseLineageInfo): + """ + This LINSLineageInfo class usees the BaseLineageInfo methods for hierarchical LINS taxonomic 'ranks'. + + Inputs (at least one required): + n_lin_positions: the number of lineage positions + lineage_str: `;`- or `,`-separated LINS string + + If both `n_lin_positions` and `lineage_str` are provided, we will initialize a `LINSLineageInfo` + with the provided n_lin_positions, and fill positions with `lineage_str` values. If the number of + positions is less than provided lineages, initialization will fail. Otherwise, we will insert blanks + beyond provided data in `lineage_str`. + + LINSLineageInfo must be initialized with lineage or n_lin_positions + defau and no lineage names. + + Input lineage information is only used for initialization of the final `lineage` + and will not be used or compared in any other class methods. 
+ """ + ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead + n_lin_positions: int = None # init with this to make empty LINSLineageInfo with correct n_lin_positions + + def _init_from_lineage_str(self): + """ + Turn a ; or ,-separated set of lineages into a list of LineagePair objs. + """ + new_lineage = self.lineage_str.split(';') + if len(new_lineage) == 1: + new_lineage = self.lineage_str.split(',') + if self.n_lin_positions is not None: + self._init_ranks_from_n_lin_positions() + else: + n_lin_positions = len(new_lineage) + object.__setattr__(self, "n_lin_positions", n_lin_positions) + self._init_ranks_from_n_lin_positions() # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] # set lineage and filled_ranks diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 52e553b622..7f5f14e1ae 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1122,6 +1122,46 @@ def test_RankLineageInfo_init_lineage_str(): assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] +<<<<<<< HEAD +======= +def test_LINSLineageInfo_init_n_pos(): + n_pos = 5 + taxinf = LINSLineageInfo(n_lin_positions=n_pos) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.n_lin_positions == 5 + assert taxinf.zip_lineage()== ['', '', '', '', ''] + + +def test_LINSLineageInfo_init_n_pos_and_lineage_str(): + x = "0;0;1" + n_pos = 5 + taxinf = LINSLineageInfo(lineage_str=x, n_lin_positions=n_pos) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.n_lin_positions == 5 + assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] + + +def test_LINSLineageInfo_init_n_pos_and_lineage_str_fail(): + x = "0;0;1" + n_pos = 2 + with pytest.raises(ValueError) as exc: + LINSLineageInfo(lineage_str=x, n_lin_positions=n_pos) + print(str(exc)) + assert "Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'." 
in str(exc) + + +def test_LINSLineageInfo_init_lineage_str_only(): + x = "0,0,1" + taxinf = LINSLineageInfo(lineage_str=x) + print(taxinf.lineage) + print(taxinf.lineage_str) + assert taxinf.n_lin_positions == 3 + assert taxinf.zip_lineage()== ['0', '0', '1'] + + +>>>>>>> 41c44d44 (err if n_positions insufficient for provided lineage_str) def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] From 4ae1c7cae1f747510737e02e06810bc2cf1a7705 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 17 Jan 2023 13:38:09 -0800 Subject: [PATCH 09/78] wording --- src/sourmash/tax/tax_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 284742fe7f..81bb79bbd6 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -362,11 +362,10 @@ class LINSLineageInfo(BaseLineageInfo): If both `n_lin_positions` and `lineage_str` are provided, we will initialize a `LINSLineageInfo` with the provided n_lin_positions, and fill positions with `lineage_str` values. If the number of - positions is less than provided lineages, initialization will fail. Otherwise, we will insert blanks - beyond provided data in `lineage_str`. + positions is less than provided lineages, initialization will fail. Otherwise, blank entries will be + inserted beyond provided data in `lineage_str`. - LINSLineageInfo must be initialized with lineage or n_lin_positions - defau and no lineage names. + LINSLineageInfo must be initialized with lineage_str or n_lin_positions. Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. From 0db767e20bd49934ce148e5bbaeff55a55da1854 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 17 Jan 2023 14:12:17 -0800 Subject: [PATCH 10/78] test init fail --- src/sourmash/tax/tax_utils.py | 35 +++++++++++++++++++++++++++++++---- tests/test_tax_utils.py | 10 ++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 81bb79bbd6..378a3112bc 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -362,17 +362,42 @@ class LINSLineageInfo(BaseLineageInfo): If both `n_lin_positions` and `lineage_str` are provided, we will initialize a `LINSLineageInfo` with the provided n_lin_positions, and fill positions with `lineage_str` values. If the number of - positions is less than provided lineages, initialization will fail. Otherwise, blank entries will be - inserted beyond provided data in `lineage_str`. + positions is less than provided lineages, initialization will fail. Otherwise, we will insert blanks + beyond provided data in `lineage_str`. - LINSLineageInfo must be initialized with lineage_str or n_lin_positions. + LINSLineageInfo must be initialized with lineage or n_lin_positions. Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. """ ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead n_lin_positions: int = None # init with this to make empty LINSLineageInfo with correct n_lin_positions - + + def __post_init__(self): + "Initialize according to passed values" + # ranks must be tuple for hashability + if self.lineage_str is not None: + self._init_from_lineage_str() + elif self.n_lin_positions is not None: + self._init_empty() + else: + raise ValueError("Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'.") + + def _init_ranks_from_n_lin_positions(self): + new_ranks = [x for x in range(0,self.n_lin_positions)] # or str(x) -- does rank need to be str? 
+ object.__setattr__(self, "ranks", new_ranks) + + def _init_empty(self): + "initialize empty genome lineage" + # first, set ranks from n_positions + self._init_ranks_from_n_lin_positions() + new_lineage=[] + for rank in self.ranks: + new_lineage.append(LineagePair(rank=rank)) + # set lineage and filled_ranks (because frozen, need to do it this way) + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", ()) + def _init_from_lineage_str(self): """ Turn a ; or ,-separated set of lineages into a list of LineagePair objs. @@ -381,6 +406,8 @@ def _init_from_lineage_str(self): if len(new_lineage) == 1: new_lineage = self.lineage_str.split(',') if self.n_lin_positions is not None: + if self.n_lin_positions < len(new_lineage): + raise(ValueError("Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'.")) self._init_ranks_from_n_lin_positions() else: n_lin_positions = len(new_lineage) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 7f5f14e1ae..cfce5ea2d0 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1123,7 +1123,17 @@ def test_RankLineageInfo_init_lineage_str(): <<<<<<< HEAD +<<<<<<< HEAD +======= ======= +def test_LINSLineageInfo_init_fail(): + with pytest.raises(ValueError) as exc: + LINSLineageInfo() + print(str(exc)) + assert "Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'." in str(exc) + + +>>>>>>> b2b1a688 (test init fail) def test_LINSLineageInfo_init_n_pos(): n_pos = 5 taxinf = LINSLineageInfo(n_lin_positions=n_pos) From a3cf4a1e634d724c0b5ca2e6cd3ee3b83b576242 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 11:54:34 -0800 Subject: [PATCH 11/78] fix --- tests/test_tax_utils.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index cfce5ea2d0..7e7b81969c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -12,9 +12,9 @@ from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, collect_gather_csvs, check_and_load_gather_csvs, + LineagePair, QueryInfo, GatherRow, TaxResult, QueryTaxResult, SummarizedGatherResult, ClassificationResult, - QueryInfo, GatherRow, TaxResult, QueryTaxResult, - BaseLineageInfo, RankLineageInfo, LineagePair, + BaseLineageInfo, RankLineageInfo, LINSLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, write_lineage_sample_frac, LineageDB, LineageDB_Sqlite, MultiLineageDB) @@ -1122,10 +1122,6 @@ def test_RankLineageInfo_init_lineage_str(): assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] -<<<<<<< HEAD -<<<<<<< HEAD -======= -======= def test_LINSLineageInfo_init_fail(): with pytest.raises(ValueError) as exc: LINSLineageInfo() @@ -1133,7 +1129,6 @@ def test_LINSLineageInfo_init_fail(): assert "Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'." in str(exc) ->>>>>>> b2b1a688 (test init fail) def test_LINSLineageInfo_init_n_pos(): n_pos = 5 taxinf = LINSLineageInfo(n_lin_positions=n_pos) @@ -1171,7 +1166,6 @@ def test_LINSLineageInfo_init_lineage_str_only(): assert taxinf.zip_lineage()== ['0', '0', '1'] ->>>>>>> 41c44d44 (err if n_positions insufficient for provided lineage_str) def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] From 7386e5b7e9ef4fe028419e8565eb2b8e4f9ee296 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 11:55:30 -0800 Subject: [PATCH 12/78] fix2 --- src/sourmash/tax/tax_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 378a3112bc..0f45b595e3 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -310,7 +310,7 @@ def __post_init__(self): elif self.ranks: self._init_empty() -def _init_from_lineage_dict(self): + def _init_from_lineage_dict(self): """ Initialize from lineage dict, e.g. from lineages csv. Use NCBI taxids if available as '|'-separated 'taxpath' column. From 6b5f2cdb58ba1f42849a5ca6c2adc4d31e0dfed1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 11:58:25 -0800 Subject: [PATCH 13/78] resolve issues from merge --- src/sourmash/tax/tax_utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0f45b595e3..2dd4954d9b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -351,6 +351,12 @@ def _init_from_lineage_dict(self): name = None new_lineage[rank_idx] = LineagePair(rank=rank, name=name, taxid=taxid) + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", filled_ranks) + @dataclass(frozen=True, order=True) class LINSLineageInfo(BaseLineageInfo): """ From 0f882d7eb18b2a8a83e61b44f10b0a3741bf901d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 12:14:13 -0800 Subject: [PATCH 14/78] test for missing taxids; taxpath shorter than provided rank names --- tests/test_tax_utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 52e553b622..a254818415 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1219,6 +1219,18 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): assert taxinf.zip_taxid()== ['1', '', '', '', '', '', '', ''] +def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): + # if there's no name, we don't store the taxpath. Is this desired behavior? + x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} + taxinf = RankLineageInfo(lineage_dict=x) + print("ranks: ", taxinf.ranks) + print("lineage: ", taxinf.lineage) + print("zipped lineage: ", taxinf.zip_lineage()) + print("zipped taxids: ", taxinf.zip_taxid()) + assert taxinf.zip_lineage()== ['name1', 'name2', 'name3', '', '', '', '', ''] + assert taxinf.zip_taxid()== ['', '2', '', '', '', '', '', ''] + + def test_RankLineageInfo_init_lineage_dict_taxpath_too_long(): x = {'superkingdom': 'name1', 'class': 'name2', 'taxpath': '1||2||||||||||'} with pytest.raises(ValueError) as exc: From 842ba397245ed9f6744ea2f08551d7f4383996ae Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 13:01:56 -0800 Subject: [PATCH 15/78] clarify comment --- tests/test_tax_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a254818415..11db0f4d5a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1209,7 +1209,8 @@ def test_RankLineageInfo_init_lineage_dict_missing_rank_with_taxpath(): def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): - # if there's no name, we don't store the taxpath. Is this desired behavior? 
+ # If there's no name, we don't report the taxpath, because lineage is not "filled". + # Is this desired behavior? x = {'superkingdom': 'name1', 'taxpath': '1||2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) From ce3c9919bc91d8b9a0fade2f05c3080b9d14cc08 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Feb 2023 13:15:01 -0800 Subject: [PATCH 16/78] clarify comment2 --- tests/test_tax_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 11db0f4d5a..db4a277363 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1221,7 +1221,8 @@ def test_RankLineageInfo_init_lineage_dict_name_taxpath_mismatch(): def test_RankLineageInfo_init_lineage_dict_name_taxpath_missing_taxids(): - # if there's no name, we don't store the taxpath. Is this desired behavior? + # If there's no name, we don't report the taxpath, because lineage is not "filled". + # Is this desired behavior? x = {'superkingdom': 'name1', 'phylum': "name2", "class": "name3", 'taxpath': '|2'} taxinf = RankLineageInfo(lineage_dict=x) print("ranks: ", taxinf.ranks) From 87f7e500a79c074e880451c8479e482fb506e0b7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 08:12:28 -0800 Subject: [PATCH 17/78] undelete line --- tests/test_tax.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index 85d6970768..f852825068 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -49,6 +49,7 @@ def test_metagenome_stdout_0(runtmp): assert 'test1,order,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,order,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,order,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out + assert 'test1,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000' in c.last_result.out assert 'test1,family,0.088,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae,md5,test1.sig,0.058,442000' in c.last_result.out assert 'test1,family,0.796,unclassified,md5,test1.sig,0.869,3990000' in c.last_result.out assert 'test1,genus,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella,md5,test1.sig,0.057,444000' in c.last_result.out From 9b139b66c42a0ff1fe906d946506ced644bcb48a Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 08:34:51 -0800 Subject: [PATCH 18/78] add filled_pos --- src/sourmash/tax/tax_utils.py | 19 +++++++++++-------- tests/test_tax_utils.py | 7 ++++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 2dd4954d9b..e501589089 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -159,12 +159,12 @@ def _init_from_lineage_tuples(self): new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) else: new_lineage[rank_idx] = lin_tup - + # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) def _init_from_lineage_str(self): """ @@ -177,7 +177,7 @@ def _init_from_lineage_str(self): # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name] object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) def zip_lineage(self, truncate_empty=False): """ @@ -355,7 +355,7 @@ def _init_from_lineage_dict(self): filled_ranks = [a.rank for a in new_lineage if a.name] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) @dataclass(frozen=True, order=True) class LINSLineageInfo(BaseLineageInfo): @@ -390,7 +390,7 @@ def __post_init__(self): raise ValueError("Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'.") def _init_ranks_from_n_lin_positions(self): - new_ranks = [x for x in range(0,self.n_lin_positions)] # or str(x) -- does rank need to be str? 
+ new_ranks = [x for x in range(0, self.n_lin_positions)] object.__setattr__(self, "ranks", new_ranks) def _init_empty(self): @@ -403,6 +403,7 @@ def _init_empty(self): # set lineage and filled_ranks (because frozen, need to do it this way) object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", ()) + object.__setattr__(self, "filled_pos", 0) def _init_from_lineage_str(self): """ @@ -419,11 +420,13 @@ def _init_from_lineage_str(self): n_lin_positions = len(new_lineage) object.__setattr__(self, "n_lin_positions", n_lin_positions) self._init_ranks_from_n_lin_positions() - # build list of filled ranks + + # build lineage and filled_pos, filled_ranks + new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] filled_ranks = [a.rank for a in new_lineage if a.name] - # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) - object.__setattr__(self, "filled_ranks", filled_ranks) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) + object.__setattr__(self, "filled_pos", len(filled_ranks)) def get_ident(ident, *, diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 2921eaad6e..3d8dfd4e61 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1136,7 +1136,8 @@ def test_LINSLineageInfo_init_n_pos(): print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 assert taxinf.zip_lineage()== ['', '', '', '', ''] - + assert taxinf.filled_ranks == () + assert taxinf.filled_pos == 0 def test_LINSLineageInfo_init_n_pos_and_lineage_str(): x = "0;0;1" @@ -1146,6 +1147,8 @@ def test_LINSLineageInfo_init_n_pos_and_lineage_str(): print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] + assert taxinf.filled_ranks == (0,1,2) + assert taxinf.filled_pos == 3 def test_LINSLineageInfo_init_n_pos_and_lineage_str_fail(): @@ -1164,6 +1167,8 @@ def 
test_LINSLineageInfo_init_lineage_str_only(): print(taxinf.lineage_str) assert taxinf.n_lin_positions == 3 assert taxinf.zip_lineage()== ['0', '0', '1'] + assert taxinf.filled_ranks == (0,1,2) + assert taxinf.filled_pos == 3 def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): From d72df571ef734e76d94d60e77568abf5b938161f Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 10:53:12 -0800 Subject: [PATCH 19/78] read LIN into LineageDB --- src/sourmash/tax/tax_utils.py | 55 ++++++++++++++++++++++------------- tests/test_tax_utils.py | 36 +++++++++++++++++++++++ 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e501589089..d7363f8388 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -797,7 +797,7 @@ def __bool__(self): @classmethod def load(cls, filename, *, delimiter=',', force=False, - keep_full_identifiers=False, keep_identifier_versions=True): + keep_full_identifiers=False, keep_identifier_versions=True, LINS_taxonomy=False): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -822,6 +822,9 @@ def load(cls, filename, *, delimiter=',', force=False, if not header: raise ValueError(f'cannot read taxonomy assignments from {filename}') + if LINS_taxonomy and "LIN" not in header: + raise ValueError(f"'LIN' column not found: cannot read LIN taxonomy assignments from {filename}.") + identifier = "ident" # check for ident/identifier, handle some common alternatives if "ident" not in header: @@ -839,29 +842,40 @@ def load(cls, filename, *, delimiter=',', force=False, header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') - # is "strain" an available rank? 
- if "strain" in header: - include_strain=True - # check that all ranks are in header - ranks = list(RankLineageInfo().taxlist) - if not include_strain: - ranks.remove('strain') - if not set(ranks).issubset(header): - # for now, just raise err if not all ranks are present. - # in future, we can define `ranks` differently if desired - # return them from this function so we can check the `available` ranks - raise ValueError('Not all taxonomy ranks present') + if not LINS_taxonomy: + # is "strain" an available rank? + if "strain" in header: + include_strain=True + # check that all ranks are in header + ranks = list(RankLineageInfo().taxlist) + if not include_strain: + ranks.remove('strain') + if not set(ranks).issubset(header): + # for now, just raise err if not all ranks are present. + # in future, we can define `ranks` differently if desired + # return them from this function so we can check the `available` ranks + raise ValueError('Not all taxonomy ranks present') assignments = {} num_rows = 0 n_species = 0 n_strains = 0 + n_pos = None # now parse and load lineages for n, row in enumerate(r): num_rows += 1 - # read lineage from row dictionary - lineageInfo = RankLineageInfo(lineage_dict=row) + if LINS_taxonomy: + lineageInfo = LINSLineageInfo(lineage_str=row['LIN']) + if n_pos is not None: + if lineageInfo.n_lin_positions != n_pos: + raise ValueError(f"For taxonomic summarization, all LIN assignments must use the same number of LIN positions.") + else: + n_pos = lineageInfo.n_lin_positions # set n_pos with first entry + ranks=lineageInfo.ranks + else: + # read lineage from row dictionary + lineageInfo = RankLineageInfo(lineage_dict=row) # get identifier ident = row[identifier] @@ -881,11 +895,12 @@ def load(cls, filename, *, delimiter=',', force=False, else: assignments[ident] = lineage - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 + if not LINS_taxonomy: + if lineage[-1].rank == 
'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 return LineageDB(assignments, ranks) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 3d8dfd4e61..764d92a919 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -574,6 +574,42 @@ def test_load_taxonomy_csv(): assert len(tax_assign) == 6 # should have read 6 rows +def test_load_taxonomy_csv_LIN(): + taxonomy_csv = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax_assign = MultiLineageDB.load([taxonomy_csv], LINS_taxonomy=True) + print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] + assert len(tax_assign) == 6 # should have read 6 rows + print(tax_assign.available_ranks) + assert tax_assign.available_ranks == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19} + + +def test_load_taxonomy_csv_LIN_fail(): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + with pytest.raises(ValueError) as exc: + MultiLineageDB.load([taxonomy_csv], LINS_taxonomy=True) + assert f"'LIN' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." 
in str(exc.value) + + +def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): + taxonomy_csv = utils.get_test_data('tax/test.LINS-taxonomy.csv') + mimatchLIN_csv = runtmp.output('mmLINS-taxonomy.csv') + with open(mimatchLIN_csv, 'w') as mm: + tax21=[] + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + for n, taxline in enumerate(tax): + if n == 2: # add ;0 to a LIN + taxlist = taxline.split(',') + taxlist[2] += ';0' # add 21st position to LIN + tax21.append(",".join(taxlist)) + else: + tax21.append(taxline) + mm.write("\n".join(tax21)) + with pytest.raises(ValueError) as exc: + MultiLineageDB.load([mimatchLIN_csv], LINS_taxonomy=True) + assert "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." in str(exc.value) + + def test_load_taxonomy_csv_gzip(runtmp): # test loading a gzipped taxonomy csv file taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') From f39aa54e21a6ca618f73c0c65cdd479fe49d780c Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 10:54:06 -0800 Subject: [PATCH 20/78] actually add LIN test taxonomy --- tests/test-data/tax/test.LINS-taxonomy.csv | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/test-data/tax/test.LINS-taxonomy.csv diff --git a/tests/test-data/tax/test.LINS-taxonomy.csv b/tests/test-data/tax/test.LINS-taxonomy.csv new file mode 100644 index 0000000000..b9afe550d0 --- /dev/null +++ b/tests/test-data/tax/test.LINS-taxonomy.csv @@ -0,0 +1,7 @@ +ident,TaxID,LIN,superkingdom,phylum,class,order,family,genus,species,subspecies,strain +GCF_000010525.1,438753.0,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Rhizobiales,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,,ORS 571 +GCF_000007365.1,198804.0,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Sg +GCF_000007725.1,224915.0,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Bp (Baizongia pistaciae) +GCF_000009605.1,107806.0,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,APS +GCF_000021065.1,561501.0,1;0;1;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Tuc7 +GCF_000021085.1,563178.0,1;0;1;0;0;0;0;0;0;0;0;0;0;1;1;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,5A From 360cfd984d0631054097c10a99716878dc399b86 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 10 Feb 2023 14:31:29 -0800 Subject: [PATCH 21/78] allow LIN with tax metagenome --- src/sourmash/cli/tax/genome.py | 11 ++++ src/sourmash/cli/tax/metagenome.py | 11 ++++ src/sourmash/tax/__main__.py | 5 +- src/sourmash/tax/tax_utils.py | 68 ++++++++++++++++------ tests/test-data/tax/test.LINS-taxonomy.csv | 14 ++--- tests/test_tax.py | 40 +++++++++++++ tests/test_tax_utils.py | 19 +++--- 7 files changed, 131 insertions(+), 37 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 54c40fd681..20764f34f6 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -90,6 +90,14 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past survivable errors in loading taxonomy database or gather results', ) + subparser.add_argument( + '--LIN-taxonomy', action='store_true', + help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + ) + subparser.add_argument( + '--LIN-position', type=int, + help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' 
+ ) add_tax_threshold_arg(subparser, 0.1) @@ -100,6 +108,9 @@ def main(args): if len(args.output_format) > 1: if args.output_base == "-": raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + if args.LIN_taxonomy: + if args.LIN_position: + args.rank = args.LIN_position if not args.rank: if any(x in ["krona"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona output format.") diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index df81789360..addee9ce31 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -78,6 +78,14 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past errors in taxonomy database loading', ) + subparser.add_argument( + '--LIN-taxonomy', action='store_true', + help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + ) + subparser.add_argument( + '--LIN-position', type=int, + help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' 
+ ) def main(args): import sourmash @@ -86,6 +94,9 @@ def main(args): if len(args.output_format) > 1: if args.output_base == "-": raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + if args.LIN_taxonomy: + if args.LIN_position: + args.rank = args.LIN_position if not args.rank: if any(x in ["krona", "lineage_summary"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 1d127cefb7..0f2c341922 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -72,7 +72,7 @@ def metagenome(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force) + force=args.force, LIN_taxonomy=args.LIN_taxonomy) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -93,6 +93,7 @@ def metagenome(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, + LIN_taxonomy=args.LIN_taxonomy, ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -174,7 +175,7 @@ def genome(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force) + force=args.force, LIN_taxonomy=args.LIN_taxonomy) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d7363f8388..a2714631a1 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -22,7 +22,7 @@ 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' 'format_for_krona', 
'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'MultiLineageDB', 'RankLineageInfo'] + 'MultiLineageDB', 'RankLineageInfo', 'LINSLineageInfo'] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file @@ -161,7 +161,7 @@ def _init_from_lineage_tuples(self): new_lineage[rank_idx] = lin_tup # build list of filled ranks - filled_ranks = [a.rank for a in new_lineage if a.name] + filled_ranks = [a.rank for a in new_lineage if a.name is not None] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) @@ -175,7 +175,7 @@ def _init_from_lineage_str(self): new_lineage = self.lineage_str.split(',') new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] # build list of filled ranks - filled_ranks = [a.rank for a in new_lineage if a.name] + filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) @@ -377,20 +377,23 @@ class LINSLineageInfo(BaseLineageInfo): and will not be used or compared in any other class methods. 
""" ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead + lineage: tuple = None n_lin_positions: int = None # init with this to make empty LINSLineageInfo with correct n_lin_positions def __post_init__(self): "Initialize according to passed values" # ranks must be tuple for hashability - if self.lineage_str is not None: + if self.lineage is not None: + self._init_from_lineage_tuples() + elif self.lineage_str is not None: self._init_from_lineage_str() elif self.n_lin_positions is not None: self._init_empty() else: - raise ValueError("Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'.") + raise ValueError("Please initialize 'LINSLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'.") def _init_ranks_from_n_lin_positions(self): - new_ranks = [x for x in range(0, self.n_lin_positions)] + new_ranks = [str(x) for x in range(0, self.n_lin_positions)] object.__setattr__(self, "ranks", new_ranks) def _init_empty(self): @@ -423,11 +426,31 @@ def _init_from_lineage_str(self): # build lineage and filled_pos, filled_ranks new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] - filled_ranks = [a.rank for a in new_lineage if a.name] + filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "filled_pos", len(filled_ranks)) + def _init_from_lineage_tuples(self): + 'initialize from tuple/list of LineagePairs, building ranks as you go' + new_lineage = [] + # check this is a list or tuple of lineage tuples: + for lin_tup in self.lineage: + if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): + raise ValueError(f"{lin_tup} is not LineagePair.") + # make sure we're adding tax_utils.LineagePairs, not lca_utils.LineagePairs for consistency + if isinstance(lin_tup, lca_utils.LineagePair): + 
new_lineage.append(LineagePair(rank=lin_tup.rank, name=lin_tup.name)) + else: + new_lineage.append(lin_tup) + # build list of filled ranks + filled_ranks = [a.rank for a in new_lineage if a.name is not None] + # set lineage and filled_ranks + object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) + object.__setattr__(self, "filled_pos", len(filled_ranks)) + object.__setattr__(self, "ranks", tuple(filled_ranks)) + def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): @@ -477,7 +500,8 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, skip_idents = None, fail_on_missing_taxonomy=False, - keep_full_identifiers=False, keep_identifier_versions=False): + keep_full_identifiers=False, keep_identifier_versions=False, + LIN_taxonomy=False): "Load a single gather csv" if not seen_queries: seen_queries=set() @@ -503,7 +527,8 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force taxres = TaxResult(raw=gatherRow, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions) taxres.get_match_lineage(tax_assignments=tax_assignments, skip_idents=skip_idents, - fail_on_missing_taxonomy=fail_on_missing_taxonomy) + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + LIN_taxonomy=LIN_taxonomy) # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new @@ -519,7 +544,7 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False, - keep_full_identifiers=False,keep_identifier_versions=False): + keep_full_identifiers=False,keep_identifier_versions=False, LIN_taxonomy=False): ''' Load gather csvs, checking 
for empties and ids missing from taxonomic assignments. ''' @@ -537,7 +562,8 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon seen_queries=gather_results.keys(), force=force, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions = keep_identifier_versions, - fail_on_missing_taxonomy=fail_on_missing_taxonomy) + fail_on_missing_taxonomy=fail_on_missing_taxonomy, + LIN_taxonomy=LIN_taxonomy) except ValueError as exc: if force: if "found in more than one CSV" in str(exc): @@ -797,7 +823,7 @@ def __bool__(self): @classmethod def load(cls, filename, *, delimiter=',', force=False, - keep_full_identifiers=False, keep_identifier_versions=True, LINS_taxonomy=False): + keep_full_identifiers=False, keep_identifier_versions=True, LIN_taxonomy=False): """ Load a taxonomy assignment CSV file into a LineageDB. @@ -822,7 +848,7 @@ def load(cls, filename, *, delimiter=',', force=False, if not header: raise ValueError(f'cannot read taxonomy assignments from {filename}') - if LINS_taxonomy and "LIN" not in header: + if LIN_taxonomy and "LIN" not in header: raise ValueError(f"'LIN' column not found: cannot read LIN taxonomy assignments from {filename}.") identifier = "ident" @@ -842,7 +868,7 @@ def load(cls, filename, *, delimiter=',', force=False, header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') - if not LINS_taxonomy: + if not LIN_taxonomy: # is "strain" an available rank? 
if "strain" in header: include_strain=True @@ -865,7 +891,7 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): num_rows += 1 - if LINS_taxonomy: + if LIN_taxonomy: lineageInfo = LINSLineageInfo(lineage_str=row['LIN']) if n_pos is not None: if lineageInfo.n_lin_positions != n_pos: @@ -895,7 +921,7 @@ def load(cls, filename, *, delimiter=',', force=False, else: assignments[ident] = lineage - if not LINS_taxonomy: + if not LIN_taxonomy: if lineage[-1].rank == 'species': n_species += 1 elif lineage[-1].rank == 'strain': @@ -1403,10 +1429,11 @@ class TaxResult: query_name: str = field(init=False) query_info: QueryInfo = field(init=False) match_ident: str = field(init=False) - lineageInfo: RankLineageInfo = RankLineageInfo() + lineageInfo: RankLineageInfo = RankLineageInfo() #None#field(init=False) #RankLineageInfo() skipped_ident: bool = False missed_ident: bool = False match_lineage_attempted: bool = False + LIN_taxonomy: bool = False def __post_init__(self): self.get_ident() @@ -1439,13 +1466,16 @@ def get_ident(self): self.match_ident = self.match_ident.split('.')[0] - def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): + def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False, LIN_taxonomy=False): if skip_idents and self.match_ident in skip_idents: self.skipped_ident = True else: lin = tax_assignments.get(self.match_ident) if lin: - self.lineageInfo = RankLineageInfo(lineage=lin) + if LIN_taxonomy: + self.lineageInfo = LINSLineageInfo(lineage = lin) + else: + self.lineageInfo = RankLineageInfo(lineage = lin) else: self.missed_ident=True self.match_lineage_attempted = True diff --git a/tests/test-data/tax/test.LINS-taxonomy.csv b/tests/test-data/tax/test.LINS-taxonomy.csv index b9afe550d0..7185e5679c 100644 --- a/tests/test-data/tax/test.LINS-taxonomy.csv +++ b/tests/test-data/tax/test.LINS-taxonomy.csv @@ -1,7 +1,7 @@ 
-ident,TaxID,LIN,superkingdom,phylum,class,order,family,genus,species,subspecies,strain -GCF_000010525.1,438753.0,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Rhizobiales,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,,ORS 571 -GCF_000007365.1,198804.0,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Sg -GCF_000007725.1,224915.0,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Bp (Baizongia pistaciae) -GCF_000009605.1,107806.0,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,APS -GCF_000021065.1,561501.0,1;0;1;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,Tuc7 -GCF_000021085.1,563178.0,1;0;1;0;0;0;0;0;0;0;0;0;0;1;1;0;0;0;0;0,Bacteria,Proteobacteria,,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,5A +ident,LIN +GCF_001881345.1,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 +GCF_009494285.1,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 +GCF_013368705.1,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 +GCF_003471795.1,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 +GCF_000017325.1,1;0;1;0;0;0;0;0;0;0;0;0;0;1;0;0;0;0;0;0 +GCF_000021665.1,1;0;1;0;0;0;0;0;0;0;0;0;0;1;1;0;0;0;0;0 diff --git a/tests/test_tax.py b/tests/test_tax.py index f852825068..e9a56f7db5 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3287,3 +3287,43 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert c['2'] == 5 assert c['6'] == 1 assert c['1'] == 11 + + +def test_metagenome_LINS(runtmp): + # test basic metagenome with LIN taxonomy + # get/design better test data for this? 
+ c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + # 0th rank/position + assert "test1,0,0.089,1,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out + assert "test1,0,0.088,0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out + assert "test1,0,0.028,2,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out + assert "test1,0,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + # 1st rank/position + assert "test1,1,0.089,1;0,md5,test1.sig,0.057,444000,0.925,0" in c.last_result.out + assert "test1,1,0.088,0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out + assert "test1,1,0.028,2;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out + assert "test1,1,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + # 2nd rank/position + assert "test1,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out + assert "test1,2,0.078,1;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out + assert "test1,2,0.028,2;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out + assert "test1,2,0.011,1;0;1,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out + assert "test1,2,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + # 19th rank/position + assert "test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" in c.last_result.out + assert "test1,19,0.078,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.050,390000,0.921,0" in c.last_result.out + assert 
"test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out + assert "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out + assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 764d92a919..1930d6dc95 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -576,18 +576,19 @@ def test_load_taxonomy_csv(): def test_load_taxonomy_csv_LIN(): taxonomy_csv = utils.get_test_data('tax/test.LINS-taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], LINS_taxonomy=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], LIN_taxonomy=True) print("taxonomy assignments: \n", tax_assign) - assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] + assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] + #assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] assert len(tax_assign) == 6 # should have read 6 rows print(tax_assign.available_ranks) - assert tax_assign.available_ranks == {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19} + assert tax_assign.available_ranks == {str(x) for x in range(0,20)} def test_load_taxonomy_csv_LIN_fail(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') with pytest.raises(ValueError) as exc: - MultiLineageDB.load([taxonomy_csv], LINS_taxonomy=True) + MultiLineageDB.load([taxonomy_csv], LIN_taxonomy=True) assert f"'LIN' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." 
in str(exc.value) @@ -600,13 +601,13 @@ def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): for n, taxline in enumerate(tax): if n == 2: # add ;0 to a LIN taxlist = taxline.split(',') - taxlist[2] += ';0' # add 21st position to LIN + taxlist[1] += ';0' # add 21st position to LIN tax21.append(",".join(taxlist)) else: tax21.append(taxline) mm.write("\n".join(tax21)) with pytest.raises(ValueError) as exc: - MultiLineageDB.load([mimatchLIN_csv], LINS_taxonomy=True) + MultiLineageDB.load([mimatchLIN_csv], LIN_taxonomy=True) assert "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." in str(exc.value) @@ -1162,7 +1163,7 @@ def test_LINSLineageInfo_init_fail(): with pytest.raises(ValueError) as exc: LINSLineageInfo() print(str(exc)) - assert "Please initialize 'LINSLineageInfo' with 'lineage_str' or 'n_lin_positions'." in str(exc) + assert "Please initialize 'LINSLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'." in str(exc) def test_LINSLineageInfo_init_n_pos(): @@ -1183,7 +1184,7 @@ def test_LINSLineageInfo_init_n_pos_and_lineage_str(): print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] - assert taxinf.filled_ranks == (0,1,2) + assert taxinf.filled_ranks == ("0","1","2") assert taxinf.filled_pos == 3 @@ -1203,7 +1204,7 @@ def test_LINSLineageInfo_init_lineage_str_only(): print(taxinf.lineage_str) assert taxinf.n_lin_positions == 3 assert taxinf.zip_lineage()== ['0', '0', '1'] - assert taxinf.filled_ranks == (0,1,2) + assert taxinf.filled_ranks == ("0","1","2") assert taxinf.filled_pos == 3 From ee2bb20540b62b37e3484998485a8e37779b6a3a Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 13 Feb 2023 08:39:15 -0800 Subject: [PATCH 22/78] actually save conflict resolution --- src/sourmash/tax/tax_utils.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6caa993261..a2714631a1 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -164,11 +164,7 @@ def _init_from_lineage_tuples(self): filled_ranks = [a.rank for a in new_lineage if a.name is not None] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) -<<<<<<< HEAD object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) -======= - object.__setattr__(self, "filled_ranks", filled_ranks) ->>>>>>> latest def _init_from_lineage_str(self): """ @@ -359,7 +355,6 @@ def _init_from_lineage_dict(self): filled_ranks = [a.rank for a in new_lineage if a.name] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) -<<<<<<< HEAD object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) @dataclass(frozen=True, order=True) @@ -455,9 +450,6 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) object.__setattr__(self, "filled_pos", len(filled_ranks)) object.__setattr__(self, "ranks", tuple(filled_ranks)) -======= - object.__setattr__(self, "filled_ranks", filled_ranks) ->>>>>>> latest def get_ident(ident, *, @@ -876,7 +868,6 @@ def load(cls, filename, *, delimiter=',', force=False, header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') -<<<<<<< HEAD if not LIN_taxonomy: # is "strain" an available rank? 
if "strain" in header: @@ -890,20 +881,6 @@ def load(cls, filename, *, delimiter=',', force=False, # in future, we can define `ranks` differently if desired # return them from this function so we can check the `available` ranks raise ValueError('Not all taxonomy ranks present') -======= - # is "strain" an available rank? - if "strain" in header: - include_strain=True - # check that all ranks are in header - ranks = list(RankLineageInfo().taxlist) - if not include_strain: - ranks.remove('strain') - if not set(ranks).issubset(header): - # for now, just raise err if not all ranks are present. - # in future, we can define `ranks` differently if desired - # return them from this function so we can check the `available` ranks - raise ValueError('Not all taxonomy ranks present') ->>>>>>> latest assignments = {} num_rows = 0 @@ -914,7 +891,6 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): num_rows += 1 -<<<<<<< HEAD if LIN_taxonomy: lineageInfo = LINSLineageInfo(lineage_str=row['LIN']) if n_pos is not None: @@ -926,10 +902,6 @@ def load(cls, filename, *, delimiter=',', force=False, else: # read lineage from row dictionary lineageInfo = RankLineageInfo(lineage_dict=row) -======= - # read lineage from row dictionary - lineageInfo = RankLineageInfo(lineage_dict=row) ->>>>>>> latest # get identifier ident = row[identifier] From 9c727409410704d5e32cf87e9fec97e13ec33f4c Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 13 Feb 2023 11:38:08 -0800 Subject: [PATCH 23/78] add init LINSLineageInfo from tuples (for LineageDB compatibility) --- src/sourmash/tax/tax_utils.py | 13 +++++++----- tests/test_tax_utils.py | 39 ++++++++++++++++++++++++++++++++--- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index a2714631a1..1f10c7fe94 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -406,7 +406,7 @@ def _init_empty(self): # set lineage and filled_ranks (because frozen, need to do it this way) object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", ()) - object.__setattr__(self, "filled_pos", 0) + object.__setattr__(self, "n_filled_pos", 0) def _init_from_lineage_str(self): """ @@ -424,16 +424,17 @@ def _init_from_lineage_str(self): object.__setattr__(self, "n_lin_positions", n_lin_positions) self._init_ranks_from_n_lin_positions() - # build lineage and filled_pos, filled_ranks + # build lineage and n_filled_pos, filled_ranks new_lineage = [ LineagePair(rank=rank, name=n) for (rank, n) in zip_longest(self.ranks, new_lineage) ] filled_ranks = [a.rank for a in new_lineage if a.name is not None] object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) - object.__setattr__(self, "filled_pos", len(filled_ranks)) + object.__setattr__(self, "n_filled_pos", len(filled_ranks)) def _init_from_lineage_tuples(self): 'initialize from tuple/list of LineagePairs, building ranks as you go' new_lineage = [] + ranks = [] # check this is a list or tuple of lineage tuples: for lin_tup in self.lineage: if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): @@ -443,13 +444,15 @@ def _init_from_lineage_tuples(self): new_lineage.append(LineagePair(rank=lin_tup.rank, name=lin_tup.name)) else: new_lineage.append(lin_tup) + ranks.append(lin_tup.rank) # build list of 
filled ranks filled_ranks = [a.rank for a in new_lineage if a.name is not None] # set lineage and filled_ranks object.__setattr__(self, "lineage", tuple(new_lineage)) + object.__setattr__(self, "n_lin_positions", len(new_lineage)) + object.__setattr__(self, "ranks", tuple(ranks)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) - object.__setattr__(self, "filled_pos", len(filled_ranks)) - object.__setattr__(self, "ranks", tuple(filled_ranks)) + object.__setattr__(self, "n_filled_pos", len(filled_ranks)) def get_ident(ident, *, diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 1930d6dc95..71f4f5240f 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1174,7 +1174,8 @@ def test_LINSLineageInfo_init_n_pos(): assert taxinf.n_lin_positions == 5 assert taxinf.zip_lineage()== ['', '', '', '', ''] assert taxinf.filled_ranks == () - assert taxinf.filled_pos == 0 + assert taxinf.n_filled_pos == 0 + def test_LINSLineageInfo_init_n_pos_and_lineage_str(): x = "0;0;1" @@ -1185,7 +1186,7 @@ def test_LINSLineageInfo_init_n_pos_and_lineage_str(): assert taxinf.n_lin_positions == 5 assert taxinf.zip_lineage()== ['0', '0', '1', '', ''] assert taxinf.filled_ranks == ("0","1","2") - assert taxinf.filled_pos == 3 + assert taxinf.n_filled_pos == 3 def test_LINSLineageInfo_init_n_pos_and_lineage_str_fail(): @@ -1205,7 +1206,39 @@ def test_LINSLineageInfo_init_lineage_str_only(): assert taxinf.n_lin_positions == 3 assert taxinf.zip_lineage()== ['0', '0', '1'] assert taxinf.filled_ranks == ("0","1","2") - assert taxinf.filled_pos == 3 + assert taxinf.n_filled_pos == 3 + + +def test_LINSLineageInfo_init_not_lineagepair(): + lin_tups = (("rank1", "name1"),) + with pytest.raises(ValueError) as exc: + LINSLineageInfo(lineage=lin_tups) + print(str(exc)) + assert "is not LineagePair" in str(exc) + + +def test_LINSLineageInfo_init_lineagepair(): + lin_tups = (LineagePair("rank1", "name1"), LineagePair("rank2", None),) + taxinf = 
LINSLineageInfo(lineage=lin_tups) + print(taxinf.lineage) + assert taxinf.n_lin_positions == 2 + assert taxinf.zip_lineage()== ["name1", ""] + assert taxinf.zip_lineage(truncate_empty=True)== ["name1"] + assert taxinf.filled_ranks == ("rank1",) + assert taxinf.ranks == ("rank1", "rank2") + assert taxinf.n_filled_pos == 1 + + +def test_LINSLineageInfo_init_lca_lineagepair(): + lin_tups = (lca_utils.LineagePair("rank1", "name1"), lca_utils.LineagePair("rank2", None),) + taxinf = LINSLineageInfo(lineage=lin_tups) + print(taxinf.lineage) + assert taxinf.n_lin_positions == 2 + assert taxinf.zip_lineage()== ["name1", ""] + assert taxinf.zip_lineage(truncate_empty=True)== ["name1"] + assert taxinf.filled_ranks == ("rank1",) + assert taxinf.ranks == ("rank1", "rank2") + assert taxinf.n_filled_pos == 1 def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): From 97e52cb58a7d0cdffa91853b40b260ecbc798375 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 11:43:29 -0800 Subject: [PATCH 24/78] naming --- src/sourmash/tax/tax_utils.py | 18 +++++++++--------- tests/test_tax_utils.py | 36 +++++++++++++++++------------------ 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 1f10c7fe94..b7e4c13aab 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -22,7 +22,7 @@ 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' 'format_for_krona', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'MultiLineageDB', 'RankLineageInfo', 'LINSLineageInfo'] + 'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo'] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file @@ -358,27 +358,27 @@ def _init_from_lineage_dict(self): object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) @dataclass(frozen=True, order=True) -class LINSLineageInfo(BaseLineageInfo): +class 
LINLineageInfo(BaseLineageInfo): """ - This LINSLineageInfo class usees the BaseLineageInfo methods for hierarchical LINS taxonomic 'ranks'. + This LINLineageInfo class uses the BaseLineageInfo methods for hierarchical LIN taxonomic 'ranks'. Inputs (at least one required): n_lin_positions: the number of lineage positions lineage_str: `;`- or `,`-separated LINS string - If both `n_lin_positions` and `lineage_str` are provided, we will initialize a `LINSLineageInfo` + If both `n_lin_positions` and `lineage_str` are provided, we will initialize a `LINLineageInfo` with the provided n_lin_positions, and fill positions with `lineage_str` values. If the number of positions is less than provided lineages, initialization will fail. Otherwise, we will insert blanks beyond provided data in `lineage_str`. - LINSLineageInfo must be initialized with lineage or n_lin_positions. + LINLineageInfo must be initialized with lineage or n_lin_positions. Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. 
""" ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead lineage: tuple = None - n_lin_positions: int = None # init with this to make empty LINSLineageInfo with correct n_lin_positions + n_lin_positions: int = None # init with this to make empty LINLineageInfo with correct n_lin_positions def __post_init__(self): "Initialize according to passed values" @@ -390,7 +390,7 @@ def __post_init__(self): elif self.n_lin_positions is not None: self._init_empty() else: - raise ValueError("Please initialize 'LINSLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'.") + raise ValueError("Please initialize 'LINLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'.") def _init_ranks_from_n_lin_positions(self): new_ranks = [str(x) for x in range(0, self.n_lin_positions)] @@ -895,7 +895,7 @@ def load(cls, filename, *, delimiter=',', force=False, for n, row in enumerate(r): num_rows += 1 if LIN_taxonomy: - lineageInfo = LINSLineageInfo(lineage_str=row['LIN']) + lineageInfo = LINLineageInfo(lineage_str=row['LIN']) if n_pos is not None: if lineageInfo.n_lin_positions != n_pos: raise ValueError(f"For taxonomic summarization, all LIN assignments must use the same number of LIN positions.") @@ -1476,7 +1476,7 @@ def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_t lin = tax_assignments.get(self.match_ident) if lin: if LIN_taxonomy: - self.lineageInfo = LINSLineageInfo(lineage = lin) + self.lineageInfo = LINLineageInfo(lineage = lin) else: self.lineageInfo = RankLineageInfo(lineage = lin) else: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 71f4f5240f..a0ba207743 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -14,7 +14,7 @@ collect_gather_csvs, check_and_load_gather_csvs, LineagePair, QueryInfo, GatherRow, TaxResult, QueryTaxResult, SummarizedGatherResult, ClassificationResult, - BaseLineageInfo, RankLineageInfo, LINSLineageInfo, + BaseLineageInfo, 
RankLineageInfo, LINLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, write_lineage_sample_frac, LineageDB, LineageDB_Sqlite, MultiLineageDB) @@ -1159,16 +1159,16 @@ def test_RankLineageInfo_init_lineage_str(): assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] -def test_LINSLineageInfo_init_fail(): +def test_LINLineageInfo_init_fail(): with pytest.raises(ValueError) as exc: - LINSLineageInfo() + LINLineageInfo() print(str(exc)) - assert "Please initialize 'LINSLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'." in str(exc) + assert "Please initialize 'LINLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'." in str(exc) -def test_LINSLineageInfo_init_n_pos(): +def test_LINLineageInfo_init_n_pos(): n_pos = 5 - taxinf = LINSLineageInfo(n_lin_positions=n_pos) + taxinf = LINLineageInfo(n_lin_positions=n_pos) print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 @@ -1177,10 +1177,10 @@ def test_LINSLineageInfo_init_n_pos(): assert taxinf.n_filled_pos == 0 -def test_LINSLineageInfo_init_n_pos_and_lineage_str(): +def test_LINLineageInfo_init_n_pos_and_lineage_str(): x = "0;0;1" n_pos = 5 - taxinf = LINSLineageInfo(lineage_str=x, n_lin_positions=n_pos) + taxinf = LINLineageInfo(lineage_str=x, n_lin_positions=n_pos) print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 5 @@ -1189,18 +1189,18 @@ def test_LINSLineageInfo_init_n_pos_and_lineage_str(): assert taxinf.n_filled_pos == 3 -def test_LINSLineageInfo_init_n_pos_and_lineage_str_fail(): +def test_LINLineageInfo_init_n_pos_and_lineage_str_fail(): x = "0;0;1" n_pos = 2 with pytest.raises(ValueError) as exc: - LINSLineageInfo(lineage_str=x, n_lin_positions=n_pos) + LINLineageInfo(lineage_str=x, n_lin_positions=n_pos) print(str(exc)) assert "Provided 'n_lin_positions' has fewer positions than provided 'lineage_str'." 
in str(exc) -def test_LINSLineageInfo_init_lineage_str_only(): +def test_LINLineageInfo_init_lineage_str_only(): x = "0,0,1" - taxinf = LINSLineageInfo(lineage_str=x) + taxinf = LINLineageInfo(lineage_str=x) print(taxinf.lineage) print(taxinf.lineage_str) assert taxinf.n_lin_positions == 3 @@ -1209,17 +1209,17 @@ def test_LINSLineageInfo_init_lineage_str_only(): assert taxinf.n_filled_pos == 3 -def test_LINSLineageInfo_init_not_lineagepair(): +def test_LINLineageInfo_init_not_lineagepair(): lin_tups = (("rank1", "name1"),) with pytest.raises(ValueError) as exc: - LINSLineageInfo(lineage=lin_tups) + LINLineageInfo(lineage=lin_tups) print(str(exc)) assert "is not LineagePair" in str(exc) -def test_LINSLineageInfo_init_lineagepair(): +def test_LINLineageInfo_init_lineagepair(): lin_tups = (LineagePair("rank1", "name1"), LineagePair("rank2", None),) - taxinf = LINSLineageInfo(lineage=lin_tups) + taxinf = LINLineageInfo(lineage=lin_tups) print(taxinf.lineage) assert taxinf.n_lin_positions == 2 assert taxinf.zip_lineage()== ["name1", ""] @@ -1229,9 +1229,9 @@ def test_LINSLineageInfo_init_lineagepair(): assert taxinf.n_filled_pos == 1 -def test_LINSLineageInfo_init_lca_lineagepair(): +def test_LINLineageInfo_init_lca_lineagepair(): lin_tups = (lca_utils.LineagePair("rank1", "name1"), lca_utils.LineagePair("rank2", None),) - taxinf = LINSLineageInfo(lineage=lin_tups) + taxinf = LINLineageInfo(lineage=lin_tups) print(taxinf.lineage) assert taxinf.n_lin_positions == 2 assert taxinf.zip_lineage()== ["name1", ""] From e7efbf709f86447cd778ff6a39d8d4b402ea7f29 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 13:27:25 -0800 Subject: [PATCH 25/78] tmp save --- src/sourmash/tax/tax_utils.py | 79 +++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b7e4c13aab..bdce9e7d24 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1558,6 +1558,9 @@ def as_human_friendly_dict(self, query_info): return sD def as_kreport_dict(self, query_info): + """ + Produce kreport dict for named taxonomic groups. + """ lowest_assignment_rank = 'species' sD = {} sD['num_bp_assigned'] = str(0) @@ -1580,6 +1583,33 @@ def as_kreport_dict(self, query_info): sD['rank_code'] = RANKCODE['unclassified'] sD["num_bp_assigned"] = sD["num_bp_contained"] return sD + + # def as_LINgroup_report_dict(self, query_info): + # """ + # Produce LINgroup report dict for LINgroups. + # """ + # # lowest_assignment_rank = 'species' # longest independent LINs? not sure how to do this... + # sD = {} + # sD['num_bp_assigned'] = str(0) + # # total percent containment, weighted to include abundance info + # sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' + # sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + # if self.lineage != RankLineageInfo(): + # this_rank = self.lineage.lowest_rank + # sD['rank_code'] = RANKCODE[this_rank] + # sD['sci_name'] = self.lineage.lowest_lineage_name + # sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid + # # the number of bp actually 'assigned' at this rank. Sourmash assigns everything + # # at genome level, but since kreport traditionally doesn't include 'strain' or genome, + # # it is reasonable to state that sourmash assigns at 'species' level for this. + # # can be modified later. 
+ # if this_rank == lowest_assignment_rank: + # sD["num_bp_assigned"] = sD["num_bp_contained"] + # else: + # sD['sci_name'] = 'unclassified' + # sD['rank_code'] = RANKCODE['unclassified'] + # sD["num_bp_assigned"] = sD["num_bp_contained"] + # return sD @dataclass class ClassificationResult(SummarizedGatherResult): @@ -1959,3 +1989,52 @@ def make_kreport_results(self): unclassified_recorded = True kreport_results.append(kresD) return header, kreport_results + + def make_LINgroup_report_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} + self.check_summarization() + header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] + if self.query_info.total_weighted_hashes == 0: + raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") + lingroup_results = [] + # unclassified_recorded=False + # come back to final ordering + # need to order LINgroups by prefix, so we know which LINgroups contain each other. + # lg_relationships = {} + # for lg_name, lg_prefix in LINgroupsD.items(): + # all_lgs = + + all_lg_ranks = set() + rank_to_lgprefix = defaultdict(set) + all_lgs = list(LINgroupsD.values()) + for lg_prefix in all_lgs: + lg_rank = len(lg_prefix) -1 # because 0-based + all_lg_ranks.add(lg_rank) + rank_to_lgprefix[lg_rank].append(lg_prefix) + + # order lg_ranks low--> high (general --> specific) + ordered_lg_ranks = sorted(all_lg_ranks) # ranks are str(int) .. how does this affect sorting? e.g. 1 vs 10? 
+ + for rank in ordered_lg_ranks: + rank_results = self.summarized_lineage_results[rank] + for res in rank_results: + if res.lineage.display_lineage() in rank_to_lgprefix[rank]: + lg_resD = res.as_lingroup_report_dict(self.query_info) + + return header, lingroup_results + + + # + # for LINgroup, subgroups in lingroups: + # if rank == 'strain': # no code for strain, can't include in this output afaik + # continue + # rank_results = self.summarized_lineage_results[rank] + # for res in rank_results: + # kresD = res.as_kreport_dict(self.query_info) + # if kresD['sci_name'] == "unclassified": + # # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. + # # Here, we only need to report it once, since it will be the same fraction for all ranks + # if unclassified_recorded: + # continue + # else: + # unclassified_recorded = True + # kreport_results.append(kresD) From d573cb1a6b541c0213883a3ba9e2442fc5c46bbe Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 16:13:37 -0800 Subject: [PATCH 26/78] add LINgroup summarization utilities --- src/sourmash/tax/tax_utils.py | 87 +++++++++++++---------------------- tests/test_tax_utils.py | 21 +++++++++ 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bdce9e7d24..6fb7744974 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1567,6 +1567,8 @@ def as_kreport_dict(self, query_info): # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + if isinstance(self.lineage, LINLineageInfo): + raise ValueError("Cannot produce 'kreport' with LIN taxonomy.") if self.lineage != RankLineageInfo(): this_rank = self.lineage.lowest_rank sD['rank_code'] = RANKCODE[this_rank] @@ -1584,32 +1586,27 @@ def as_kreport_dict(self, query_info): sD["num_bp_assigned"] = sD["num_bp_contained"] return sD - # def as_LINgroup_report_dict(self, query_info): - # """ - # Produce LINgroup report dict for LINgroups. - # """ - # # lowest_assignment_rank = 'species' # longest independent LINs? not sure how to do this... - # sD = {} - # sD['num_bp_assigned'] = str(0) - # # total percent containment, weighted to include abundance info - # sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' - # sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - # if self.lineage != RankLineageInfo(): - # this_rank = self.lineage.lowest_rank - # sD['rank_code'] = RANKCODE[this_rank] - # sD['sci_name'] = self.lineage.lowest_lineage_name - # sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid - # # the number of bp actually 'assigned' at this rank. 
Sourmash assigns everything - # # at genome level, but since kreport traditionally doesn't include 'strain' or genome, - # # it is reasonable to state that sourmash assigns at 'species' level for this. - # # can be modified later. - # if this_rank == lowest_assignment_rank: - # sD["num_bp_assigned"] = sD["num_bp_contained"] - # else: - # sD['sci_name'] = 'unclassified' - # sD['rank_code'] = RANKCODE['unclassified'] - # sD["num_bp_assigned"] = sD["num_bp_contained"] - # return sD + def as_lingroup_dict(self, query_info, lg_name, lowest_rank): + """ + Produce LINgroup report dict for LINgroups. + """ + # lowest_assignment_rank = 'species' # longest independent LINs? not sure how to do this... + sD = {} + sD['num_bp_assigned'] = str(0) + # total percent containment, weighted to include abundance info + sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' + sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) + if self.lineage != RankLineageInfo(): #empty lineage is currently always RankLineageInfo() + # the number of bp actually 'assigned' at this rank. Sourmash assigns everything + # at genome level - not sure how we want to handle 'num_bp_assigned' here.. + if self.lineage.lowest_rank == lowest_rank: + sD["num_bp_assigned"] = sD["num_bp_contained"] + else: # unassigned + sD["num_bp_assigned"] = sD["num_bp_contained"] + sD["LINgroup_prefix"] = self.lineage.display_lineage() + sD["LINgroup_name"] = lg_name + return sD + @dataclass class ClassificationResult(SummarizedGatherResult): @@ -1996,45 +1993,25 @@ def make_LINgroup_report_results(self, LINgroupsD): # dictionary {lg_prefix: lg_ if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") lingroup_results = [] - # unclassified_recorded=False - # come back to final ordering - # need to order LINgroups by prefix, so we know which LINgroups contain each other. 
- # lg_relationships = {} - # for lg_name, lg_prefix in LINgroupsD.items(): - # all_lgs = - all_lg_ranks = set() rank_to_lgprefix = defaultdict(set) all_lgs = list(LINgroupsD.values()) for lg_prefix in all_lgs: - lg_rank = len(lg_prefix) -1 # because 0-based + lg_prefix_as_list = lg_prefix.split(';') + lg_rank = len(lg_prefix_as_list) - 1 # because 0-based all_lg_ranks.add(lg_rank) rank_to_lgprefix[lg_rank].append(lg_prefix) # order lg_ranks low--> high (general --> specific) ordered_lg_ranks = sorted(all_lg_ranks) # ranks are str(int) .. how does this affect sorting? e.g. 1 vs 10? - + lowest_rank = ordered_lg_ranks[-1] for rank in ordered_lg_ranks: + these_lgs = rank_to_lgprefix[rank] rank_results = self.summarized_lineage_results[rank] for res in rank_results: - if res.lineage.display_lineage() in rank_to_lgprefix[rank]: - lg_resD = res.as_lingroup_report_dict(self.query_info) - + this_lineage = res.lineage.display_lineage() + if this_lineage in these_lgs: # is this lineage in the list of LINgroups at this rank? + this_lingroup_name = LINgroupsD[this_lineage] + lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) + lingroup_results.append(lg_resD) return header, lingroup_results - - - # - # for LINgroup, subgroups in lingroups: - # if rank == 'strain': # no code for strain, can't include in this output afaik - # continue - # rank_results = self.summarized_lineage_results[rank] - # for res in rank_results: - # kresD = res.as_kreport_dict(self.query_info) - # if kresD['sci_name'] == "unclassified": - # # SummarizedGatherResults have an unclassified lineage at every rank, to facilitate reporting at a specific rank. 
- # # Here, we only need to report it once, since it will be the same fraction for all ranks - # if unclassified_recorded: - # continue - # else: - # unclassified_recorded = True - # kreport_results.append(kresD) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a0ba207743..d379358a68 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -158,6 +158,27 @@ def test_SummarizedGatherResult(): 'family': '', 'genus': '', 'species': '', 'strain': ''} +def test_SummarizedGatherResult_LINs(): + "SummarizedGatherResult with LINs" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="2") + print(lgD) + assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "600", + 'percent_containment': '30.00', 'num_bp_contained': "600"} + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3") + print(lgD) + assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", + 'num_bp_assigned': "0",'percent_containment': '30.00', 'num_bp_contained': "600"} + with pytest.raises(ValueError) as exc: + sgr.as_kreport_dict(query_info=qInf) + print(str(exc)) + assert "Cannot produce 'kreport' with LIN taxonomy." in str(exc) + + def test_SummarizedGatherResult_set_query_ani(): "Check ANI estimation within SummarizedGatherResult dataclass" qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', From 69ed6a9fc520fe4efa127ea222053e92c0630e43 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 16:15:42 -0800 Subject: [PATCH 27/78] add LINgroup summarization --- src/sourmash/cli/tax/genome.py | 28 +++++++++++++++++++++------- src/sourmash/cli/tax/metagenome.py | 30 +++++++++++++++++++++++------- src/sourmash/tax/__main__.py | 9 +++++++++ 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 20764f34f6..a6487ff386 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -83,7 +83,7 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action='extend', - choices=["csv_summary", "krona", "human", "lineage_csv"], + choices=["csv_summary", "krona", "human", "lineage_csv", "LINgroup_report"], help='choose output format(s)', ) subparser.add_argument( @@ -91,13 +91,17 @@ def subparser(subparsers): help='continue past survivable errors in loading taxonomy database or gather results', ) subparser.add_argument( - '--LIN-taxonomy', action='store_true', + '--LIN-taxonomy', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' ) subparser.add_argument( - '--LIN-position', type=int, + '--LIN-position', type=int, default=None, help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' ) + subparser.add_argument( + '--LINgroups', metavar='FILE', default=None, + help='CSV containing LINgroup_name, LINgroup_prefix. Will produce a "LINgroup_report" file containing taxonomic summarization for each LINgroup.' + ) add_tax_threshold_arg(subparser, 0.1) @@ -105,16 +109,26 @@ def main(args): import sourmash if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! 
Please input via '-g' or '--from-file'.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + # handle LIN options if args.LIN_taxonomy: if args.LIN_position: args.rank = args.LIN_position + if args.LINgroups: + if "LINgroup_report" not in args.output_format: + args.output_format.append("LINgroup_report") + elif "LINgroup_report" in args.output_format: + raise ValueError(f"Must provide LINgroup csv via '--LINgroup-info' in order to output a LINgroup_report.") + elif args.LINgroups: + raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") + + # handle output formats if not args.rank: if any(x in ["krona"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona output format.") - if not args.output_format: + if len(args.output_format) > 1: + if args.output_base == "-": + raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + elif not args.output_format: # change to "human" for 5.0 args.output_format = ["csv_summary"] diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index addee9ce31..1c6df1108e 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -67,7 +67,7 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport"], + choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "LINgroup_report"], help='choose output format(s)', ) subparser.add_argument( @@ -79,28 +79,44 @@ def subparser(subparsers): help='continue past errors in taxonomy database loading', ) subparser.add_argument( - '--LIN-taxonomy', action='store_true', + '--LIN-taxonomy', action='store_true', default=False, help='use LIN taxonomy in 
place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' ) subparser.add_argument( - '--LIN-position', type=int, + '--LIN-position', type=int, default=None, help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' ) + subparser.add_argument( + '--LINgroups', metavar='FILE', default=None, + help='CSV containing LINgroup_name, LINgroup_prefix. Will produce a "LINgroup_report" file containing taxonomic summarization for each LINgroup.' + ) + def main(args): import sourmash if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + # handle LIN options if args.LIN_taxonomy: if args.LIN_position: args.rank = args.LIN_position + if args.LINgroups: + if "LINgroup_report" not in args.output_format: + args.output_format.append("LINgroup_report") + elif "LINgroup_report" in args.output_format: + raise ValueError(f"Must provide LINgroup csv via '--LINgroup-info' in order to output a LINgroup_report.") + elif args.LINgroups: + raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") + + # handle output formats if not args.rank: if any(x in ["krona", "lineage_summary"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") - if not args.output_format: + if len(args.output_format) > 1: + if args.output_base == "-": + raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + elif not args.output_format: # change to "human" for 5.0 args.output_format = ["csv_summary"] + return 
sourmash.tax.__main__.metagenome(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 0f2c341922..6cf80916db 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -43,6 +43,7 @@ 'human': '.human.txt', 'lineage_csv': '.lineage.csv', 'kreport': ".kreport.txt", + 'lingroup_report': ".lingroup_report.txt" } def make_outfile(base, output_type, *, output_dir = ""): @@ -163,6 +164,14 @@ def metagenome(args): header, kreport_results = single_query_results.make_kreport_results() tax_utils.write_output(header, kreport_results, out_fp, sep="\t", write_header=False) + # write summarized --> LINgroup output tsv + if "LINgroup_report" in args.output_format: + lingroup_reportfile, limit_float = make_outfile(args.output_base, "lingroup_report", output_dir=args.output_dir) + + with FileOutputCSV(lingroup_reportfile) as out_fp: + header, lgreport_results = single_query_results.make_LINgroup_report_results() + tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) + def genome(args): """ From 871708be0dcd489d71635bdb704e1e2a1e148018 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 17:38:58 -0800 Subject: [PATCH 28/78] add lingroup summarization method --- src/sourmash/tax/tax_utils.py | 44 ++++++++++++++++------------ tests/test_tax_utils.py | 55 +++++++++++++++++++++++------------ 2 files changed, 63 insertions(+), 36 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6fb7744974..e25566e624 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -528,14 +528,14 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force # do not allow loading of same query from a second CSV. raise ValueError(f"Gather query {gatherRow.query_name} was found in more than one CSV. 
Cannot load from '{gather_csv}'.") taxres = TaxResult(raw=gatherRow, keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + keep_identifier_versions=keep_identifier_versions, + LIN_taxonomy=LIN_taxonomy) taxres.get_match_lineage(tax_assignments=tax_assignments, skip_idents=skip_idents, - fail_on_missing_taxonomy=fail_on_missing_taxonomy, - LIN_taxonomy=LIN_taxonomy) + fail_on_missing_taxonomy=fail_on_missing_taxonomy) # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info)) + this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info, LIN_taxonomy=LIN_taxonomy)) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres @@ -1432,7 +1432,6 @@ class TaxResult: query_name: str = field(init=False) query_info: QueryInfo = field(init=False) match_ident: str = field(init=False) - lineageInfo: RankLineageInfo = RankLineageInfo() #None#field(init=False) #RankLineageInfo() skipped_ident: bool = False missed_ident: bool = False match_lineage_attempted: bool = False @@ -1454,6 +1453,10 @@ def __post_init__(self): self.f_unique_to_query = float(self.raw.f_unique_to_query) self.f_unique_weighted = float(self.raw.f_unique_weighted) self.unique_intersect_bp = int(self.raw.unique_intersect_bp) + if self.LIN_taxonomy: + self.lineageInfo = LINLineageInfo(n_lin_positions=0) + else: + self.lineageInfo = RankLineageInfo() def get_ident(self): # split identifiers = split on whitespace @@ -1469,13 +1472,13 @@ def get_ident(self): self.match_ident = self.match_ident.split('.')[0] - def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False, LIN_taxonomy=False): + def get_match_lineage(self, tax_assignments, skip_idents=None, 
fail_on_missing_taxonomy=False): if skip_idents and self.match_ident in skip_idents: self.skipped_ident = True else: lin = tax_assignments.get(self.match_ident) if lin: - if LIN_taxonomy: + if self.LIN_taxonomy: self.lineageInfo = LINLineageInfo(lineage = lin) else: self.lineageInfo = RankLineageInfo(lineage = lin) @@ -1592,17 +1595,16 @@ def as_lingroup_dict(self, query_info, lg_name, lowest_rank): """ # lowest_assignment_rank = 'species' # longest independent LINs? not sure how to do this... sD = {} - sD['num_bp_assigned'] = str(0) # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - if self.lineage != RankLineageInfo(): #empty lineage is currently always RankLineageInfo() + if self.lineage.n_lin_positions == 0: #empty lineage # the number of bp actually 'assigned' at this rank. Sourmash assigns everything # at genome level - not sure how we want to handle 'num_bp_assigned' here.. if self.lineage.lowest_rank == lowest_rank: sD["num_bp_assigned"] = sD["num_bp_contained"] else: # unassigned - sD["num_bp_assigned"] = sD["num_bp_contained"] + sD["num_bp_assigned"] = str(0) sD["LINgroup_prefix"] = self.lineage.display_lineage() sD["LINgroup_name"] = lg_name return sD @@ -1665,6 +1667,7 @@ class QueryTaxResult: Contains methods for formatting results for different outputs. 
""" query_info: QueryInfo # initialize with QueryInfo dataclass + LIN_taxonomy: bool = False def __post_init__(self): self.query_name = self.query_info.query_name # for convenience @@ -1703,7 +1706,7 @@ def _init_classification_results(self): self.krona_header = [] def is_compatible(self, taxresult): - return taxresult.query_info == self.query_info + return taxresult.query_info == self.query_info and taxresult.LIN_taxonomy == self.LIN_taxonomy @property def ascending_ranks(self): @@ -1796,7 +1799,10 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): self.total_bp_classified[rank] += bp_intersect_at_rank # record unclassified - lineage = RankLineageInfo() + if self.LIN_taxonomy: + lineage = LINLineageInfo(n_lin_positions=0) # empty + else: + lineage = RankLineageInfo() query_ani = None f_unique = 1.0 - self.total_f_classified[rank] if f_unique > 0: @@ -1987,24 +1993,26 @@ def make_kreport_results(self): kreport_results.append(kresD) return header, kreport_results - def make_LINgroup_report_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} + def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} self.check_summarization() header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") - lingroup_results = [] all_lg_ranks = set() rank_to_lgprefix = defaultdict(set) - all_lgs = list(LINgroupsD.values()) + all_lgs = list(LINgroupsD.keys()) for lg_prefix in all_lgs: lg_prefix_as_list = lg_prefix.split(';') lg_rank = len(lg_prefix_as_list) - 1 # because 0-based - all_lg_ranks.add(lg_rank) - rank_to_lgprefix[lg_rank].append(lg_prefix) + all_lg_ranks.add(str(lg_rank)) + rank_to_lgprefix[str(lg_rank)].add(lg_prefix) # order lg_ranks low--> high (general --> specific) ordered_lg_ranks = sorted(all_lg_ranks) # ranks 
are str(int) .. how does this affect sorting? e.g. 1 vs 10? lowest_rank = ordered_lg_ranks[-1] + + lingroup_results = [] + for rank in ordered_lg_ranks: these_lgs = rank_to_lgprefix[rank] rank_results = self.summarized_lineage_results[rank] @@ -2013,5 +2021,5 @@ def make_LINgroup_report_results(self, LINgroupsD): # dictionary {lg_prefix: lg_ if this_lineage in these_lgs: # is this lineage in the list of LINgroups at this rank? this_lingroup_name = LINgroupsD[this_lineage] lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) - lingroup_results.append(lg_resD) + lingroup_results.append(lg_resD) return header, lingroup_results diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d379358a68..1eb756b9e4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -23,11 +23,15 @@ from sourmash.lca import lca_utils # utility functions for testing -def make_mini_taxonomy(tax_info): +def make_mini_taxonomy(tax_info, LIN=False): #pass in list of tuples: (name, lineage) taxD = {} - for (name,lin) in tax_info: - taxD[name] = lca_utils.make_lineage(lin) + for (name, lin) in tax_info: + if LIN: + lineage = LINLineageInfo(lineage_str=lin) + else: + lineage = RankLineageInfo(lineage_str=lin) + taxD[name] = lineage.filled_lineage return taxD @@ -53,28 +57,30 @@ def make_GatherRow(gather_dict=None, exclude_cols=[]): return gatherRaw -def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None): +def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_ident_version=False, skip_idents=None, LIN=False): """Make TaxResult from artificial gather row (dict)""" gRow = make_GatherRow(gather_dict) - taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, keep_identifier_versions=keep_ident_version) + taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, + keep_identifier_versions=keep_ident_version, LIN_taxonomy=LIN) if taxD is not None: 
taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents) return taxres def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_ident=False, keep_ident_version=False, - skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None): + skip_idents=None, summarize=False, classify=False, classify_rank=None, c_thresh=0.1, ani_thresh=None, + LIN=False): """Make QueryTaxResult(s) from artificial gather information, formatted as list of gather rows (dicts)""" gather_results = {} this_querytaxres = None for gather_infoD in gather_info: taxres = make_TaxResult(gather_infoD, taxD=taxD, keep_full_ident=keep_full_ident, - keep_ident_version=keep_ident_version, skip_idents=skip_idents) + keep_ident_version=keep_ident_version, skip_idents=skip_idents, LIN=LIN) query_name = taxres.query_name # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info)) + this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info, LIN_taxonomy=LIN)) this_querytaxres.add_taxresult(taxres) # print('missed_ident?', taxres.missed_ident) gather_results[query_name] = this_querytaxres @@ -165,9 +171,9 @@ def test_SummarizedGatherResult_LINs(): sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"), f_weighted_at_rank=0.3, bp_match_at_rank=30) - lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="2") + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="4") print(lgD) - assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "600", + assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "0", 'percent_containment': '30.00', 'num_bp_contained': "600"} lgD = 
sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3") print(lgD) @@ -2722,11 +2728,24 @@ def test_make_kreport_results_fail_pre_v450(): assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) -def test_make_kreport_results_fail_pre_v450(): - taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) - gather_results = [{}, {"name": 'gB'}] - q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) - with pytest.raises(ValueError) as exc: - q_res.make_kreport_results() - print(str(exc)) - assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) +def test_make_lingroup_results(): + taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) + print(taxD) + lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + print(lingroupD) + gather_results = [{"total_weighted_hashes":100}, + {"name": 'gB', "total_weighted_hashes":100}, + {"name": 'gC', "total_weighted_hashes":100}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) + print(q_res.summarized_lineage_results) + + header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) + print(header) + assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained', 'num_bp_assigned'] + print(lgD) + assert lgD == [{'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0', + 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'}, + {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '0', + 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}, + {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '0', + 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}] From 49558d9efd29eeda9cc311a9f7d3e743440cbf78 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 18:08:04 -0800 Subject: [PATCH 29/78] add fn to read LINgroups file into dict --- src/sourmash/tax/__main__.py | 12 +++++++++++- src/sourmash/tax/tax_utils.py | 19 ++++++++++++++++++- tests/test_tax_utils.py | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 3 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 6cf80916db..bcbe0990c4 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -166,10 +166,20 @@ def metagenome(args): # write summarized --> LINgroup output tsv if "LINgroup_report" in args.output_format: + try: + lingroups = tax_utils.read_lingroups(args.LINgroups) + except ValueError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) + + if not lingroups: + error(f'ERROR: No LINgroups loaded from {",".join(args.LINgroups)}. Exiting.') + sys.exit(-1) + lingroup_reportfile, limit_float = make_outfile(args.output_base, "lingroup_report", output_dir=args.output_dir) with FileOutputCSV(lingroup_reportfile) as out_fp: - header, lgreport_results = single_query_results.make_LINgroup_report_results() + header, lgreport_results = single_query_results.make_lingroup_results(LINgroupsD = lingroups) tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e25566e624..22108e436b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -501,6 +501,23 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): return gather_csvs +def read_lingroups(lingroup_csv): + lingroupD = {} + with sourmash_args.FileInputCSV(lingroup_csv) as r: + header = r.fieldnames + # check for empty file + if not header: + raise ValueError(f"Cannot read lingroups from '{lingroup_csv}'. 
Is file empty?") + if "LINgroup_prefix" not in header or "LINgroup_name" not in header: + raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'LINgroup_prefix', 'LINgroup_name'.") + for n, row in enumerate(r): + lingroupD[row['LINgroup_prefix']] = row['LINgroup_name'] + + n_lg = len(lingroupD.keys()) + notify(f"Read {n+1} LINgroup rows and found {n_lg} distinct LINgroup prefixes.") + return lingroupD + + def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, skip_idents = None, fail_on_missing_taxonomy=False, keep_full_identifiers=False, keep_identifier_versions=False, @@ -2022,4 +2039,4 @@ def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} this_lingroup_name = LINgroupsD[this_lineage] lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) lingroup_results.append(lg_resD) - return header, lingroup_results + return header, lingroup_results \ No newline at end of file diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 1eb756b9e4..86c1b3bfc4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -16,7 +16,7 @@ SummarizedGatherResult, ClassificationResult, BaseLineageInfo, RankLineageInfo, LINLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, - write_krona, write_lineage_sample_frac, + write_krona, write_lineage_sample_frac, read_lingroups, LineageDB, LineageDB_Sqlite, MultiLineageDB) # import lca utils as needed @@ -2749,3 +2749,34 @@ def test_make_lingroup_results(): 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}, {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '0', 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}] + + +def test_read_lingroups(runtmp): + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write('LINgroup_prefix,LINgroup_name\n') + out.write('1,lg1\n') + out.write('1;0,lg2\n') + out.write('1;1,lg3\n') + lgD = read_lingroups(lg_file) + + assert 
lgD == {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + +def test_read_lingroups_empty_file(runtmp): + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write("") + with pytest.raises(ValueError) as exc: + read_lingroups(lg_file) + print(str(exc)) + assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in str(exc) + + +def test_read_lingroups_bad_header(runtmp): + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write('LINgroup_pfx,LINgroup_nm\n') + with pytest.raises(ValueError) as exc: + read_lingroups(lg_file) + print(str(exc)) + assert f"'{lg_file}' must contain the following columns: 'LINgroup_prefix', 'LINgroup_name'." in str(exc) From 6f26e0b1b8cd4dce15517b6114ac7cdc8f87bbe1 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 14 Feb 2023 18:54:07 -0800 Subject: [PATCH 30/78] fix assigned; add full lg test --- src/sourmash/tax/tax_utils.py | 18 ++++++++++-------- tests/test_tax.py | 35 +++++++++++++++++++++++++++++++++++ tests/test_tax_utils.py | 4 ++-- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 22108e436b..b80942d82d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1610,18 +1610,17 @@ def as_lingroup_dict(self, query_info, lg_name, lowest_rank): """ Produce LINgroup report dict for LINgroups. """ - # lowest_assignment_rank = 'species' # longest independent LINs? not sure how to do this... sD = {} # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - if self.lineage.n_lin_positions == 0: #empty lineage + sD["num_bp_assigned"] = str(0) + if self.lineage.n_lin_positions != 0: #empty lineage # the number of bp actually 'assigned' at this rank. 
Sourmash assigns everything # at genome level - not sure how we want to handle 'num_bp_assigned' here.. if self.lineage.lowest_rank == lowest_rank: sD["num_bp_assigned"] = sD["num_bp_contained"] - else: # unassigned - sD["num_bp_assigned"] = str(0) + sD["LINgroup_prefix"] = self.lineage.display_lineage() sD["LINgroup_name"] = lg_name return sD @@ -2020,17 +2019,20 @@ def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} all_lgs = list(LINgroupsD.keys()) for lg_prefix in all_lgs: lg_prefix_as_list = lg_prefix.split(';') - lg_rank = len(lg_prefix_as_list) - 1 # because 0-based - all_lg_ranks.add(str(lg_rank)) + lg_rank = len(lg_prefix_as_list) -1 + all_lg_ranks.add(lg_rank) # bc 0 based rank_to_lgprefix[str(lg_rank)].add(lg_prefix) # order lg_ranks low--> high (general --> specific) - ordered_lg_ranks = sorted(all_lg_ranks) # ranks are str(int) .. how does this affect sorting? e.g. 1 vs 10? - lowest_rank = ordered_lg_ranks[-1] + ordered_lg_ranks = list(all_lg_ranks) + ordered_lg_ranks.sort() # ranks are str(int) .. how does this affect sorting? e.g. 1 vs 10? 
+ # ordered_lg_ranks = [str(x-1) for x in ordered_lg_ranks] # because 0-based + lowest_rank = str(ordered_lg_ranks[-1]) lingroup_results = [] for rank in ordered_lg_ranks: + rank = str(rank) these_lgs = rank_to_lgprefix[rank] rank_results = self.summarized_lineage_results[rank] for res in rank_results: diff --git a/tests/test_tax.py b/tests/test_tax.py index e9a56f7db5..f04352a70c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3327,3 +3327,38 @@ def test_metagenome_LINS(runtmp): assert "test1,19,0.028,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.016,138000,0.891,0" in c.last_result.out assert "test1,19,0.011,1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.007,54000,0.864,0" in c.last_result.out assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out + + +def test_metagenome_LINS_LINgroups(runtmp): + # get/design better test data for this? + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write('LINgroup_prefix,LINgroup_name\n') + out.write('0;0;0,lg1\n') + out.write('1;0;0,lg2\n') + out.write('2;0;0,lg3\n') + out.write('1;0;1,lg3\n') + # write a 19 so we can check 'num_bp_assigned' + out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '--LINgroups', lg_file) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." 
in c.last_result.err + assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned" in c.last_result.out + assert "lg1 0;0;0 5.82 714000 0" in c.last_result.out + assert "lg2 1;0;0 5.05 620000 0" in c.last_result.out + assert "lg3 2;0;0 1.56 192000 0" in c.last_result.out + assert "lg3 1;0;1 0.65 80000 0" in c.last_result.out + assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 86c1b3bfc4..f3dd392952 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2745,9 +2745,9 @@ def test_make_lingroup_results(): print(lgD) assert lgD == [{'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0', 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'}, - {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '0', + {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '40', 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}, - {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '0', + {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '20', 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}] From a040a4b21a3e9181cb02078d49e8e375124e23cd Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 15 Feb 2023 09:40:50 -0800 Subject: [PATCH 31/78] test more lg reading failures --- src/sourmash/tax/__main__.py | 4 --- src/sourmash/tax/tax_utils.py | 3 +++ tests/test_tax.py | 46 +++++++++++++++++++++++++++++++++++ tests/test_tax_utils.py | 21 ++++++++++++++++ 4 files changed, 70 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index bcbe0990c4..c7017189d5 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -172,10 +172,6 @@ def metagenome(args): error(f"ERROR: {str(exc)}") sys.exit(-1) - if not lingroups: - error(f'ERROR: No LINgroups loaded from {",".join(args.LINgroups)}. Exiting.') - sys.exit(-1) - lingroup_reportfile, limit_float = make_outfile(args.output_base, "lingroup_report", output_dir=args.output_dir) with FileOutputCSV(lingroup_reportfile) as out_fp: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b80942d82d..68d65083df 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -503,6 +503,7 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): def read_lingroups(lingroup_csv): lingroupD = {} + n=None with sourmash_args.FileInputCSV(lingroup_csv) as r: header = r.fieldnames # check for empty file @@ -513,6 +514,8 @@ def read_lingroups(lingroup_csv): for n, row in enumerate(r): lingroupD[row['LINgroup_prefix']] = row['LINgroup_name'] + if n is None: + raise ValueError(f'No LINgroups loaded from {lingroup_csv}.') n_lg = len(lingroupD.keys()) notify(f"Read {n+1} LINgroup rows and found {n_lg} distinct LINgroup prefixes.") return lingroupD diff --git a/tests/test_tax.py b/tests/test_tax.py index f04352a70c..62374d59f2 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3362,3 +3362,49 @@ def test_metagenome_LINS_LINgroups(runtmp): assert "lg3 2;0;0 1.56 192000 0" in c.last_result.out assert "lg3 1;0;1 0.65 80000 0" in c.last_result.out assert "lg4 
1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out + + +def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write("") + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '--LINgroups', lg_file) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err + + +def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write('LINgroup_prefix,LINgroup_name\n') + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '--LINgroups', lg_file) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert f"No LINgroups loaded from {lg_file}" in c.last_result.err diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index f3dd392952..e6a2fc789d 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2751,6 +2751,17 @@ def test_make_lingroup_results(): 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}] +def 
test_make_lingroup_results_fail_pre_v450(): + taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) + gather_results = [{}, {"name": 'gB'}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True, LIN=True) + lingroupD = {"1":"lg1", "1;0":'lg2', '1;1': "lg3"} + with pytest.raises(ValueError) as exc: + q_res.make_lingroup_results(lingroupD) + print(str(exc)) + assert "cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0" in str(exc) + + def test_read_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: @@ -2772,6 +2783,16 @@ def test_read_lingroups_empty_file(runtmp): assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in str(exc) +def test_read_lingroups_only_header(runtmp): + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write('LINgroup_prefix,LINgroup_name\n') + with pytest.raises(ValueError) as exc: + read_lingroups(lg_file) + print(str(exc)) + assert f"No LINgroups loaded from {lg_file}" in str(exc) + + def test_read_lingroups_bad_header(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: From 7cb57008846e48b981567b02561fdf4c52a06c2a Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 15 Feb 2023 10:17:56 -0800 Subject: [PATCH 32/78] test bad cli inputs --- src/sourmash/cli/tax/genome.py | 52 +++++++++++++----------- src/sourmash/cli/tax/metagenome.py | 52 +++++++++++++----------- tests/test_tax.py | 64 ++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+), 46 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index a6487ff386..37a9e0bd69 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -107,29 +107,35 @@ def subparser(subparsers): def main(args): import sourmash - if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") - # handle LIN options - if args.LIN_taxonomy: - if args.LIN_position: - args.rank = args.LIN_position - if args.LINgroups: - if "LINgroup_report" not in args.output_format: - args.output_format.append("LINgroup_report") - elif "LINgroup_report" in args.output_format: - raise ValueError(f"Must provide LINgroup csv via '--LINgroup-info' in order to output a LINgroup_report.") - elif args.LINgroups: - raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") + try: + if not args.gather_csv and not args.from_file: + raise ValueError(f"No gather CSVs found! 
Please input via '-g' or '--from-file'.") + # handle LIN options + if args.LIN_taxonomy: + if args.LIN_position: + args.rank = args.LIN_position + if args.LINgroups: + if "LINgroup_report" not in args.output_format: + args.output_format.append("LINgroup_report") + elif "LINgroup_report" in args.output_format: + raise ValueError(f"Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report.") + elif args.LINgroups or "LINgroup_report" in args.output_format: + raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") - # handle output formats - if not args.rank: - if any(x in ["krona"] for x in args.output_format): - raise ValueError(f"Rank (--rank) is required for krona output format.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") - elif not args.output_format: - # change to "human" for 5.0 - args.output_format = ["csv_summary"] + # handle output formats + print(args.output_format) + if not args.rank: + if any(x in ["krona"] for x in args.output_format): + raise ValueError(f"Rank (--rank) is required for krona output format.") + if len(args.output_format) > 1: + if args.output_base == "-": + raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + elif not args.output_format: + # change to "human" for 5.0 + args.output_format = ["csv_summary"] + + except ValueError as exc: + error(f"ERROR: {str(exc)}") + import sys; sys.exit(-1) return sourmash.tax.__main__.genome(args) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 1c6df1108e..eb8b423d35 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -94,29 +94,35 @@ def subparser(subparsers): def main(args): import sourmash - if not args.gather_csv and not args.from_file: - raise ValueError(f"No gather 
CSVs found! Please input via '-g' or '--from-file'.") - # handle LIN options - if args.LIN_taxonomy: - if args.LIN_position: - args.rank = args.LIN_position - if args.LINgroups: - if "LINgroup_report" not in args.output_format: - args.output_format.append("LINgroup_report") - elif "LINgroup_report" in args.output_format: - raise ValueError(f"Must provide LINgroup csv via '--LINgroup-info' in order to output a LINgroup_report.") - elif args.LINgroups: - raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") + try: + if not args.gather_csv and not args.from_file: + raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") + # handle LIN options + if args.LIN_taxonomy: + if args.LIN_position: + args.rank = args.LIN_position + if args.LINgroups: + if "LINgroup_report" not in args.output_format: + args.output_format.append("LINgroup_report") + elif "LINgroup_report" in args.output_format: + raise ValueError(f"Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report.") + elif args.LINgroups or "LINgroup_report" in args.output_format: + raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") - # handle output formats - if not args.rank: - if any(x in ["krona", "lineage_summary"] for x in args.output_format): - raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") - elif not args.output_format: - # change to "human" for 5.0 - args.output_format = ["csv_summary"] + # handle output formats + print(args.output_format) + if not args.rank: + if any(x in ["krona", "lineage_summary"] for x in args.output_format): + raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") + if 
len(args.output_format) > 1: + if args.output_base == "-": + raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + elif not args.output_format: + # change to "human" for 5.0 + args.output_format = ["csv_summary"] + + except ValueError as exc: + error(f"ERROR: {str(exc)}") + import sys; sys.exit(-1) return sourmash.tax.__main__.metagenome(args) diff --git a/tests/test_tax.py b/tests/test_tax.py index 62374d59f2..33b5617cf3 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3387,6 +3387,70 @@ def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err +def test_metagenome_LINS_LINgroups_bad_cli_inputs(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + lg_file = runtmp.output("test.lg.csv") + with open(lg_file, 'w') as out: + out.write("") + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '-F', "LINgroup_report") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + assert "Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report." in c.last_result.err + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "LINgroup_report") + print(c.last_result.err) + assert c.last_result.status != 0 + assert "Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report." 
in c.last_result.err + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LINgroups', lg_file) + print(c.last_result.err) + assert c.last_result.status != 0 + assert "Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report." in c.last_result.err + + +def test_metagenome_mult_outputs_stdout_fail(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '-F', "kreport", 'csv_summary') + + print(c.last_result.err) + assert c.last_result.status != 0 + assert f"Writing to stdout is incompatible with multiple output formats ['kreport', 'csv_summary']" in c.last_result.err + + +def test_genome_mult_outputs_stdout_fail(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '-F', "lineage_csv", 'csv_summary') + + print(c.last_result.err) + assert c.last_result.status != 0 + assert f"Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" in c.last_result.err + + def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): c = runtmp From 6e6a34c777c7a4eedbbb3e31adf409eecf4a625a Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 15 Feb 2023 21:40:18 -0800 Subject: [PATCH 33/78] rm print --- src/sourmash/cli/tax/metagenome.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index eb8b423d35..4e6a5a6eb6 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -110,7 +110,6 @@ def main(args): raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") # handle output formats - print(args.output_format) if not args.rank: if any(x in ["krona", "lineage_summary"] for x in args.output_format): raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") From acfc843e3279eb8c426df3d6c19d1abd88be7465 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 17 Feb 2023 12:41:02 -0800 Subject: [PATCH 34/78] lingroup output as tsv --- src/sourmash/tax/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index c7017189d5..35094f63e7 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -43,7 +43,7 @@ 'human': '.human.txt', 'lineage_csv': '.lineage.csv', 'kreport': ".kreport.txt", - 'lingroup_report': ".lingroup_report.txt" + 'lingroup_report': ".lingroup_report.tsv" } def make_outfile(base, output_type, *, output_dir = ""): From bb1aea3e5dad01ac36116e4dae4b207b8fe9af8d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 17 Feb 2023 15:50:54 -0800 Subject: [PATCH 35/78] rm remaining lca_utils usage in tax --- src/sourmash/lca/lca_utils.py | 4 - src/sourmash/tax/tax_utils.py | 125 +++++++++++++++++++-------- tests/test_tax_utils.py | 157 ++++++++++++++++++++++++++++++---- 3 files changed, 227 insertions(+), 59 deletions(-) diff --git a/src/sourmash/lca/lca_utils.py b/src/sourmash/lca/lca_utils.py index b9864ed0a8..8ee9340ed7 100644 --- a/src/sourmash/lca/lca_utils.py +++ b/src/sourmash/lca/lca_utils.py @@ -120,10 +120,6 @@ def build_tree(assignments, initial=None): for assignment in assignments: node = tree - # when we switch LineagePair over, will need ot add this. - #if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINSLineageInfo)): - # assignment = assignment.filled_lineage - for lineage_tup in assignment: if lineage_tup.name: child = node.get(lineage_tup, {}) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 68d65083df..7c92919275 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -3,8 +3,7 @@ """ import os import csv -from collections import namedtuple, defaultdict -from collections import abc +from collections import abc, defaultdict from itertools import zip_longest from typing import NamedTuple from dataclasses import dataclass, field, replace, asdict @@ -30,10 +29,6 @@ RANKCODE = { "superkingdom": "D", "kingdom": "K", "phylum": "P", "class": "C", "order": "O", "family":"F", "genus": "G", "species": "S", "unclassified": "U"} -# import lca utils as needed for now -from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (taxlist) - class LineagePair(NamedTuple): rank: str name: str = None @@ -146,19 +141,15 @@ def _init_from_lineage_tuples(self): new_lineage.append(LineagePair(rank=rank)) for lin_tup in self.lineage: # now add input tuples in correct spots. This corrects for order and allows empty values. 
- if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): - raise ValueError(f"{lin_tup} is not LineagePair.") - # find index for this rank + if not isinstance(lin_tup, LineagePair): + raise ValueError(f"{lin_tup} is not tax_utils LineagePair.") if lin_tup.rank: # skip this tuple if rank is None or "" (empty lineage tuple. is this needed?) try: + # find index for this rank rank_idx = self.rank_index(lin_tup.rank) except ValueError as e: raise ValueError(f"Rank '{lin_tup.rank}' not present in {', '.join(self.ranks)}") from e - # make sure we're adding tax_utils.LineagePairs, not lca_utils.LineagePairs for consistency - if isinstance(lin_tup, lca_utils.LineagePair): - new_lineage[rank_idx] = LineagePair(rank=lin_tup.rank, name=lin_tup.name) - else: - new_lineage[rank_idx] = lin_tup + new_lineage[rank_idx] = lin_tup # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name is not None] @@ -437,13 +428,10 @@ def _init_from_lineage_tuples(self): ranks = [] # check this is a list or tuple of lineage tuples: for lin_tup in self.lineage: - if not isinstance(lin_tup, (LineagePair, lca_utils.LineagePair)): - raise ValueError(f"{lin_tup} is not LineagePair.") - # make sure we're adding tax_utils.LineagePairs, not lca_utils.LineagePairs for consistency - if isinstance(lin_tup, lca_utils.LineagePair): - new_lineage.append(LineagePair(rank=lin_tup.rank, name=lin_tup.name)) - else: - new_lineage.append(lin_tup) + # make sure we're adding tax_utils.LineagePairs + if not isinstance(lin_tup, LineagePair): + raise ValueError(f"{lin_tup} is not tax_utils LineagePair.") + new_lineage.append(lin_tup) ranks.append(lin_tup.rank) # build list of filled ranks filled_ranks = [a.rank for a in new_lineage if a.name is not None] @@ -455,6 +443,62 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "n_filled_pos", len(filled_ranks)) +def build_tree(assignments, initial=None): + """ + Builds a tree of dictionaries from lists of LineagePair objects 
or + LineageInfo objects in 'assignments'. This tree can then be used + to find lowest common ancestor agreements/confusion. + """ + if initial is None: + tree = {} + else: + tree = initial + + if not assignments: + raise ValueError("empty assignment passed to build_tree") + + if not isinstance(assignments, abc.Iterable): + raise ValueError("assignments must be an iterable object.") + + for assignment in assignments: + node = tree + + if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): + print(assignment) + assignment = assignment.filled_lineage + + for lineage_tup in assignment: + if lineage_tup.name: + child = node.get(lineage_tup, {}) + node[lineage_tup] = child + + # shift -> down in tree + node = child + + return tree + + +def find_lca(tree): + """ + Given a tree produced by 'find_tree', find the first node with multiple + children, OR the only leaf in the tree. Return (lineage_tup, reason), + where 'reason' is the number of children of the returned node, i.e. + 0 if it's a leaf and > 1 if it's an internal node. + """ + + node = tree + lineage = [] + while 1: + if len(node) == 1: # descend to only child; track path + lineage_tup = next(iter(node.keys())) + lineage.append(lineage_tup) + node = node[lineage_tup] + elif len(node) == 0: # at leaf; end + return tuple(lineage), 0 + else: # len(node) > 1 => confusion!! 
+ return tuple(lineage), len(node) + + def get_ident(ident, *, keep_full_identifiers=False, keep_identifier_versions=False): # split identifiers = split on whitespace @@ -886,7 +930,8 @@ def load(cls, filename, *, delimiter=',', force=False, header = ["ident" if "accession" == x else x for x in header] elif 'name' in header and 'lineage' in header: return cls.load_from_gather_with_lineages(filename, - force=force) + force=force, + LIN_taxonomy=LIN_taxonomy) else: header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') @@ -955,7 +1000,7 @@ def load(cls, filename, *, delimiter=',', force=False, @classmethod - def load_from_gather_with_lineages(cls, filename, *, force=False): + def load_from_gather_with_lineages(cls, filename, *, force=False, LIN_taxonomy=False): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. @@ -976,7 +1021,7 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): if "name" not in header or "lineage" not in header: raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?") - ranks = list(lca_utils.taxlist(include_strain=include_strain)) + ranks=None assignments = {} num_rows = 0 n_species = 0 @@ -988,24 +1033,32 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): name = row['name'] ident = get_ident(name) - lineage = row['lineage'] - lineage = lca_utils.make_lineage(lineage) + if LIN_taxonomy: + lineageInfo = LINLineageInfo(lineage_str=row['lineage']) + else: + lineageInfo = RankLineageInfo(lineage_str= row['lineage']) + + if ranks is None: + ranks = lineageInfo.ranks + + lineage = lineageInfo.filled_lineage # check duplicates if ident in assignments: - if assignments[ident] != tuple(lineage): + if assignments[ident] != lineage: # this should not happen with valid # sourmash tax annotate output, but check anyway. 
if not force: raise ValueError(f"multiple lineages for identifier {ident}") else: - assignments[ident] = tuple(lineage) + assignments[ident] = lineage - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 + if isinstance(lineageInfo, RankLineageInfo): + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 return LineageDB(assignments, ranks) @@ -1039,7 +1092,7 @@ def __init__(self, conn, *, table_name=None): # get available ranks... ranks = set() - for column, rank in zip(self.columns, taxlist(include_strain=True)): + for column, rank in zip(self.columns, RankLineageInfo().taxlist): query = f'SELECT COUNT({column}) FROM {self.table_name} WHERE {column} IS NOT NULL AND {column} != ""' c.execute(query) cnt, = c.fetchone() @@ -1083,7 +1136,7 @@ def load(cls, location): def _make_tup(self, row): "build a tuple of LineagePairs for this sqlite row" - tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] + tup = [ LineagePair(n, r) for (n, r) in zip(RankLineageInfo().taxlist, row) ] return tuple(tup) def __getitem__(self, ident): @@ -1283,7 +1336,7 @@ class TEXT, db.commit() def _save_csv(self, fp): - headers = ['identifiers'] + list(taxlist(include_strain=True)) + headers = ['identifiers'] + list(RankLineageInfo().taxlist) w = csv.DictWriter(fp, fieldnames=headers) w.writeheader() diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e6a2fc789d..a7d8f748f7 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -17,11 +17,9 @@ BaseLineageInfo, RankLineageInfo, LINLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, write_lineage_sample_frac, read_lingroups, + build_tree, find_lca, LineageDB, LineageDB_Sqlite, MultiLineageDB) -# import lca utils as needed -from sourmash.lca import lca_utils - # utility functions for testing def make_mini_taxonomy(tax_info, LIN=False): #pass 
in list of tuples: (name, lineage) @@ -928,13 +926,13 @@ def test_tax_multi_load_files_shadowed(runtmp): assert len(db.shadowed_identifiers()) == 6 # we should have everything including strain - assert set(lca_utils.taxlist()) == set(db.available_ranks) + assert set(RankLineageInfo().taxlist) == set(db.available_ranks) db = MultiLineageDB.load([taxonomy_csv, taxonomy_db], keep_full_identifiers=False, keep_identifier_versions=False) assert len(db.shadowed_identifiers()) == 6 - assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks) + assert set(RankLineageInfo().taxlist[:-1]) == set(db.available_ranks) def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): @@ -1168,7 +1166,7 @@ def test_BaseLineageInfo_init_not_lineagepair(): with pytest.raises(ValueError) as exc: BaseLineageInfo(lineage=lin_tups, ranks=ranks) print(str(exc)) - assert "is not LineagePair" in str(exc) + assert "is not tax_utils LineagePair" in str(exc) def test_RankLineageInfo_taxlist(): @@ -1241,7 +1239,7 @@ def test_LINLineageInfo_init_not_lineagepair(): with pytest.raises(ValueError) as exc: LINLineageInfo(lineage=lin_tups) print(str(exc)) - assert "is not LineagePair" in str(exc) + assert "is not tax_utils LineagePair" in str(exc) def test_LINLineageInfo_init_lineagepair(): @@ -1256,18 +1254,6 @@ def test_LINLineageInfo_init_lineagepair(): assert taxinf.n_filled_pos == 1 -def test_LINLineageInfo_init_lca_lineagepair(): - lin_tups = (lca_utils.LineagePair("rank1", "name1"), lca_utils.LineagePair("rank2", None),) - taxinf = LINLineageInfo(lineage=lin_tups) - print(taxinf.lineage) - assert taxinf.n_lin_positions == 2 - assert taxinf.zip_lineage()== ["name1", ""] - assert taxinf.zip_lineage(truncate_empty=True)== ["name1"] - assert taxinf.filled_ranks == ("rank1",) - assert taxinf.ranks == ("rank1", "rank2") - assert taxinf.n_filled_pos == 1 - - def test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" taxranks = ['superkingdom', 'phylum', 
'class', 'order', 'family', 'genus', 'species'] @@ -2801,3 +2787,136 @@ def test_read_lingroups_bad_header(runtmp): read_lingroups(lg_file) print(str(exc)) assert f"'{lg_file}' must contain the following columns: 'LINgroup_prefix', 'LINgroup_name'." in str(exc) + + +def test_build_tree_RankLineageInfo(): + x = "a;b" + lin1 = RankLineageInfo(lineage_str=x) + print(lin1) + tree = build_tree([lin1]) + assert tree == { LineagePair('superkingdom', 'a'): + { LineagePair('phylum', 'b') : {}} } + + +def test_build_tree_LINLineageInfo(): + x = "0;3" + lin1 = LINLineageInfo(lineage_str=x) + print(lin1) + tree = build_tree([lin1]) + assert tree == { LineagePair('0', '0'): + { LineagePair('1', '3') : {}} } + + +def test_build_tree_2(): + x = "a;b" + y = "a;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + print(lin1) + print(lin2) + tree = build_tree([lin1,lin2]) + + assert tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, + LineagePair('phylum', 'c') : {}} } + + +def test_build_tree_2_LineagePairs(): + # build tree from LineagePairs + tree = build_tree([[LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b')], + [LineagePair('superkingdom', 'a'), LineagePair('phylum', 'c')], + ]) + + assert tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, + LineagePair('phylum', 'c') : {}} } + + +def test_build_tree_3(): + # empty phylum name + x='a;' + lin1 = RankLineageInfo(lineage_str=x) + tree = build_tree([lin1]) + assert tree == { LineagePair('superkingdom', 'a'): {} } + + +def test_build_tree_3_LineagePairs(): + # empty phylum name: LineagePair input + lin1 = (LineagePair('superkingdom', "a", '3'), + LineagePair('phylum', '', ''),) + tree = build_tree([lin1]) + assert tree == { LineagePair('superkingdom', 'a', '3'): {} } + + +def test_build_tree_5(): + with pytest.raises(ValueError): + tree = build_tree([]) + + +def test_build_tree_5b(): + with pytest.raises(ValueError): + tree = 
build_tree("") + + +def test_build_tree_iterable(): + with pytest.raises(ValueError) as exc: + tree = build_tree(RankLineageInfo()) + assert "assignments must be an iterable object" in str(exc) + + +def test_find_lca(): + x='a;b' + lin1 = RankLineageInfo(lineage_str=x) + tree = build_tree([lin1]) + lca = find_lca(tree) + + assert lca == ((LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b'),), 0) + + +def test_find_lca_LineagePairs(): + tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + lca = find_lca(tree) + + assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) + + +def test_find_lca_2(): + x = "a;b" + y = "a;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + + tree = build_tree([lin1, lin2]) + lca = find_lca(tree) + + assert lca == ((LineagePair('superkingdom', 'a'),), 2) + + +def test_find_lca_LIN(): + x = "5;6" + y = "5;10" + lin1 = LINLineageInfo(lineage_str=x) + lin2 = LINLineageInfo(lineage_str=y) + + tree = build_tree([lin1, lin2]) + lca = find_lca(tree) + + assert lca == ((LineagePair('0', '5'),), 2) + print(lca) + + +def test_find_lca_2_LineagePairs(): + tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], + [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], + ]) + lca = find_lca(tree) + + assert lca == ((LineagePair('rank1', 'name1'),), 2) + + +def test_find_lca_3(): + lin1 = RankLineageInfo(lineage_str="a;b;c") + lin2 = RankLineageInfo(lineage_str="a;b") + + tree = build_tree([lin1, lin2]) + lca, reason = find_lca(tree) + assert lca == lin1.filled_lineage # find most specific leaf node + print(lca) From 7dba708dd4f408e1e9d0a97180451a3152a6eed0 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 17 Feb 2023 16:58:15 -0800 Subject: [PATCH 36/78] rm remaining lca_utils usage in tax main --- src/sourmash/cli/tax/summarize.py | 4 ++++ src/sourmash/tax/__main__.py | 12 +++++++----- tests/test_tax_utils.py | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 7fca17e837..cc65c5f6f8 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -46,6 +46,10 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past errors in file and taxonomy loading', ) + subparser.add_argument( + '--LIN-taxonomy', action='store_true', default=False, + help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + ) def main(args): import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 35094f63e7..ae8bfae294 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -11,10 +11,9 @@ import sourmash from ..sourmash_args import FileOutputCSV, FileOutput from sourmash.logging import set_quiet, error, notify, print_results -from sourmash.lca.lca_utils import zip_lineage from . import tax_utils -from .tax_utils import MultiLineageDB, GatherRow +from .tax_utils import MultiLineageDB, GatherRow, RankLineageInfo, LINLineageInfo usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. 
@@ -399,8 +398,7 @@ def search_pattern(l, r): else: with FileOutputCSV(args.output) as fp: w = csv.writer(fp) - - w.writerow(['ident'] + list(sourmash.lca.taxlist(include_strain=False))) + w.writerow(['ident'] + list(RankLineageInfo().taxlist[:-1])) for ident, lineage in sorted(match_ident): w.writerow([ident] + [ x.name for x in lineage ]) @@ -459,7 +457,11 @@ def summarize(args): # output in order of most common for lineage, count in lineage_counts.most_common(): rank = lineage[-1].rank - lin = ";".join(zip_lineage(lineage, truncate_empty=True)) + if args.LIN_taxonomy: + inf = LINLineageInfo(lineage=lineage) + else: + inf = RankLineageInfo(lineage=lineage) + lin = inf.display_lineage() w.writerow([rank, str(count), lin]) n = len(lineage_counts) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a7d8f748f7..19eb8bfe9e 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2920,3 +2920,23 @@ def test_find_lca_3(): lca, reason = find_lca(tree) assert lca == lin1.filled_lineage # find most specific leaf node print(lca) + + +def test_build_tree_with_initial(): + x = "a;b;c" + y = "a;b;d" + z = "a;e" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + lin3 = RankLineageInfo(lineage_str=z) + + tree = build_tree([lin1, lin2]) + lca = find_lca(tree) + + print(lca) + assert lca == ((LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None)), 2) + tree2 = build_tree([lin3], initial=tree) + lca2 = find_lca(tree2) + print(lca2) + assert lca2 == ((LineagePair('superkingdom', 'a'),), 2) From 1bb699084467ba41608fc83f0c67f33f594ba890 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 17 Feb 2023 16:59:26 -0800 Subject: [PATCH 37/78] rm print st --- src/sourmash/tax/tax_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 7c92919275..d22310cbd4 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -464,7 +464,6 @@ def build_tree(assignments, initial=None): node = tree if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): - print(assignment) assignment = assignment.filled_lineage for lineage_tup in assignment: From 4558644c2b4c689302040e73d1294aecc5ae14da Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 17 Feb 2023 18:48:54 -0800 Subject: [PATCH 38/78] enable LIN for summarize to rm lca utilities --- src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/tax/__main__.py | 3 +- src/sourmash/tax/tax_utils.py | 8 ++-- tests/test_tax.py | 70 +++++++++++++++++++++++++++++++ 4 files changed, 77 insertions(+), 6 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index cc65c5f6f8..abd8b706f3 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -48,7 +48,7 @@ def subparser(subparsers): ) subparser.add_argument( '--LIN-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + help='use LIN taxonomy in place of standard taxonomic ranks.' 
) def main(args): diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index ae8bfae294..c4cb850313 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -412,7 +412,8 @@ def summarize(args): tax_assign = MultiLineageDB.load(args.taxonomy_files, force=args.force, keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions=args.keep_identifier_versions) + keep_identifier_versions=args.keep_identifier_versions, + LIN_taxonomy=args.LIN_taxonomy) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d22310cbd4..8363b48202 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -914,9 +914,6 @@ def load(cls, filename, *, delimiter=',', force=False, if not header: raise ValueError(f'cannot read taxonomy assignments from {filename}') - if LIN_taxonomy and "LIN" not in header: - raise ValueError(f"'LIN' column not found: cannot read LIN taxonomy assignments from {filename}.") - identifier = "ident" # check for ident/identifier, handle some common alternatives if "ident" not in header: @@ -935,6 +932,9 @@ def load(cls, filename, *, delimiter=',', force=False, header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') + if LIN_taxonomy and "LIN" not in header: + raise ValueError(f"'LIN' column not found: cannot read LIN taxonomy assignments from {filename}.") + if not LIN_taxonomy: # is "strain" an available rank? 
if "strain" in header: @@ -1039,7 +1039,7 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, LIN_taxonomy=F lineageInfo = RankLineageInfo(lineage_str= row['lineage']) if ranks is None: - ranks = lineageInfo.ranks + ranks = lineageInfo.taxlist lineage = lineageInfo.filled_lineage # check duplicates diff --git a/tests/test_tax.py b/tests/test_tax.py index 33b5617cf3..7c98b4c35c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3289,6 +3289,50 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): assert c['1'] == 11 +def test_tax_summarize_LINS(runtmp): + # test basic operation w/LINs + taxfile = utils.get_test_data('tax/test.LINS-taxonomy.csv') + lineage_csv = runtmp.output('annotated-lin.csv') + + taxdb = tax_utils.LineageDB.load(taxfile, LIN_taxonomy=True) + with open(lineage_csv, 'w', newline="") as fp: + w = csv.writer(fp) + w.writerow(['name', 'lineage']) + for k, v in taxdb.items(): + lin = tax_utils.LINLineageInfo(lineage=v) + linstr = lin.display_lineage(truncate_empty=False) + print(linstr) + w.writerow([k, linstr]) + + runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv', '--LIN-taxonomy') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + assert "number of distinct taxonomic lineages: 6" in out + assert "saved 91 lineage counts to" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row['lineage_count'] + c[val] += 1 + + print(list(c.most_common())) + + assert c['1'] == 77 + assert c['2'] == 1 + assert c['3'] == 11 + assert c['4'] == 2 + + def test_metagenome_LINS(runtmp): # test basic metagenome with LIN taxonomy # get/design better test data for this? 
@@ -3472,3 +3516,29 @@ def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): assert c.last_result.status != 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err assert f"No LINgroups loaded from {lg_file}" in c.last_result.err + + +# def test_metagenome_LINS_csv_out(runtmp): +# # LIN taxonomy:: csv_summary out +# c = runtmp + +# g_csv = utils.get_test_data('tax/test1.gather.csv') +# tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + +# lg_out = c.output('base.lingroup_report.tsv') + +# c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy') + +# print(c.last_result.status) +# print(c.last_result.out) +# print(c.last_result.err) + +# assert c.last_result.status == 0 +# assert os.path.exists(lg_out) + +# results = [x.rstrip() for x in open(lg_out)] +# assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in results[0] +# # 0th rank/position +# assert "test1,0,0.089,1,md5,test1.sig,0.057,444000,0.925,0" in results[1] +# # 19th rank/position +# assert "test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" in results[-4] From 6fe08f18c559596fb174599a42f4bb6103fc2f47 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 20 Feb 2023 09:22:13 -0800 Subject: [PATCH 39/78] LIN pos for human summary; test LIN pos --- src/sourmash/cli/tax/genome.py | 2 +- src/sourmash/cli/tax/metagenome.py | 2 +- src/sourmash/tax/__main__.py | 6 ++- tests/test_tax.py | 71 ++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 37a9e0bd69..56b47b658c 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -113,7 +113,7 @@ def main(args): # handle LIN options if args.LIN_taxonomy: if args.LIN_position: - args.rank = args.LIN_position + args.rank = str(args.LIN_position) if args.LINgroups: if "LINgroup_report" not in args.output_format: args.output_format.append("LINgroup_report") diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 4e6a5a6eb6..e27da46582 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -100,7 +100,7 @@ def main(args): # handle LIN options if args.LIN_taxonomy: if args.LIN_position: - args.rank = args.LIN_position + args.rank = str(args.LIN_position) if args.LINgroups: if "LINgroup_report" not in args.output_format: args.output_format.append("LINgroup_report") diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index c4cb850313..aa3b7cc7db 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -146,7 +146,11 @@ def metagenome(args): summary_outfile, limit_float = make_outfile(args.output_base, "human", output_dir=args.output_dir) with FileOutput(summary_outfile) as out_fp: - tax_utils.write_human_summary(query_gather_results, out_fp, args.rank or "species") + human_display_rank = args.rank or "species" + if args.LIN_taxonomy and not args.rank: + human_display_rank = query_gather_results[0].ranks[-1] # lowest rank + + tax_utils.write_human_summary(query_gather_results, out_fp, human_display_rank) # 
write summarized output csv single_query_results = query_gather_results[0] diff --git a/tests/test_tax.py b/tests/test_tax.py index 7c98b4c35c..6673ec4d60 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3408,6 +3408,77 @@ def test_metagenome_LINS_LINgroups(runtmp): assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out +def test_metagenome_LINS_human_summary_no_lin_position(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '-F', "human") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert "sample name proportion cANI lineage" in c.last_result.out + assert "----------- ---------- ---- -------" in c.last_result.out + assert "test1 86.9% - unclassified" in c.last_result.out + assert "test1 5.8% 92.5% 0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out + assert "test1 5.0% 92.1% 1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out + assert "test1 1.6% 89.1% 2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out + assert "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out + + +def test_metagenome_LINS_human_summary_lin_position_5(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '-F', "human", '--LIN-position', '5') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "Starting summarization up rank(s): 19, 18, 17, 
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert "sample name proportion cANI lineage" in c.last_result.out + assert "----------- ---------- ---- -------" in c.last_result.out + assert "test1 86.9% - unclassified" in c.last_result.out + assert "test1 5.8% 92.5% 0;0;0;0;0;0" in c.last_result.out + assert "test1 5.0% 92.1% 1;0;0;0;0;0" in c.last_result.out + assert "test1 1.6% 89.1% 2;0;0;0;0;0" in c.last_result.out + assert "test1 0.7% 86.4% 1;0;1;0;0;0" in c.last_result.out + + +def test_metagenome_LINS_krona_lin_position_5(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--LIN-taxonomy', '-F', "krona", '--LIN-position', '5') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err + assert "fraction 0 1 2 3 4 5" in c.last_result.out + assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out + assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out + assert "0.027522935779816515 2 0 0 0 0 0" in c.last_result.out + assert "0.010769844435580374 1 0 1 0 0 0" in c.last_result.out + assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out + + def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): c = runtmp From 8fcd26bac4e82c300219cd09d64534afaf6f94c8 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 20 Feb 2023 11:32:12 -0800 Subject: [PATCH 40/78] allow completely empty LIN initialization --- src/sourmash/tax/tax_utils.py | 13 +++++++------ tests/test_tax_utils.py | 12 +++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 8363b48202..0ec3fa344d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -362,7 +362,7 @@ class LINLineageInfo(BaseLineageInfo): positions is less than provided lineages, initialization will fail. Otherwise, we will insert blanks beyond provided data in `lineage_str`. - LINLineageInfo must be initialized with lineage or n_lin_positions. + If no information is passed, an empty LINLineageInfo will be initialized (n_lin_positions=0). Input lineage information is only used for initialization of the final `lineage` and will not be used or compared in any other class methods. @@ -378,10 +378,8 @@ def __post_init__(self): self._init_from_lineage_tuples() elif self.lineage_str is not None: self._init_from_lineage_str() - elif self.n_lin_positions is not None: - self._init_empty() else: - raise ValueError("Please initialize 'LINLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'.") + self._init_empty() def _init_ranks_from_n_lin_positions(self): new_ranks = [str(x) for x in range(0, self.n_lin_positions)] @@ -390,6 +388,9 @@ def _init_ranks_from_n_lin_positions(self): def _init_empty(self): "initialize empty genome lineage" # first, set ranks from n_positions + if self.n_lin_positions is None: + # set n_lin_positions to 0 for completely empty LINLineageInfo + object.__setattr__(self, "n_lin_positions", 0) self._init_ranks_from_n_lin_positions() new_lineage=[] for rank in self.ranks: @@ -1526,7 +1527,7 @@ def __post_init__(self): self.f_unique_weighted = float(self.raw.f_unique_weighted) self.unique_intersect_bp = int(self.raw.unique_intersect_bp) if self.LIN_taxonomy: - self.lineageInfo = 
LINLineageInfo(n_lin_positions=0) + self.lineageInfo = LINLineageInfo() else: self.lineageInfo = RankLineageInfo() @@ -1871,7 +1872,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): # record unclassified if self.LIN_taxonomy: - lineage = LINLineageInfo(n_lin_positions=0) # empty + lineage = LINLineageInfo() else: lineage = RankLineageInfo() query_ani = None diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 19eb8bfe9e..3cc8e64b8e 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1184,11 +1184,13 @@ def test_RankLineageInfo_init_lineage_str(): assert taxinf.zip_lineage()== ['a', 'b', 'c', '', '', '', '', ''] -def test_LINLineageInfo_init_fail(): - with pytest.raises(ValueError) as exc: - LINLineageInfo() - print(str(exc)) - assert "Please initialize 'LINLineageInfo' with 'lineage', 'lineage_str' or 'n_lin_positions'." in str(exc) +def test_LINLineageInfo_init_empty(): + taxinf = LINLineageInfo() + assert taxinf.n_lin_positions == 0 + assert taxinf.zip_lineage()== [] + assert taxinf.display_lineage()== "" + assert taxinf.filled_ranks == () + assert taxinf.n_filled_pos == 0 def test_LINLineageInfo_init_n_pos(): From 8ebdfe2df6f2f04a42ff72c61b0464433ee1c549 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 20 Feb 2023 14:46:27 -0800 Subject: [PATCH 41/78] add find_lca method to LineageInfo classes --- src/sourmash/tax/tax_utils.py | 107 ++++++++++++++++++++++++++++++---- tests/test_tax_utils.py | 63 ++++++++++++++++++++ 2 files changed, 160 insertions(+), 10 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0ec3fa344d..e9f272518f 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -213,7 +213,7 @@ def check_rank_availability(self, rank): if rank in self.ranks: # rank is available return True raise ValueError(f"Desired Rank '{rank}' not available for this lineage.") - + def rank_is_filled(self, rank, other=None): self.check_rank_availability(rank) if other is not None: @@ -223,12 +223,17 @@ def rank_is_filled(self, rank, other=None): return True return False + def is_compatible(self, other): + if self.ranks == other.ranks: + return True + return False + def is_lineage_match(self, other, rank): """ check to see if two lineages are a match down to given rank. """ self.check_rank_availability(rank) - if not other.ranks == self.ranks: # check same ranks + if not self.is_compatible(other): raise ValueError("Cannot compare lineages from taxonomies with different ranks.") # always return false if rank is not filled in either of the two lineages if self.rank_is_filled(rank, other=other): @@ -263,6 +268,16 @@ def lineage_at_rank(self, rank): rank_idx = self.rank_index(rank) return self.filled_lineage[:rank_idx+1] + def find_lca(self, other): + """ + If an LCA match exists between self and other, + find and report LCA lineage. If not, return None. 
+ """ + for rank in self.ascending_taxlist: + if self.is_lineage_match(other, rank): + return self.pop_to_rank(rank) + return None + @dataclass(frozen=True, order=True) class RankLineageInfo(BaseLineageInfo): @@ -348,6 +363,7 @@ def _init_from_lineage_dict(self): object.__setattr__(self, "lineage", tuple(new_lineage)) object.__setattr__(self, "filled_ranks", tuple(filled_ranks)) + @dataclass(frozen=True, order=True) class LINLineageInfo(BaseLineageInfo): """ @@ -369,7 +385,8 @@ class LINLineageInfo(BaseLineageInfo): """ ranks: tuple = field(default=None, init=False, compare=False)# we will set this within class instead lineage: tuple = None - n_lin_positions: int = None # init with this to make empty LINLineageInfo with correct n_lin_positions + # init with n_positions if you want to set a specific number of positions + n_lin_positions: int = field(default=None, compare=False) def __post_init__(self): "Initialize according to passed values" @@ -381,6 +398,16 @@ def __post_init__(self): else: self._init_empty() + def __eq__(self, other): + """ + Check if two LINLineageInfo match. Since we sometimes want to match LINprefixes, which have fewer + total ranks, with full LINs, we only check for the filled_lineage to match and don't check that + the number of lin_positions match. + """ + if other == (): # if comparing to a null tuple, don't try to find its lineage before returning False + return False + return self.filled_lineage==other.filled_lineage + def _init_ranks_from_n_lin_positions(self): new_ranks = [str(x) for x in range(0, self.n_lin_positions)] object.__setattr__(self, "ranks", new_ranks) @@ -444,6 +471,23 @@ def _init_from_lineage_tuples(self): object.__setattr__(self, "n_filled_pos", len(filled_ranks)) + def is_compatible(self, other): + """ + Since we sometimes want to match LINprefixes with full LINs, + we don't want to enforce identical ranks. Here we just look to + make sure self and other share any ranks (LIN positions). 
+ + Since ranks are positions, this should be true for LINLineageInfo + unless one is empty. However, it should prevent comparison between + other LineageInfo instances and LINLineageInfo. + """ + # do self and other share any ranks? + if any(x in self.ranks for x in other.ranks): + return True + return False + + + def build_tree(assignments, initial=None): """ Builds a tree of dictionaries from lists of LineagePair objects or @@ -480,7 +524,7 @@ def build_tree(assignments, initial=None): def find_lca(tree): """ - Given a tree produced by 'find_tree', find the first node with multiple + Given a tree produced by 'build_tree', find the first node with multiple children, OR the only leaf in the tree. Return (lineage_tup, reason), where 'reason' is the number of children of the returned node, i.e. 0 if it's a leaf and > 1 if it's an internal node. @@ -2064,7 +2108,7 @@ def make_kreport_results(self): unclassified_recorded = True kreport_results.append(kresD) return header, kreport_results - + def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} self.check_summarization() header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] @@ -2075,14 +2119,13 @@ def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} all_lgs = list(LINgroupsD.keys()) for lg_prefix in all_lgs: lg_prefix_as_list = lg_prefix.split(';') - lg_rank = len(lg_prefix_as_list) -1 - all_lg_ranks.add(lg_rank) # bc 0 based + lg_rank = len(lg_prefix_as_list) -1 # bc 0 based + all_lg_ranks.add(lg_rank) rank_to_lgprefix[str(lg_rank)].add(lg_prefix) # order lg_ranks low--> high (general --> specific) ordered_lg_ranks = list(all_lg_ranks) - ordered_lg_ranks.sort() # ranks are str(int) .. how does this affect sorting? e.g. 1 vs 10? 
- # ordered_lg_ranks = [str(x-1) for x in ordered_lg_ranks] # because 0-based + ordered_lg_ranks.sort() lowest_rank = str(ordered_lg_ranks[-1]) lingroup_results = [] @@ -2097,4 +2140,48 @@ def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} this_lingroup_name = LINgroupsD[this_lineage] lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) lingroup_results.append(lg_resD) - return header, lingroup_results \ No newline at end of file + return header, lingroup_results + + # def make_lingroup_results_ordered(self, LINgroupsD): # dictionary {lg_prefix: lg_name} + # self.check_summarization() + # header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] + # if self.query_info.total_weighted_hashes == 0: + # raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") + # # first, order the LINgroups + + # all_lgs = set() + # lg_ranks = set() + # for lg_prefix in LINgroupsD.keys(): + # # store lineage info for LCA pathfinding + # lg_info = LINLineageInfo(lineage_str=lg_prefix) + # all_lgs.add(lg_info) + # # store rank so we only select summarized results at these ranks + # lg_rank = lg_info.lowest_rank + # lg_ranks.add(str(lg_rank)) + + # # now build tree from all lineage groups + # lg_tree = build_tree(all_lgs) + + # # grab summarized results matching LINgroup prefixes + # lg_results = {} + # for rank in lg_ranks: + # rank_results = self.summarized_lineage_results[rank] + # for res in rank_results: + # if res.lineage in all_lgs:# is this lineage in the list of LINgroups? + # this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] + # lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, res.lowest_rank) + # lg_results[lg_info] = lg_resD + + # # now find each LCA path and write results for lineage groups there. 
+ # ordered_lg_results = [] + + # while 1: + # this_path_results = [] + # lca, reason = find_lca(lg_tree) + # if reason == 0: # this is a leaf node / at bottom of a path + + # # now reverse this path's results and add to the ordered results + # path_results = this_path_results[::-1] + # ordered_lg_results.extend(path_results) + + # return header, lingroup_results diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 3cc8e64b8e..51ab8df1f0 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1256,6 +1256,51 @@ def test_LINLineageInfo_init_lineagepair(): assert taxinf.n_filled_pos == 1 +def test_lca_LINLineageInfo_diff_n_pos(): + x = "0;0;1" + y = '0' + lin1 = LINLineageInfo(lineage_str=x) + lin2 = LINLineageInfo(lineage_str=y) + assert lin1.is_compatible(lin2) + assert lin2.is_compatible(lin1) + lca_from_lin1 = lin1.find_lca(lin2) + lca_from_lin2 = lin2.find_lca(lin1) + assert lca_from_lin1 == lca_from_lin2 + assert lca_from_lin1.display_lineage(truncate_empty=True) == "0" + + +def test_lca_LINLineageInfo_no_lca(): + x = "0;0;1" + y = '12;0;1' + lin1 = LINLineageInfo(lineage_str=x) + lin2 = LINLineageInfo(lineage_str=y) + assert lin1.is_compatible(lin2) + assert lin2.is_compatible(lin1) + lca_from_lin1 = lin1.find_lca(lin2) + lca_from_lin2 = lin2.find_lca(lin1) + assert lca_from_lin1 == lca_from_lin2 == None + + +def test_lca_RankLineageInfo_no_lca(): + x = "a;b;c" + y = 'd;e;f;g' + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + assert lin1.is_compatible(lin2) + assert lin2.is_compatible(lin1) + lca_from_lin1 = lin1.find_lca(lin2) + lca_from_lin2 = lin2.find_lca(lin1) + assert lca_from_lin1 == lca_from_lin2 == None + + +def test_incompatibility_LINLineageInfo_RankLineageInfo(): + x="a;b;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = LINLineageInfo(lineage_str=x) + assert not lin1.is_compatible(lin2) + assert not lin2.is_compatible(lin1) + + def 
test_RankLineageInfo_init_lineage_str_with_ranks_as_list(): x = "a;b;c" taxranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] @@ -1482,6 +1527,7 @@ def test_is_lineage_match_1(): lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e') lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') print(lin1.lineage) + assert lin1.is_compatible(lin2) assert lin1.is_lineage_match(lin2, 'superkingdom') assert lin2.is_lineage_match(lin1, 'superkingdom') assert lin1.is_lineage_match(lin2, 'phylum') @@ -1498,11 +1544,19 @@ def test_is_lineage_match_1(): assert not lin1.is_lineage_match(lin2, 'species') assert not lin2.is_lineage_match(lin1, 'species') + lca_from_lin1 = lin1.find_lca(lin2) + print(lca_from_lin1.display_lineage()) + lca_from_lin2 = lin2.find_lca(lin1) + assert lca_from_lin1 == lca_from_lin2 + assert lca_from_lin1.display_lineage() == "d__a;p__b;c__c;o__d" + + def test_is_lineage_match_2(): # match at family, and above, levels; no genus or species to match lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + assert lin1.is_compatible(lin2) assert lin1.is_lineage_match(lin2, 'superkingdom') assert lin2.is_lineage_match(lin1, 'superkingdom') assert lin1.is_lineage_match(lin2, 'phylum') @@ -1519,12 +1573,19 @@ def test_is_lineage_match_2(): assert not lin1.is_lineage_match(lin2, 'species') assert not lin2.is_lineage_match(lin1, 'species') + lca_from_lin1 = lin1.find_lca(lin2) + print(lca_from_lin1.display_lineage()) + lca_from_lin2 = lin2.find_lca(lin1) + assert lca_from_lin1 == lca_from_lin2 + assert lca_from_lin1.display_lineage() == "d__a;p__b;c__c;o__d;f__f" + def test_is_lineage_match_3(): # one lineage is empty lin1 = RankLineageInfo() lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') + assert lin1.is_compatible(lin2) assert not lin1.is_lineage_match(lin2, 'superkingdom') assert not lin2.is_lineage_match(lin1, 
'superkingdom') assert not lin1.is_lineage_match(lin2, 'phylum') @@ -1547,6 +1608,7 @@ def test_is_lineage_match_incorrect_ranks(): lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e', ranks=taxranks[::-1]) lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') print(lin1.lineage) + assert not lin1.is_compatible(lin2) with pytest.raises(ValueError) as exc: lin1.is_lineage_match(lin2, 'superkingdom') print(str(exc)) @@ -1558,6 +1620,7 @@ def test_is_lineage_match_improper_rank(): lin1 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__e') lin2 = RankLineageInfo(lineage_str = 'd__a;p__b;c__c;o__d;f__f') print(lin1.lineage) + assert lin1.is_compatible(lin2) with pytest.raises(ValueError) as exc: lin1.is_lineage_match(lin2, 'NotARank') print(str(exc)) From 6a4449cf26542889916c988145fd94e472be5d24 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 08:25:19 -0800 Subject: [PATCH 42/78] enable LIN for tax annotate --- src/sourmash/cli/tax/annotate.py | 4 ++++ src/sourmash/tax/__main__.py | 4 ++-- tests/test_tax.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index e0c17b0019..0cf613d55e 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -59,6 +59,10 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past errors in file and taxonomy loading', ) + subparser.add_argument( + '--LIN-taxonomy', action='store_true', default=False, + help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' 
+ ) def main(args): import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index aa3b7cc7db..3ebb0d73eb 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -290,7 +290,7 @@ def annotate(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force) + force=args.force, LIN_taxonomy=args.LIN_taxonomy) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -308,7 +308,7 @@ def annotate(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - ) + LIN_taxonomy=args.LIN_taxonomy) if not query_gather_results: continue diff --git a/tests/test_tax.py b/tests/test_tax.py index 6673ec4d60..95f2783187 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2195,6 +2195,35 @@ def test_annotate_gzipped_gather(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] +def test_annotate_0_LIN(runtmp): + # test annotate basics + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir, "--LIN-taxonomy") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert os.path.exists(csvout) + + lin_gather_results = [x.rstrip() for x in open(csvout)] + print("\n".join(lin_gather_results)) + assert f"saving 'annotate' output to '{csvout}'" in runtmp.last_result.err + + assert "lineage" in lin_gather_results[0] + assert 
"0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in lin_gather_results[1] + assert "1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in lin_gather_results[2] + assert "2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in lin_gather_results[3] + assert "1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in lin_gather_results[4] + + def test_annotate_gather_argparse(runtmp): # test annotate with two gather CSVs, second one empty, and --force. # this tests argparse handling w/extend. From 3402c73dab8f106ead73f29264312532d300d473 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 08:31:27 -0800 Subject: [PATCH 43/78] punt tax genome to separate PR --- src/sourmash/cli/tax/genome.py | 24 ------------------------ src/sourmash/tax/__main__.py | 2 +- 2 files changed, 1 insertion(+), 25 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 56b47b658c..dc2707a3ba 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -90,18 +90,6 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past survivable errors in loading taxonomy database or gather results', ) - subparser.add_argument( - '--LIN-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' - ) - subparser.add_argument( - '--LIN-position', type=int, default=None, - help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' - ) - subparser.add_argument( - '--LINgroups', metavar='FILE', default=None, - help='CSV containing LINgroup_name, LINgroup_prefix. Will produce a "LINgroup_report" file containing taxonomic summarization for each LINgroup.' 
- ) add_tax_threshold_arg(subparser, 0.1) @@ -110,18 +98,6 @@ def main(args): try: if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") - # handle LIN options - if args.LIN_taxonomy: - if args.LIN_position: - args.rank = str(args.LIN_position) - if args.LINgroups: - if "LINgroup_report" not in args.output_format: - args.output_format.append("LINgroup_report") - elif "LINgroup_report" in args.output_format: - raise ValueError(f"Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report.") - elif args.LINgroups or "LINgroup_report" in args.output_format: - raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") - # handle output formats print(args.output_format) if not args.rank: diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 3ebb0d73eb..d84344d6b1 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -193,7 +193,7 @@ def genome(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force, LIN_taxonomy=args.LIN_taxonomy) + force=args.force) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") From ad367f289a0b3b16bf74226c2f473c424f8471c0 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 08:39:00 -0800 Subject: [PATCH 44/78] change LINs test filename --- ...INS-taxonomy.csv => test.LIN-taxonomy.csv} | 0 tests/test_tax.py | 26 +++++++++---------- tests/test_tax_utils.py | 6 ++--- 3 files changed, 16 insertions(+), 16 deletions(-) rename tests/test-data/tax/{test.LINS-taxonomy.csv => test.LIN-taxonomy.csv} (100%) diff --git a/tests/test-data/tax/test.LINS-taxonomy.csv b/tests/test-data/tax/test.LIN-taxonomy.csv similarity index 100% rename from tests/test-data/tax/test.LINS-taxonomy.csv rename to tests/test-data/tax/test.LIN-taxonomy.csv diff --git a/tests/test_tax.py b/tests/test_tax.py index 95f2783187..ce36ec4cc8 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2200,7 +2200,7 @@ def test_annotate_0_LIN(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) @@ -3320,7 +3320,7 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): def test_tax_summarize_LINS(runtmp): # test basic operation w/LINs - taxfile = utils.get_test_data('tax/test.LINS-taxonomy.csv') + taxfile = utils.get_test_data('tax/test.LIN-taxonomy.csv') lineage_csv = runtmp.output('annotated-lin.csv') taxdb = tax_utils.LineageDB.load(taxfile, LIN_taxonomy=True) @@ -3368,7 +3368,7 @@ def test_metagenome_LINS(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy') @@ -3407,7 +3407,7 @@ def test_metagenome_LINS_LINgroups(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = 
utils.get_test_data('tax/test.LIN-taxonomy.csv') lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: @@ -3441,7 +3441,7 @@ def test_metagenome_LINS_human_summary_no_lin_position(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy', '-F', "human") @@ -3465,7 +3465,7 @@ def test_metagenome_LINS_human_summary_lin_position_5(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy', '-F', "human", '--LIN-position', '5') @@ -3489,7 +3489,7 @@ def test_metagenome_LINS_krona_lin_position_5(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy', '-F', "krona", '--LIN-position', '5') @@ -3512,7 +3512,7 @@ def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: @@ -3535,7 +3535,7 @@ def test_metagenome_LINS_LINgroups_bad_cli_inputs(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: @@ -3569,7 +3569,7 @@ def 
test_metagenome_mult_outputs_stdout_fail(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, @@ -3584,7 +3584,7 @@ def test_genome_mult_outputs_stdout_fail(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, @@ -3599,7 +3599,7 @@ def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') - tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: @@ -3623,7 +3623,7 @@ def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): # c = runtmp # g_csv = utils.get_test_data('tax/test1.gather.csv') -# tax = utils.get_test_data('tax/test.LINS-taxonomy.csv') +# tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') # lg_out = c.output('base.lingroup_report.tsv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 51ab8df1f0..632a24c8be 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -600,7 +600,7 @@ def test_load_taxonomy_csv(): def test_load_taxonomy_csv_LIN(): - taxonomy_csv = utils.get_test_data('tax/test.LINS-taxonomy.csv') + taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv') tax_assign = MultiLineageDB.load([taxonomy_csv], LIN_taxonomy=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 
'GCF_000021665.1'] @@ -618,8 +618,8 @@ def test_load_taxonomy_csv_LIN_fail(): def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): - taxonomy_csv = utils.get_test_data('tax/test.LINS-taxonomy.csv') - mimatchLIN_csv = runtmp.output('mmLINS-taxonomy.csv') + taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv') + mimatchLIN_csv = runtmp.output('mmLIN-taxonomy.csv') with open(mimatchLIN_csv, 'w') as mm: tax21=[] tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] From 2e82b19dd1c2dda030027d822938f38dff44df6b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 08:44:13 -0800 Subject: [PATCH 45/78] clean up --- tests/test_tax.py | 45 +++++++++------------------------------------ 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index ce36ec4cc8..199ceffec4 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3362,9 +3362,8 @@ def test_tax_summarize_LINS(runtmp): assert c['4'] == 2 -def test_metagenome_LINS(runtmp): +def test_metagenome_LIN(runtmp): # test basic metagenome with LIN taxonomy - # get/design better test data for this? c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -3402,8 +3401,8 @@ def test_metagenome_LINS(runtmp): assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out -def test_metagenome_LINS_LINgroups(runtmp): - # get/design better test data for this? 
+def test_metagenome_LIN_LINgroups(runtmp): + # test LINgroups output c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3437,7 +3436,7 @@ def test_metagenome_LINS_LINgroups(runtmp): assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out -def test_metagenome_LINS_human_summary_no_lin_position(runtmp): +def test_metagenome_LIN_human_summary_no_lin_position(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3461,7 +3460,7 @@ def test_metagenome_LINS_human_summary_no_lin_position(runtmp): assert "test1 0.7% 86.4% 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0" in c.last_result.out -def test_metagenome_LINS_human_summary_lin_position_5(runtmp): +def test_metagenome_LIN_human_summary_lin_position_5(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3485,7 +3484,7 @@ def test_metagenome_LINS_human_summary_lin_position_5(runtmp): assert "test1 0.7% 86.4% 1;0;1;0;0;0" in c.last_result.out -def test_metagenome_LINS_krona_lin_position_5(runtmp): +def test_metagenome_LIN_krona_lin_position_5(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3508,7 +3507,7 @@ def test_metagenome_LINS_krona_lin_position_5(runtmp): assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out -def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): +def test_metagenome_LIN_LINgroups_empty_lg_file(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3531,7 +3530,7 @@ def test_metagenome_LINS_LINgroups_empty_lg_file(runtmp): assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" 
in c.last_result.err -def test_metagenome_LINS_LINgroups_bad_cli_inputs(runtmp): +def test_metagenome_LIN_LINgroups_bad_cli_inputs(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3595,7 +3594,7 @@ def test_genome_mult_outputs_stdout_fail(runtmp): assert f"Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" in c.last_result.err -def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): +def test_metagenome_LIN_LINgroups_lg_only_header(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3616,29 +3615,3 @@ def test_metagenome_LINS_LINgroups_lg_only_header(runtmp): assert c.last_result.status != 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err assert f"No LINgroups loaded from {lg_file}" in c.last_result.err - - -# def test_metagenome_LINS_csv_out(runtmp): -# # LIN taxonomy:: csv_summary out -# c = runtmp - -# g_csv = utils.get_test_data('tax/test1.gather.csv') -# tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') - -# lg_out = c.output('base.lingroup_report.tsv') - -# c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy') - -# print(c.last_result.status) -# print(c.last_result.out) -# print(c.last_result.err) - -# assert c.last_result.status == 0 -# assert os.path.exists(lg_out) - -# results = [x.rstrip() for x in open(lg_out)] -# assert 'query_name,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in results[0] -# # 0th rank/position -# assert "test1,0,0.089,1,md5,test1.sig,0.057,444000,0.925,0" in results[1] -# # 19th rank/position -# assert "test1,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925,0" in results[-4] From caa42f6b804cc4b3329880c37a53ef85829d618e Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 10:13:59 -0800 Subject: [PATCH 46/78] add some docs --- doc/command-line.md | 29 ++++++++++++++++++++++++++--- doc/databases.md | 7 +++++++ src/sourmash/cli/tax/genome.py | 2 +- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 15f9591c44..05b33436bd 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -91,6 +91,7 @@ information; these are grouped under the `sourmash tax` and * `tax metagenome` - summarize metagenome gather results at each taxonomic rank. * `tax genome` - summarize single-genome gather results and report most likely classification. * `tax annotate` - annotate gather results with lineage information (no summarization or classification). +* `tax prepare` - prepare and/or combine taxonomy files. * `tax grep` - subset taxonomies and create picklists based on taxonomy string matches. * `tax summarize` - print summary information (counts of lineages) for a taxonomy lineages file or database. @@ -491,7 +492,8 @@ The sourmash `tax` or `taxonomy` commands integrate taxonomic `gather` command (we cannot combine separate `gather` runs for the same query). For supported databases (e.g. GTDB, NCBI), we provide taxonomy csv files, but they can also be generated for user-generated - databases. For more information, see [databases](databases.md). + databases. As of v4.8, some sourmash taxonomy commands can also use `LIN` + lineage information. For more information, see [databases](databases.md). `tax` commands rely upon the fact that `gather` provides both the total fraction of the query matched to each database matched, as well as a @@ -530,8 +532,13 @@ sourmash tax metagenome --taxonomy gtdb-rs202.taxonomy.v2.csv ``` -There are three possible output formats, `csv_summary`, `lineage_summary`, and - `krona`. 
+The possible output formats are: +- `human` +- `csv_summary` +- `lineage_summary` +- `krona` +- `kreport` +- `LINgroup_report` #### `csv_summary` output format @@ -707,6 +714,22 @@ example sourmash `{output-name}.kreport.txt`: ``` +#### `LINgroup_report` output format + +When using `LIN` taxonomic information, you can optionally also provide a `LINgroups` file with `LINgroup_name` and `LINgroup_prefix` columns. If provided, we will output a `LINgroup_report` of the format `{base}.lingroup_report.tsv`, where `{base}` is the name provided via the `-o`, `--output-base` option. This output includes just the subset of `LIN` positions that match the provided prefixes (selected from the full summary). The output will contain the `LINgroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this LINgroup and all descendants, and `num_bp_contained`, the estimated number of base pairs contained in this LINgroup and all descendants. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. + +example output: +``` +LINgroup_name LINgroup_prefix percent_containment num_bp_contained +lg1 0;0;0 5.82 714000 +lg2 1;0;0 5.05 620000 +lg3 2;0;0 1.56 192000 +lg3 1;0;1 0.65 80000 +lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 +``` + +LINgroup subpaths will be grouped in output, but exact ordering may change between runs. 
+ ### `sourmash tax genome` - classify a genome using `gather` results `sourmash tax genome` reports likely classification for each query, diff --git a/doc/databases.md b/doc/databases.md index 0111625306..847565eb4e 100644 --- a/doc/databases.md +++ b/doc/databases.md @@ -15,6 +15,13 @@ You can read more about the different database and index types [here](https://so Note that the SBT and LCA databases can be used with sourmash v3.5 and later, while Zipfile collections can only be used with sourmash v4.1 and up. +## Taxonomic Information (for non-LCA databases) + +For each prepared database, we have also made taxonomic information available linking each genome with its assigned lineage (`GTDB` or `NCBI` as appropriate). +For private databases, users can create their own `taxonomy` files: the critical columns are `ident`, containing the genome accession (e.g. `GCA_1234567.1`) and +a column for each taxonomic rank, `superkingdom` to `species`. If a `strain` column is provided, it will also be used. +As of v4.8, we can also use `LIN` taxonomic information in tax commands that accept the `--LIN-taxonomy` flag. If used, `sourmash tax` commands will require a `LIN` column in the taxonomy file which should contain `;`-separated LINs, preferably with a standard number of positions (e.g. all 20 positions in length or all 10 positions in length). Some taxonomy commands also accept a `LINgroups` file, which is a two-column file (`LINgroup_name`, `LINgroup_prefix`) describing the name and `LIN` prefix of LINgroups to be used for taxonomic summarization. + ## Downloading and using the databases All databases below can be downloaded via the command line with `curl -JLO <url>`, where `<url>` is the URL below. This will download an appropriately named file; you can name it yourself by specify `'-o ` to specify the local filename. 
diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index dc2707a3ba..555f812a25 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -83,7 +83,7 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action='extend', - choices=["csv_summary", "krona", "human", "lineage_csv", "LINgroup_report"], + choices=["csv_summary", "krona", "human", "lineage_csv"], help='choose output format(s)', ) subparser.add_argument( From 2dd45b6b5f285bfebf3e3e21213fbf9356ea1eaf Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Mon, 6 Mar 2023 09:23:28 -0800 Subject: [PATCH 47/78] MRG: LineageTree class to help with LINGroup ordering (#2496) Note: based off of https://github.com/sourmash-bio/sourmash/pull/2469 - Rewrites `build_tree`, `find_lca` functions as `LineageTree` class. Using same tests, we produce same results - Adds `ordered_paths` method to produce ~ordered lineages from tree for LINgroup ordered output. - Removes `num_bp_assigned` column because it was artificial anyway (our counts are all assigned at the genome level) and we're not trying to replicate a format exactly, as we are with `kreport` output, where we have this column. Note that LINgroups will not be ordered _absolutely_, as there will be some stochasticity as we descend the dictionary. Instead, related subpaths will be grouped. e.g. two potential outputs: ``` LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned lg3 2;0;0 1.56 192000 0 lg1 0;0;0 5.82 714000 0 lg2 1;0;0 5.05 620000 0 lg3 1;0;1 0.65 80000 0 lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000 ``` ``` LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned lg2 1;0;0 5.05 620000 0 lg3 1;0;1 0.65 80000 0 lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000 lg1 0;0;0 5.82 714000 0 lg3 2;0;0 1.56 192000 0 ``` In these examples, the `1;0`.. 
paths are always grouped together, but may come before or after the `0;0` and `2;0` groups --- src/sourmash/tax/tax_utils.py | 226 ++++++++++++++++------------------ tests/test_tax.py | 14 +-- tests/test_tax_utils.py | 195 ++++++++++++++++++++++------- 3 files changed, 263 insertions(+), 172 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e9f272518f..2bd9050d40 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -258,8 +258,7 @@ def pop_to_rank(self, rank): return new def lineage_at_rank(self, rank): - "non-destructive pop_to_rank. Returns tuple of LineagePairs" - "Returns tuple of LineagePairs at given rank." + "Return tuple of LineagePairs at specified rank." # are we already above rank? self.check_rank_availability(rank) if not self.rank_is_filled(rank): @@ -488,59 +487,83 @@ def is_compatible(self, other): -def build_tree(assignments, initial=None): +@dataclass +class LineageTree: """ - Builds a tree of dictionaries from lists of LineagePair objects or + Builds a tree of dictionaries from lists of LineagePair or LineageInfo objects in 'assignments'. This tree can then be used to find lowest common ancestor agreements/confusion. 
""" - if initial is None: - tree = {} - else: - tree = initial - - if not assignments: - raise ValueError("empty assignment passed to build_tree") - - if not isinstance(assignments, abc.Iterable): - raise ValueError("assignments must be an iterable object.") + assignments: list = field(compare=False) - for assignment in assignments: - node = tree - - if isinstance(assignment, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): - assignment = assignment.filled_lineage - - for lineage_tup in assignment: + def __post_init__(self): + self.tree = {} + self.add_lineages(self.assignments) + + def add_lineage(self, lineage): + if isinstance(lineage, (BaseLineageInfo, RankLineageInfo, LINLineageInfo)): + lineage = lineage.filled_lineage + node = self.tree + for lineage_tup in lineage: if lineage_tup.name: child = node.get(lineage_tup, {}) node[lineage_tup] = child - # shift -> down in tree node = child - return tree + def add_lineages(self, lineages): + if not lineages: + raise ValueError("empty assignment passed to build_tree") + if not isinstance(lineages, abc.Iterable): + raise ValueError("Must pass in an iterable containing LineagePair or LineageInfo objects.") + for lineageInf in lineages: + self.add_lineage(lineageInf) - -def find_lca(tree): - """ - Given a tree produced by 'build_tree', find the first node with multiple - children, OR the only leaf in the tree. Return (lineage_tup, reason), - where 'reason' is the number of children of the returned node, i.e. - 0 if it's a leaf and > 1 if it's an internal node. - """ - - node = tree - lineage = [] - while 1: - if len(node) == 1: # descend to only child; track path - lineage_tup = next(iter(node.keys())) - lineage.append(lineage_tup) - node = node[lineage_tup] - elif len(node) == 0: # at leaf; end - return tuple(lineage), 0 - else: # len(node) > 1 => confusion!! 
- return tuple(lineage), len(node) + def find_lca(self): + """ + Given a LineageTree tree, find the first node with multiple + children, OR the only leaf in the tree. Return (lineage_tup, reason), + where 'reason' is the number of children of the returned node, i.e. + 0 if it's a leaf and > 1 if it's an internal node. + """ + node = self.tree + lca = [] + while 1: + if len(node) == 1: # descend to only child; track path + lineage_tup = next(iter(node.keys())) + lca.append(lineage_tup) + node = node[lineage_tup] + elif len(node) == 0: # at leaf; end + return tuple(lca), 0 + else: # len(node) > 1 => confusion!! + return tuple(lca), len(node) + + def ordered_paths(self, include_internal=False): + """ + Find all paths in the nested dict in a depth-first manner. + Each path is a tuple of lineage tuples that lead from the root + to a leaf node. Optionally include internal nodes by building + them up from leaf nodes (for ordering). + """ + paths = [] + stack = [((), self.tree)] + while stack: + path, node = stack.pop() + for key, val in node.items(): + if len(val) == 0: # leaf node + # if want internal paths, build up from leaf + if include_internal: + internal_path = path + while internal_path: + if internal_path not in paths: + paths.append(internal_path) + if isinstance(internal_path, abc.Iterable): + internal_path = internal_path[:-1] + # now add leaf path + paths.append(path + (key,)) + else: # not leaf, add to stack + stack.append((path + (key,), val)) + return paths def get_ident(ident, *, @@ -1706,7 +1729,7 @@ def as_kreport_dict(self, query_info): sD["num_bp_assigned"] = sD["num_bp_contained"] return sD - def as_lingroup_dict(self, query_info, lg_name, lowest_rank): + def as_lingroup_dict(self, query_info, lg_name): """ Produce LINgroup report dict for LINgroups. 
""" @@ -1714,13 +1737,6 @@ def as_lingroup_dict(self, query_info, lg_name, lowest_rank): # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - sD["num_bp_assigned"] = str(0) - if self.lineage.n_lin_positions != 0: #empty lineage - # the number of bp actually 'assigned' at this rank. Sourmash assigns everything - # at genome level - not sure how we want to handle 'num_bp_assigned' here.. - if self.lineage.lowest_rank == lowest_rank: - sD["num_bp_assigned"] = sD["num_bp_contained"] - sD["LINgroup_prefix"] = self.lineage.display_lineage() sD["LINgroup_name"] = lg_name return sD @@ -2109,79 +2125,51 @@ def make_kreport_results(self): kreport_results.append(kresD) return header, kreport_results - def make_lingroup_results(self, LINgroupsD): # dictionary {lg_prefix: lg_name} + def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_prefix: lg_name} + """ + Report results for the specified LINGroups. + Keep LCA paths in order as much as possible. 
+ """ self.check_summarization() - header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] + header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained"] + if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") - all_lg_ranks = set() - rank_to_lgprefix = defaultdict(set) - all_lgs = list(LINgroupsD.keys()) - for lg_prefix in all_lgs: - lg_prefix_as_list = lg_prefix.split(';') - lg_rank = len(lg_prefix_as_list) -1 # bc 0 based - all_lg_ranks.add(lg_rank) - rank_to_lgprefix[str(lg_rank)].add(lg_prefix) - - # order lg_ranks low--> high (general --> specific) - ordered_lg_ranks = list(all_lg_ranks) - ordered_lg_ranks.sort() - lowest_rank = str(ordered_lg_ranks[-1]) - - lingroup_results = [] - for rank in ordered_lg_ranks: + # find the ranks we need to consider + all_lgs = set() + lg_ranks = set() + for lg_prefix in LINgroupsD.keys(): + # store lineage info for LCA pathfinding + lg_info = LINLineageInfo(lineage_str=lg_prefix) + all_lgs.add(lg_info) + # store rank so we only go through summarized results at these ranks + lg_rank = int(lg_info.lowest_rank) + lg_ranks.add(lg_rank) + + # grab summarized results matching LINgroup prefixes + lg_results = {} + for rank in lg_ranks: rank = str(rank) - these_lgs = rank_to_lgprefix[rank] rank_results = self.summarized_lineage_results[rank] for res in rank_results: - this_lineage = res.lineage.display_lineage() - if this_lineage in these_lgs: # is this lineage in the list of LINgroups at this rank? 
- this_lingroup_name = LINgroupsD[this_lineage] - lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) - lingroup_results.append(lg_resD) - return header, lingroup_results - - # def make_lingroup_results_ordered(self, LINgroupsD): # dictionary {lg_prefix: lg_name} - # self.check_summarization() - # header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] - # if self.query_info.total_weighted_hashes == 0: - # raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") - # # first, order the LINgroups - - # all_lgs = set() - # lg_ranks = set() - # for lg_prefix in LINgroupsD.keys(): - # # store lineage info for LCA pathfinding - # lg_info = LINLineageInfo(lineage_str=lg_prefix) - # all_lgs.add(lg_info) - # # store rank so we only select summarized results at these ranks - # lg_rank = lg_info.lowest_rank - # lg_ranks.add(str(lg_rank)) + if res.lineage in all_lgs:# is this lineage in the list of LINgroups? 
+ this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] + lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name) + lg_results[res.lineage] = lg_resD + + # We want to return in ~ depth order: descending each specific path in order + # use LineageTree to find ordered paths + lg_tree = LineageTree(all_lgs) + ordered_paths = lg_tree.ordered_paths(include_internal = True) + # store results in order: + lingroup_results=[] + for lg in ordered_paths: + # get LINInfo object + lg_LINInfo = LINLineageInfo(lineage=lg) + # get result, if we have it + lg_res = lg_results.get(lg_LINInfo) + if lg_res: + lingroup_results.append(lg_res) - # # now build tree from all lineage groups - # lg_tree = build_tree(all_lgs) - - # # grab summarized results matching LINgroup prefixes - # lg_results = {} - # for rank in lg_ranks: - # rank_results = self.summarized_lineage_results[rank] - # for res in rank_results: - # if res.lineage in all_lgs:# is this lineage in the list of LINgroups? - # this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] - # lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, res.lowest_rank) - # lg_results[lg_info] = lg_resD - - # # now find each LCA path and write results for lineage groups there. 
- # ordered_lg_results = [] - - # while 1: - # this_path_results = [] - # lca, reason = find_lca(lg_tree) - # if reason == 0: # this is a leaf node / at bottom of a path - - # # now reverse this path's results and add to the ordered results - # path_results = this_path_results[::-1] - # ordered_lg_results.extend(path_results) - - # return header, lingroup_results + return header, lingroup_results diff --git a/tests/test_tax.py b/tests/test_tax.py index 199ceffec4..9f4b1ebe13 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3415,7 +3415,7 @@ def test_metagenome_LIN_LINgroups(runtmp): out.write('1;0;0,lg2\n') out.write('2;0;0,lg3\n') out.write('1;0;1,lg3\n') - # write a 19 so we can check 'num_bp_assigned' + # write a 19 so we can check the end out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, @@ -3428,12 +3428,12 @@ def test_metagenome_LIN_LINgroups(runtmp): assert c.last_result.status == 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." 
in c.last_result.err - assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned" in c.last_result.out - assert "lg1 0;0;0 5.82 714000 0" in c.last_result.out - assert "lg2 1;0;0 5.05 620000 0" in c.last_result.out - assert "lg3 2;0;0 1.56 192000 0" in c.last_result.out - assert "lg3 1;0;1 0.65 80000 0" in c.last_result.out - assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out + assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained" in c.last_result.out + assert "lg1 0;0;0 5.82 714000" in c.last_result.out + assert "lg2 1;0;0 5.05 620000" in c.last_result.out + assert "lg3 2;0;0 1.56 192000" in c.last_result.out + assert "lg3 1;0;1 0.65 80000" in c.last_result.out + assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" in c.last_result.out def test_metagenome_LIN_human_summary_no_lin_position(runtmp): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 632a24c8be..8d24f44382 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -17,8 +17,7 @@ BaseLineageInfo, RankLineageInfo, LINLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, write_lineage_sample_frac, read_lingroups, - build_tree, find_lca, - LineageDB, LineageDB_Sqlite, MultiLineageDB) + LineageTree, LineageDB, LineageDB_Sqlite, MultiLineageDB) # utility functions for testing def make_mini_taxonomy(tax_info, LIN=False): @@ -169,14 +168,14 @@ def test_SummarizedGatherResult_LINs(): sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"), f_weighted_at_rank=0.3, bp_match_at_rank=30) - lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="4") + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "0", + assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 
'percent_containment': '30.00', 'num_bp_contained': "600"} - lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3") + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", - 'num_bp_assigned': "0",'percent_containment': '30.00', 'num_bp_contained': "600"} + 'percent_containment': '30.00', 'num_bp_contained': "600"} with pytest.raises(ValueError) as exc: sgr.as_kreport_dict(query_info=qInf) print(str(exc)) @@ -2792,14 +2791,17 @@ def test_make_lingroup_results(): header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) print(header) - assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained', 'num_bp_assigned'] - print(lgD) - assert lgD == [{'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0', - 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'}, - {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '40', - 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}, - {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '20', - 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}] + assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained'] + # order may change, just check that each lg entry is present in list of results + lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', + 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'} + lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', + 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'} + lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', + 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'} + assert lg1 in lgD + assert lg2 in lgD + assert lg3 in lgD def test_make_lingroup_results_fail_pre_v450(): @@ -2854,12 +2856,65 @@ def test_read_lingroups_bad_header(runtmp): assert f"'{lg_file}' must contain the following columns: 
'LINgroup_prefix', 'LINgroup_name'." in str(exc) +def test_LineageTree_init(): + x = "a;b" + lin1 = RankLineageInfo(lineage_str=x) + print(lin1) + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a'): + { LineagePair('phylum', 'b') : {}} } + +def test_LineageTree_init_mult(): + x = "a;b" + y = "a;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + print(lin1) + from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1, lin2]) + assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): + {LineagePair(rank='phylum', name='b', taxid=None): {}, + LineagePair(rank='phylum', name='c', taxid=None): {}}} + + +def test_LineageTree_init_and_add_lineage(): + x = "a;b" + y = "a;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + print(lin1) + from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a'): + { LineagePair('phylum', 'b') : {}} } + tree.add_lineage(lin2) + assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): + {LineagePair(rank='phylum', name='b', taxid=None): {}, + LineagePair(rank='phylum', name='c', taxid=None): {}}} + + +def test_LineageTree_init_and_add_lineages(): + x = "a;b" + y = "a;c" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + print(lin1) + from sourmash.tax.tax_utils import LineageTree + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a'): + { LineagePair('phylum', 'b') : {}} } + tree.add_lineages([lin2]) + assert tree.tree == {LineagePair(rank='superkingdom', name='a', taxid=None): + {LineagePair(rank='phylum', name='b', taxid=None): {}, + LineagePair(rank='phylum', name='c', taxid=None): {}}} + + def test_build_tree_RankLineageInfo(): x = "a;b" lin1 = RankLineageInfo(lineage_str=x) print(lin1) - tree = build_tree([lin1]) - assert tree == { 
LineagePair('superkingdom', 'a'): + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}} } @@ -2867,8 +2922,8 @@ def test_build_tree_LINLineageInfo(): x = "0;3" lin1 = LINLineageInfo(lineage_str=x) print(lin1) - tree = build_tree([lin1]) - assert tree == { LineagePair('0', '0'): + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('0', '0'): { LineagePair('1', '3') : {}} } @@ -2879,19 +2934,19 @@ def test_build_tree_2(): lin2 = RankLineageInfo(lineage_str=y) print(lin1) print(lin2) - tree = build_tree([lin1,lin2]) + tree = LineageTree([lin1,lin2]) - assert tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, + assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, LineagePair('phylum', 'c') : {}} } def test_build_tree_2_LineagePairs(): # build tree from LineagePairs - tree = build_tree([[LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b')], + tree = LineageTree([[LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b')], [LineagePair('superkingdom', 'a'), LineagePair('phylum', 'c')], ]) - assert tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, + assert tree.tree == { LineagePair('superkingdom', 'a'): { LineagePair('phylum', 'b') : {}, LineagePair('phylum', 'c') : {}} } @@ -2899,46 +2954,46 @@ def test_build_tree_3(): # empty phylum name x='a;' lin1 = RankLineageInfo(lineage_str=x) - tree = build_tree([lin1]) - assert tree == { LineagePair('superkingdom', 'a'): {} } + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a'): {} } def test_build_tree_3_LineagePairs(): # empty phylum name: LineagePair input lin1 = (LineagePair('superkingdom', "a", '3'), LineagePair('phylum', '', ''),) - tree = build_tree([lin1]) - assert tree == { LineagePair('superkingdom', 'a', '3'): {} } + tree = LineageTree([lin1]) + assert tree.tree == { LineagePair('superkingdom', 'a', 
'3'): {} } def test_build_tree_5(): with pytest.raises(ValueError): - tree = build_tree([]) + tree = LineageTree([]) def test_build_tree_5b(): with pytest.raises(ValueError): - tree = build_tree("") + tree = LineageTree("") def test_build_tree_iterable(): with pytest.raises(ValueError) as exc: - tree = build_tree(RankLineageInfo()) - assert "assignments must be an iterable object" in str(exc) + tree = LineageTree(RankLineageInfo()) + assert "Must pass in an iterable containing LineagePair or LineageInfo objects" in str(exc) def test_find_lca(): x='a;b' lin1 = RankLineageInfo(lineage_str=x) - tree = build_tree([lin1]) - lca = find_lca(tree) + tree = LineageTree([lin1]) + lca = tree.find_lca() assert lca == ((LineagePair('superkingdom', 'a'), LineagePair('phylum', 'b'),), 0) def test_find_lca_LineagePairs(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) - lca = find_lca(tree) + tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2')]]) + lca = tree.find_lca() assert lca == ((LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2'),), 0) @@ -2949,8 +3004,8 @@ def test_find_lca_2(): lin1 = RankLineageInfo(lineage_str=x) lin2 = RankLineageInfo(lineage_str=y) - tree = build_tree([lin1, lin2]) - lca = find_lca(tree) + tree = LineageTree([lin1, lin2]) + lca = tree.find_lca() assert lca == ((LineagePair('superkingdom', 'a'),), 2) @@ -2961,18 +3016,18 @@ def test_find_lca_LIN(): lin1 = LINLineageInfo(lineage_str=x) lin2 = LINLineageInfo(lineage_str=y) - tree = build_tree([lin1, lin2]) - lca = find_lca(tree) + tree = LineageTree([lin1, lin2]) + lca = tree.find_lca() assert lca == ((LineagePair('0', '5'),), 2) print(lca) def test_find_lca_2_LineagePairs(): - tree = build_tree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], + tree = LineageTree([[LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2a')], [LineagePair('rank1', 'name1'), LineagePair('rank2', 'name2b')], ]) - lca = 
find_lca(tree) + lca = tree.find_lca() assert lca == ((LineagePair('rank1', 'name1'),), 2) @@ -2981,8 +3036,8 @@ def test_find_lca_3(): lin1 = RankLineageInfo(lineage_str="a;b;c") lin2 = RankLineageInfo(lineage_str="a;b") - tree = build_tree([lin1, lin2]) - lca, reason = find_lca(tree) + tree = LineageTree([lin1, lin2]) + lca, reason = tree.find_lca() assert lca == lin1.filled_lineage # find most specific leaf node print(lca) @@ -2995,13 +3050,61 @@ def test_build_tree_with_initial(): lin2 = RankLineageInfo(lineage_str=y) lin3 = RankLineageInfo(lineage_str=z) - tree = build_tree([lin1, lin2]) - lca = find_lca(tree) + tree = LineageTree([lin1, lin2]) + lca = tree.find_lca() print(lca) assert lca == ((LineagePair(rank='superkingdom', name='a', taxid=None), LineagePair(rank='phylum', name='b', taxid=None)), 2) - tree2 = build_tree([lin3], initial=tree) - lca2 = find_lca(tree2) + tree.add_lineages([lin3]) + lca2 = tree.find_lca() print(lca2) assert lca2 == ((LineagePair('superkingdom', 'a'),), 2) + + +def test_LineageTree_find_ordered_paths(): + x = "a;b;c" + y = "a;b;d" + z = "a;e" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + lin3 = RankLineageInfo(lineage_str=z) + + tree = LineageTree([lin1, lin2, lin3]) + paths = tree.ordered_paths() + + print(paths) + assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='e', taxid=None)), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None), + LineagePair(rank='class', name='c', taxid=None)), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None), + LineagePair(rank='class', name='d', taxid=None))] + + +def test_LineageTree_find_ordered_paths_include_internal(): + x = "a;b;c" + y = "a;b;d" + z = "a;e" + lin1 = RankLineageInfo(lineage_str=x) + lin2 = RankLineageInfo(lineage_str=y) + lin3 = RankLineageInfo(lineage_str=z) + + tree = 
LineageTree([lin1, lin2, lin3]) + paths = tree.ordered_paths(include_internal=True) + + print(paths) + + assert paths == [(LineagePair(rank='superkingdom', name='a', taxid=None),), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='e', taxid=None)), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None)), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None), + LineagePair(rank='class', name='c', taxid=None)), + (LineagePair(rank='superkingdom', name='a', taxid=None), + LineagePair(rank='phylum', name='b', taxid=None), + LineagePair(rank='class', name='d', taxid=None))] From a08b46bb643bdbb7537a6279e76ca82d55ddf9ea Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 13:11:33 -0800 Subject: [PATCH 48/78] simplify linputs --- doc/command-line.md | 10 ++-- doc/databases.md | 2 +- src/sourmash/cli/tax/annotate.py | 2 +- src/sourmash/cli/tax/metagenome.py | 45 ++++------------- src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/cli/utils.py | 51 ++++++++++++++++++++ src/sourmash/tax/__main__.py | 18 +++---- src/sourmash/tax/tax_utils.py | 58 +++++++++++----------- tests/test-data/tax/test.LIN-taxonomy.csv | 2 +- tests/test_tax.py | 59 ++++++++++++----------- tests/test_tax_utils.py | 32 ++++++------ 11 files changed, 155 insertions(+), 126 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 05b33436bd..006cee050b 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -538,7 +538,7 @@ The possible output formats are: - `lineage_summary` - `krona` - `kreport` -- `LINgroup_report` +- `lingroup_report` #### `csv_summary` output format @@ -714,13 +714,13 @@ example sourmash `{output-name}.kreport.txt`: ``` -#### `LINgroup_report` output format +#### `lingroup_report` output format -When using `LIN` taxonomic information, you can optionally also provide a `LINgroups` 
with `LINgroup_name` and `LINgroup_prefix` columns. If provided, we will output a `LINgroup_report` of the format `{base}.lingroup_report.tsv`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output includes just the subset of `LIN` positions that match the provided prefixes (selected from the full summary). The output will the `LINgroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this LINgroup and all descendents, and `num_bp_contained`, the estimated number of base pairs contained in this LINgroup and all descendents. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. +When using LIN taxonomic information, you can optionally also provide a `lingroups` with `lingroup_name` and `lingroup_prefix` columns. If provided, we will output a `lingroup_report` of the format `{base}.lingroup_report.tsv`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output includes just the subset of LIN positions that match the provided prefixes (selected from the full summary). The output will the `lingroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this lingroup and all descendents, and `num_bp_contained`, the estimated number of base pairs contained in this lingroup and all descendents. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. 
example output: ``` -LINgroup_name LINgroup_prefix percent_containment num_bp_contained +lingroup_name lingroup_prefix percent_containment num_bp_contained lg1 0;0;0 5.82 714000 lg2 1;0;0 5.05 620000 lg3 2;0;0 1.56 192000 @@ -728,7 +728,7 @@ lg3 1;0;1 0.65 80000 lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 ``` -LINgroup subpaths will be grouped in output, but exact ordering may change between runs. +lingroup subpaths will be grouped in output, but exact ordering may change between runs. ### `sourmash tax genome` - classify a genome using `gather` results diff --git a/doc/databases.md b/doc/databases.md index 847565eb4e..da77f84de8 100644 --- a/doc/databases.md +++ b/doc/databases.md @@ -20,7 +20,7 @@ Note that the SBT and LCA databases can be used with sourmash v3.5 and later, wh For each prepared database, we have also made taxonomic information available linking each genome with its assigned lineage (`GTDB` or `NCBI` as appropriate). For private databases, users can create their own `taxonomy` files: the critical columns are `ident`, containing the genome accession (e.g. `GCA_1234567.1`) and a column for each taxonomic rank, `superkingdom` to `species`. If a `strain` column is provided, it will also be used. -As of v4.8, we can also use `LIN` taxonomic information in tax commands that accept the `--LIN-taxonomy` flag. If used, `sourmash tax` commands will require a `LIN` column in the taxonomy file which should contain `;`-separated LINs, preferably with a standard number of positions (e.g. all 20 positions in length or all 10 positions in length). Some taxonomy commands also accept a `LINgroups` file, which is a two-column file (`LINgroup_name`, `LINgroup_prefix`) describing the name and `LIN` prefix of LINgroups to be used for taxonomic summarization. +As of v4.8, we can also use LIN taxonomic information in tax commands that accept the `--lins` flag. 
If used, `sourmash tax` commands will require a `lin` column in the taxonomy file which should contain `;`-separated LINs, preferably with a standard number of positions (e.g. all 20 positions in length or all 10 positions in length). Some taxonomy commands also accept a `lingroups` file, which is a two-column file (`lingroup_name`, `lingroup_prefix`) describing the name and LIN prefix of LINgroups to be used for taxonomic summarization. ## Downloading and using the databases diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 0cf613d55e..5d7affddff 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -60,7 +60,7 @@ def subparser(subparsers): help='continue past errors in file and taxonomy loading', ) subparser.add_argument( - '--LIN-taxonomy', action='store_true', default=False, + '--lins', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' 
) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index e27da46582..bb71d2f0d7 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -23,6 +23,8 @@ import sourmash from sourmash.logging import notify, print_results, error +from sourmash.cli.utils import add_rank_arg, check_rank, check_tax_outputs + def subparser(subparsers): @@ -67,58 +69,31 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "LINgroup_report"], + choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup_report"], help='choose output format(s)', ) - subparser.add_argument( - '-r', '--rank', choices=['strain','species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], - help='For non-default output formats: Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.' - ) subparser.add_argument( '-f', '--force', action = 'store_true', help='continue past errors in taxonomy database loading', ) subparser.add_argument( - '--LIN-taxonomy', action='store_true', default=False, + '--lins', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' ) subparser.add_argument( - '--LIN-position', type=int, default=None, - help='For non-default output formats: summarize taxonomy at this LIN position and above. Replaces "--rank" for standard taxonomy. Note that the taxonomy CSV must contain LIN with information at this position.' + '--lingroups', metavar='FILE', default=None, + help='CSV containing lingroup_name, lingroup_prefix. Will produce a "lingroup_report" file containing taxonomic summarization for each lingroup.' 
) - subparser.add_argument( - '--LINgroups', metavar='FILE', default=None, - help='CSV containing LINgroup_name, LINgroup_prefix. Will produce a "LINgroup_report" file containing taxonomic summarization for each LINgroup.' - ) - + add_rank_arg(subparser) def main(args): import sourmash try: if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") - # handle LIN options - if args.LIN_taxonomy: - if args.LIN_position: - args.rank = str(args.LIN_position) - if args.LINgroups: - if "LINgroup_report" not in args.output_format: - args.output_format.append("LINgroup_report") - elif "LINgroup_report" in args.output_format: - raise ValueError(f"Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report.") - elif args.LINgroups or "LINgroup_report" in args.output_format: - raise ValueError(f"Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report.") - - # handle output formats - if not args.rank: - if any(x in ["krona", "lineage_summary"] for x in args.output_format): - raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") - elif not args.output_format: - # change to "human" for 5.0 - args.output_format = ["csv_summary"] + if args.rank: + args.rank = check_rank(args) + args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary']) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index abd8b706f3..f857cbbf0b 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -47,7 +47,7 @@ def subparser(subparsers): help='continue past errors in file and taxonomy loading', ) subparser.add_argument( - 
'--LIN-taxonomy', action='store_true', default=False, + '--lins', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks.' ) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index d92c726b2d..f550c0e343 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -132,3 +132,54 @@ def add_num_arg(parser, default=0): '-n', '--num-hashes', '--num', metavar='N', type=check_num_bounds, default=default, help='num value should be between 50 and 50000' ) + + +def check_rank(args): + """ Check `--rank`/`--position`/`--lin-position` argument matches selected taxonomy.""" + standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + if args.lins: + if args.rank.isdigit(): + #if isinstance(args.rank, int): + return str(args.rank) + raise argparse.ArgumentTypeError(f"Invalid `--rank`/`--position` input: {args.rank}. `--lins` is specified. Rank must be an integer corresponding to a LIN position.") + elif args.rank in standard_ranks: + return args.rank + else: + raise argparse.ArgumentTypeError(f"Invalid `--rank`/`--position` input: {args.rank}. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'") + + +def add_rank_arg(parser): + parser.add_argument( + '-r', '--rank', + '--position', '--lin-position', + help="For non-default output formats: Summarize genome taxonomy at this rank (or LIN position) and above. \ + Note that the taxonomy CSV must contain lineage information at this rank (or LIN position). \ + Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" + ) + +def check_tax_outputs(args, rank_required = ["krona"]): + "Handle ouput format combinations" + # check that rank is passed for formats requiring rank. 
+ if not args.rank: + if any(x in rank_required for x in args.output_format): + raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.") + + # check that `--lins` is specified and `--lingroups` file exists if needed + if args.lins: + if args.lingroups: + if "lingroup_report" not in args.output_format: + args.output_format.append("lingroup_report") + elif "lingroup_report" in args.output_format: + raise ValueError(f"Must provide lingroups csv via '--lingroups' in order to output a lingroup_report.") + elif args.lingroups or "lingroup_report" in args.output_format: + raise ValueError(f"Must enable LIN taxonomy via '--lins' in order to use lingroups.") + + # check that only one output format is specified if writing to stdout + if len(args.output_format) > 1: + if args.output_base == "-": + raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + elif not args.output_format: + # change to "human" for 5.0 + args.output_format = ["csv_summary"] + + return args.output_format diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index d84344d6b1..9efafe6862 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -72,7 +72,7 @@ def metagenome(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force, LIN_taxonomy=args.LIN_taxonomy) + force=args.force, lins=args.lins) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -93,7 +93,7 @@ def metagenome(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - LIN_taxonomy=args.LIN_taxonomy, + lins=args.lins, ) except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -147,7 +147,7 @@ def metagenome(args): with 
FileOutput(summary_outfile) as out_fp: human_display_rank = args.rank or "species" - if args.LIN_taxonomy and not args.rank: + if args.lins and not args.rank: human_display_rank = query_gather_results[0].ranks[-1] # lowest rank tax_utils.write_human_summary(query_gather_results, out_fp, human_display_rank) @@ -168,9 +168,9 @@ def metagenome(args): tax_utils.write_output(header, kreport_results, out_fp, sep="\t", write_header=False) # write summarized --> LINgroup output tsv - if "LINgroup_report" in args.output_format: + if "lingroup_report" in args.output_format: try: - lingroups = tax_utils.read_lingroups(args.LINgroups) + lingroups = tax_utils.read_lingroups(args.lingroups) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -290,7 +290,7 @@ def annotate(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force, LIN_taxonomy=args.LIN_taxonomy) + force=args.force, lins=args.lins) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -308,7 +308,7 @@ def annotate(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - LIN_taxonomy=args.LIN_taxonomy) + lins=args.lins) if not query_gather_results: continue @@ -417,7 +417,7 @@ def summarize(args): force=args.force, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - LIN_taxonomy=args.LIN_taxonomy) + lins=args.lins) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) @@ -462,7 +462,7 @@ def summarize(args): # output in order of most common for lineage, count in lineage_counts.most_common(): rank = lineage[-1].rank - if args.LIN_taxonomy: + if args.lins: inf = LINLineageInfo(lineage=lineage) else: inf = RankLineageInfo(lineage=lineage) diff --git 
a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 2bd9050d40..be869bbb58 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -620,22 +620,22 @@ def read_lingroups(lingroup_csv): # check for empty file if not header: raise ValueError(f"Cannot read lingroups from '{lingroup_csv}'. Is file empty?") - if "LINgroup_prefix" not in header or "LINgroup_name" not in header: - raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'LINgroup_prefix', 'LINgroup_name'.") + if "lingroup_prefix" not in header or "lingroup_name" not in header: + raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'lingroup_prefix', 'lingroup_name'.") for n, row in enumerate(r): - lingroupD[row['LINgroup_prefix']] = row['LINgroup_name'] + lingroupD[row['lingroup_prefix']] = row['lingroup_name'] if n is None: - raise ValueError(f'No LINgroups loaded from {lingroup_csv}.') + raise ValueError(f'No lingroups loaded from {lingroup_csv}.') n_lg = len(lingroupD.keys()) - notify(f"Read {n+1} LINgroup rows and found {n_lg} distinct LINgroup prefixes.") + notify(f"Read {n+1} lingroup rows and found {n_lg} distinct lingroup prefixes.") return lingroupD def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, skip_idents = None, fail_on_missing_taxonomy=False, keep_full_identifiers=False, keep_identifier_versions=False, - LIN_taxonomy=False): + lins=False): "Load a single gather csv" if not seen_queries: seen_queries=set() @@ -660,13 +660,13 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force raise ValueError(f"Gather query {gatherRow.query_name} was found in more than one CSV. 
Cannot load from '{gather_csv}'.") taxres = TaxResult(raw=gatherRow, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions, - LIN_taxonomy=LIN_taxonomy) + lins=lins) taxres.get_match_lineage(tax_assignments=tax_assignments, skip_idents=skip_idents, fail_on_missing_taxonomy=fail_on_missing_taxonomy) # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info, LIN_taxonomy=LIN_taxonomy)) + this_querytaxres = gather_results.get(gatherRow.query_name, QueryTaxResult(taxres.query_info, lins=lins)) this_querytaxres.add_taxresult(taxres) gather_results[gatherRow.query_name] = this_querytaxres @@ -678,7 +678,7 @@ def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False, - keep_full_identifiers=False,keep_identifier_versions=False, LIN_taxonomy=False): + keep_full_identifiers=False,keep_identifier_versions=False, lins=False): ''' Load gather csvs, checking for empties and ids missing from taxonomic assignments. ''' @@ -697,7 +697,7 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon force=force, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions = keep_identifier_versions, fail_on_missing_taxonomy=fail_on_missing_taxonomy, - LIN_taxonomy=LIN_taxonomy) + lins=lins) except ValueError as exc: if force: if "found in more than one CSV" in str(exc): @@ -957,7 +957,7 @@ def __bool__(self): @classmethod def load(cls, filename, *, delimiter=',', force=False, - keep_full_identifiers=False, keep_identifier_versions=True, LIN_taxonomy=False): + keep_full_identifiers=False, keep_identifier_versions=True, lins=False): """ Load a taxonomy assignment CSV file into a LineageDB. 
@@ -995,15 +995,15 @@ def load(cls, filename, *, delimiter=',', force=False, elif 'name' in header and 'lineage' in header: return cls.load_from_gather_with_lineages(filename, force=force, - LIN_taxonomy=LIN_taxonomy) + lins=lins) else: header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') - if LIN_taxonomy and "LIN" not in header: - raise ValueError(f"'LIN' column not found: cannot read LIN taxonomy assignments from {filename}.") + if lins and "lin" not in header: + raise ValueError(f"'lin' column not found: cannot read LIN taxonomy assignments from {filename}.") - if not LIN_taxonomy: + if not lins: # is "strain" an available rank? if "strain" in header: include_strain=True @@ -1026,8 +1026,8 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): num_rows += 1 - if LIN_taxonomy: - lineageInfo = LINLineageInfo(lineage_str=row['LIN']) + if lins: + lineageInfo = LINLineageInfo(lineage_str=row['lin']) if n_pos is not None: if lineageInfo.n_lin_positions != n_pos: raise ValueError(f"For taxonomic summarization, all LIN assignments must use the same number of LIN positions.") @@ -1056,7 +1056,7 @@ def load(cls, filename, *, delimiter=',', force=False, else: assignments[ident] = lineage - if not LIN_taxonomy: + if not lins: if lineage[-1].rank == 'species': n_species += 1 elif lineage[-1].rank == 'strain': @@ -1067,7 +1067,7 @@ def load(cls, filename, *, delimiter=',', force=False, @classmethod - def load_from_gather_with_lineages(cls, filename, *, force=False, LIN_taxonomy=False): + def load_from_gather_with_lineages(cls, filename, *, force=False, lins=False): """ Load an annotated gather-with-lineages CSV file produced by 'tax annotate' into a LineageDB. 
@@ -1101,7 +1101,7 @@ def load_from_gather_with_lineages(cls, filename, *, force=False, LIN_taxonomy=F name = row['name'] ident = get_ident(name) - if LIN_taxonomy: + if lins: lineageInfo = LINLineageInfo(lineage_str=row['lineage']) else: lineageInfo = RankLineageInfo(lineage_str= row['lineage']) @@ -1575,7 +1575,7 @@ class TaxResult: skipped_ident: bool = False missed_ident: bool = False match_lineage_attempted: bool = False - LIN_taxonomy: bool = False + lins: bool = False def __post_init__(self): self.get_ident() @@ -1593,7 +1593,7 @@ def __post_init__(self): self.f_unique_to_query = float(self.raw.f_unique_to_query) self.f_unique_weighted = float(self.raw.f_unique_weighted) self.unique_intersect_bp = int(self.raw.unique_intersect_bp) - if self.LIN_taxonomy: + if self.lins: self.lineageInfo = LINLineageInfo() else: self.lineageInfo = RankLineageInfo() @@ -1618,7 +1618,7 @@ def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_t else: lin = tax_assignments.get(self.match_ident) if lin: - if self.LIN_taxonomy: + if self.lins: self.lineageInfo = LINLineageInfo(lineage = lin) else: self.lineageInfo = RankLineageInfo(lineage = lin) @@ -1737,8 +1737,8 @@ def as_lingroup_dict(self, query_info, lg_name): # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - sD["LINgroup_prefix"] = self.lineage.display_lineage() - sD["LINgroup_name"] = lg_name + sD["lingroup_prefix"] = self.lineage.display_lineage() + sD["lingroup_name"] = lg_name return sD @@ -1799,7 +1799,7 @@ class QueryTaxResult: Contains methods for formatting results for different outputs. 
""" query_info: QueryInfo # initialize with QueryInfo dataclass - LIN_taxonomy: bool = False + lins: bool = False def __post_init__(self): self.query_name = self.query_info.query_name # for convenience @@ -1838,7 +1838,7 @@ def _init_classification_results(self): self.krona_header = [] def is_compatible(self, taxresult): - return taxresult.query_info == self.query_info and taxresult.LIN_taxonomy == self.LIN_taxonomy + return taxresult.query_info == self.query_info and taxresult.lins == self.lins @property def ascending_ranks(self): @@ -1931,7 +1931,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): self.total_bp_classified[rank] += bp_intersect_at_rank # record unclassified - if self.LIN_taxonomy: + if self.lins: lineage = LINLineageInfo() else: lineage = RankLineageInfo() @@ -2131,7 +2131,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref Keep LCA paths in order as much as possible. """ self.check_summarization() - header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained"] + header = ["lingroup_name", "lingroup_prefix", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") diff --git a/tests/test-data/tax/test.LIN-taxonomy.csv b/tests/test-data/tax/test.LIN-taxonomy.csv index 7185e5679c..1544b78994 100644 --- a/tests/test-data/tax/test.LIN-taxonomy.csv +++ b/tests/test-data/tax/test.LIN-taxonomy.csv @@ -1,4 +1,4 @@ -ident,LIN +ident,lin GCF_001881345.1,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 GCF_009494285.1,1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 GCF_013368705.1,2;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 diff --git a/tests/test_tax.py b/tests/test_tax.py index 9f4b1ebe13..086eee8c70 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -405,7 +405,8 @@ def test_metagenome_no_rank_lineage_summary(runtmp): with 
pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary') - assert "Rank (--rank) is required for krona and lineage_summary output formats." in str(exc.value) + print(str(exc.value)) + assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value) def test_metagenome_no_rank_krona(runtmp): @@ -415,7 +416,8 @@ def test_metagenome_no_rank_krona(runtmp): with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') - assert "Rank (--rank) is required for krona and lineage_summary output formats." in str(exc.value) + print(str(exc.value)) + assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value) def test_genome_no_rank_krona(runtmp): @@ -2204,7 +2206,7 @@ def test_annotate_0_LIN(runtmp): csvout = runtmp.output("test1.gather.with-lineages.csv") out_dir = os.path.dirname(csvout) - c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir, "--LIN-taxonomy") + c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir, "--lins") print(c.last_result.status) print(c.last_result.out) @@ -3323,7 +3325,7 @@ def test_tax_summarize_LINS(runtmp): taxfile = utils.get_test_data('tax/test.LIN-taxonomy.csv') lineage_csv = runtmp.output('annotated-lin.csv') - taxdb = tax_utils.LineageDB.load(taxfile, LIN_taxonomy=True) + taxdb = tax_utils.LineageDB.load(taxfile, lins=True) with open(lineage_csv, 'w', newline="") as fp: w = csv.writer(fp) w.writerow(['name', 'lineage']) @@ -3333,7 +3335,7 @@ def test_tax_summarize_LINS(runtmp): print(linstr) w.writerow([k, linstr]) - runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv', '--LIN-taxonomy') + runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv', 
'--lins') out = runtmp.last_result.out err = runtmp.last_result.err @@ -3369,7 +3371,7 @@ def test_metagenome_LIN(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LIN-taxonomy') + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins') print(c.last_result.status) print(c.last_result.out) @@ -3401,8 +3403,8 @@ def test_metagenome_LIN(runtmp): assert "test1,19,0.796,unclassified,md5,test1.sig,0.869,3990000,,0" in c.last_result.out -def test_metagenome_LIN_LINgroups(runtmp): - # test LINgroups output +def test_metagenome_LIN_lingroups(runtmp): + # test lingroups output c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3410,7 +3412,7 @@ def test_metagenome_LIN_LINgroups(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('LINgroup_prefix,LINgroup_name\n') + out.write('lingroup_prefix,lingroup_name\n') out.write('0;0;0,lg1\n') out.write('1;0;0,lg2\n') out.write('2;0;0,lg3\n') @@ -3419,7 +3421,7 @@ def test_metagenome_LIN_LINgroups(runtmp): out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '--LINgroups', lg_file) + '--lins', '--lingroups', lg_file) print(c.last_result.status) print(c.last_result.out) @@ -3427,8 +3429,8 @@ def test_metagenome_LIN_LINgroups(runtmp): assert c.last_result.status == 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err - assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." in c.last_result.err - assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained" in c.last_result.out + assert "Read 5 lingroup rows and found 5 distinct lingroup prefixes." 
in c.last_result.err + assert "lingroup_name lingroup_prefix percent_containment num_bp_contained" in c.last_result.out assert "lg1 0;0;0 5.82 714000" in c.last_result.out assert "lg2 1;0;0 5.05 620000" in c.last_result.out assert "lg3 2;0;0 1.56 192000" in c.last_result.out @@ -3443,7 +3445,7 @@ def test_metagenome_LIN_human_summary_no_lin_position(runtmp): tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '-F', "human") + '--lins', '-F', "human") print(c.last_result.status) print(c.last_result.out) @@ -3467,7 +3469,7 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp): tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '-F', "human", '--LIN-position', '5') + '--lins', '-F', "human", '--lin-position', '5') print(c.last_result.status) print(c.last_result.out) @@ -3491,7 +3493,7 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp): tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '-F', "krona", '--LIN-position', '5') + '--lins', '-F', "krona", '--lin-position', '5') print(c.last_result.status) print(c.last_result.out) @@ -3507,7 +3509,7 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp): assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out -def test_metagenome_LIN_LINgroups_empty_lg_file(runtmp): +def test_metagenome_LIN_lingroups_empty_lg_file(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3519,7 +3521,7 @@ def test_metagenome_LIN_LINgroups_empty_lg_file(runtmp): with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '--LINgroups', lg_file) + '--lins', '--lingroups', lg_file) 
print(c.last_result.status) print(c.last_result.out) @@ -3530,7 +3532,7 @@ def test_metagenome_LIN_LINgroups_empty_lg_file(runtmp): assert f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err -def test_metagenome_LIN_LINgroups_bad_cli_inputs(runtmp): +def test_metagenome_LIN_lingroups_bad_cli_inputs(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3542,26 +3544,27 @@ def test_metagenome_LIN_LINgroups_bad_cli_inputs(runtmp): with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '-F', "LINgroup_report") + '--lins', '-F', "lingroup_report") print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must provide LINgroup csv via '--LINgroups' in order to output a LINgroup_report." in c.last_result.err + assert "Must provide lingroups csv via '--lingroups' in order to output a lingroup_report." in c.last_result.err with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "LINgroup_report") + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "lingroup_report") print(c.last_result.err) assert c.last_result.status != 0 - assert "Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report." in c.last_result.err + assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err + # assert "Must enable LIN taxonomy via '--lins' in order to output a lingroup_report." 
in c.last_result.err with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--LINgroups', lg_file) + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroups', lg_file) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must enable LIN taxonomy via '--LIN-taxonomy' in order to output a LINgroup_report." in c.last_result.err + assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err def test_metagenome_mult_outputs_stdout_fail(runtmp): @@ -3594,7 +3597,7 @@ def test_genome_mult_outputs_stdout_fail(runtmp): assert f"Writing to stdout is incompatible with multiple output formats ['lineage_csv', 'csv_summary']" in c.last_result.err -def test_metagenome_LIN_LINgroups_lg_only_header(runtmp): +def test_metagenome_LIN_lingroups_lg_only_header(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.v450.csv') @@ -3602,11 +3605,11 @@ def test_metagenome_LIN_LINgroups_lg_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('LINgroup_prefix,LINgroup_name\n') + out.write('lingroup_prefix,lingroup_name\n') with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--LIN-taxonomy', '--LINgroups', lg_file) + '--lins', '--lingroups', lg_file) print(c.last_result.status) print(c.last_result.out) @@ -3614,4 +3617,4 @@ def test_metagenome_LIN_LINgroups_lg_only_header(runtmp): assert c.last_result.status != 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err - assert f"No LINgroups loaded from {lg_file}" in c.last_result.err + assert f"No lingroups loaded from {lg_file}" in c.last_result.err diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 8d24f44382..b613880184 100644 --- a/tests/test_tax_utils.py +++ 
b/tests/test_tax_utils.py @@ -58,7 +58,7 @@ def make_TaxResult(gather_dict=None, taxD=None, keep_full_ident=False, keep_iden """Make TaxResult from artificial gather row (dict)""" gRow = make_GatherRow(gather_dict) taxres = TaxResult(raw=gRow, keep_full_identifiers=keep_full_ident, - keep_identifier_versions=keep_ident_version, LIN_taxonomy=LIN) + keep_identifier_versions=keep_ident_version, lins=LIN) if taxD is not None: taxres.get_match_lineage(tax_assignments=taxD, skip_idents=skip_idents) return taxres @@ -77,7 +77,7 @@ def make_QueryTaxResults(gather_info, taxD=None, single_query=False, keep_full_i # add to matching QueryTaxResult or create new one if not this_querytaxres or not this_querytaxres.is_compatible(taxres): # get existing or initialize new - this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info, LIN_taxonomy=LIN)) + this_querytaxres = gather_results.get(query_name, QueryTaxResult(taxres.query_info, lins=LIN)) this_querytaxres.add_taxresult(taxres) # print('missed_ident?', taxres.missed_ident) gather_results[query_name] = this_querytaxres @@ -170,11 +170,11 @@ def test_SummarizedGatherResult_LINs(): lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", + assert lgD == {'lingroup_name': "lg_name", "lingroup_prefix": "0;0;1", 'percent_containment': '30.00', 'num_bp_contained': "600"} lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", + assert lgD == {'lingroup_name': "lg_name", "lingroup_prefix": "0;0;1", 'percent_containment': '30.00', 'num_bp_contained': "600"} with pytest.raises(ValueError) as exc: sgr.as_kreport_dict(query_info=qInf) @@ -600,7 +600,7 @@ def test_load_taxonomy_csv(): def test_load_taxonomy_csv_LIN(): taxonomy_csv = utils.get_test_data('tax/test.LIN-taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], 
LIN_taxonomy=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], lins=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] #assert list(tax_assign.keys()) == ["GCF_000010525.1", "GCF_000007365.1", "GCF_000007725.1", "GCF_000009605.1", "GCF_000021065.1", "GCF_000021085.1"] @@ -612,8 +612,8 @@ def test_load_taxonomy_csv_LIN(): def test_load_taxonomy_csv_LIN_fail(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') with pytest.raises(ValueError) as exc: - MultiLineageDB.load([taxonomy_csv], LIN_taxonomy=True) - assert f"'LIN' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." in str(exc.value) + MultiLineageDB.load([taxonomy_csv], lins=True) + assert f"'lin' column not found: cannot read LIN taxonomy assignments from {taxonomy_csv}." in str(exc.value) def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): @@ -631,7 +631,7 @@ def test_load_taxonomy_csv_LIN_mismatch_in_taxfile(runtmp): tax21.append(taxline) mm.write("\n".join(tax21)) with pytest.raises(ValueError) as exc: - MultiLineageDB.load([mimatchLIN_csv], LIN_taxonomy=True) + MultiLineageDB.load([mimatchLIN_csv], lins=True) assert "For taxonomic summarization, all LIN assignments must use the same number of LIN positions." 
in str(exc.value) @@ -2791,14 +2791,14 @@ def test_make_lingroup_results(): header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) print(header) - assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained'] + assert header == ['lingroup_name', 'lingroup_prefix', 'percent_containment', 'num_bp_contained'] # order may change, just check that each lg entry is present in list of results lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', - 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'} + 'lingroup_prefix': '1', 'lingroup_name': 'lg1'} lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', - 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'} + 'lingroup_prefix': '1;0', 'lingroup_name': 'lg2'} lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', - 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'} + 'lingroup_prefix': '1;1', 'lingroup_name': 'lg3'} assert lg1 in lgD assert lg2 in lgD assert lg3 in lgD @@ -2818,7 +2818,7 @@ def test_make_lingroup_results_fail_pre_v450(): def test_read_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('LINgroup_prefix,LINgroup_name\n') + out.write('lingroup_prefix,lingroup_name\n') out.write('1,lg1\n') out.write('1;0,lg2\n') out.write('1;1,lg3\n') @@ -2839,11 +2839,11 @@ def test_read_lingroups_empty_file(runtmp): def test_read_lingroups_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('LINgroup_prefix,LINgroup_name\n') + out.write('lingroup_prefix,lingroup_name\n') with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) - assert f"No LINgroups loaded from {lg_file}" in str(exc) + assert f"No lingroups loaded from {lg_file}" in str(exc) def test_read_lingroups_bad_header(runtmp): @@ -2853,7 +2853,7 @@ def test_read_lingroups_bad_header(runtmp): with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) - assert 
f"'{lg_file}' must contain the following columns: 'LINgroup_prefix', 'LINgroup_name'." in str(exc) + assert f"'{lg_file}' must contain the following columns: 'lingroup_prefix', 'lingroup_name'." in str(exc) def test_LineageTree_init(): From c57f688a33cf14431ea90c79cf474d6ba8aebfd9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 14:40:53 -0800 Subject: [PATCH 49/78] allow --lins or --lin-taxonomy --- src/sourmash/cli/tax/annotate.py | 2 +- src/sourmash/cli/tax/metagenome.py | 2 +- src/sourmash/cli/tax/summarize.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index 5d7affddff..501a02fd58 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -60,7 +60,7 @@ def subparser(subparsers): help='continue past errors in file and taxonomy loading', ) subparser.add_argument( - '--lins', action='store_true', default=False, + '--lins', '--lin-taxonomy', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' ) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index bb71d2f0d7..31b2bab2e5 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -77,7 +77,7 @@ def subparser(subparsers): help='continue past errors in taxonomy database loading', ) subparser.add_argument( - '--lins', action='store_true', default=False, + '--lins', '--lin-taxonomy', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' 
) subparser.add_argument( diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index f857cbbf0b..06a109e95c 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -47,7 +47,7 @@ def subparser(subparsers): help='continue past errors in file and taxonomy loading', ) subparser.add_argument( - '--lins', action='store_true', default=False, + '--lins', '--lin-taxonomy', action='store_true', default=False, help='use LIN taxonomy in place of standard taxonomic ranks.' ) From f21eb7c4044471de1c918b260011c6c68246a328 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 16:46:25 -0800 Subject: [PATCH 50/78] add demo as tutorial --- doc/tutorial-lin-taxonomy.md | 451 +++++++++++++++++++++++++++++++++++ doc/tutorials.md | 4 +- 2 files changed, 454 insertions(+), 1 deletion(-) create mode 100644 doc/tutorial-lin-taxonomy.md diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md new file mode 100644 index 0000000000..40e367dcfc --- /dev/null +++ b/doc/tutorial-lin-taxonomy.md @@ -0,0 +1,451 @@ +# Analyzing Metagenome Composition using the LIN taxonomic framework + +Tessa Pierce Ward + +March 2023 +_Requires sourmash v4.8+_ + +--- + +```{contents} + :depth: 2 +``` + +In this tutorial, we'll use sourmash gather to analyze metagenomes using the [LIN taxonomic framework](https://dl.acm.org/doi/pdf/10.1145/3535508.3545546). +Specifically, we will analyze plant metagenomes with a low-level pathogen spike-in. +The goal is to see if we can correctly assign the pathogen sequence to its LINgroup, which includes +all known pathogenic strains. + +- `barcode1` - highest spike-in (75 picogram/microliter pathogen DNA) +- `barcode3` - lower spike-in (7.5 picogram/microliter pathogen DNA) +- `barcode5` - no spike-in + +The pathogen is `Ralstonia solanacearum` in the `Phylum IIB sequevar 1` group. + +## Install sourmash + +First, we need to install the software! 
We'll use conda/mamba to do this. + +The below command installs [sourmash](http://sourmash.readthedocs.io/). + +Install the software: +``` +# create a new environment +mamba create -n smash -y -c conda-forge -c bioconda sourmash +``` + +then activate the conda environment: +``` +conda activate smash +``` + +> Victory conditions: your prompt should start with +> `(smash) ` +> and you should now be able to run `sourmash` and have it output usage information!! + +## Create a working subdirectory + +Make a directory named `smash_lin`, change into it: +``` +mkdir -p ~/smash_lin +cd ~/smash_lin +``` + +Now make a couple of useful folders: +``` +mkdir -p inputs +mkdir -p databases +``` + +## Download relevant data + +### First, download a database and taxonomic information + +Here, we know the spike-in is a pathogenic sequevar of Ralstonia. We will download a database +containing signatures of 27 Ralstonia genomes (pathogenic and not) and the corresponding taxonomic and lingroup information. + +``` +# database +curl -JLO https://osf.io/vxsta/download +mv ralstonia*.zip ./databases/ralstonia.zip + +# taxonomy csv +curl -JLO https://raw.githubusercontent.com/bluegenes/2023-demo-sourmash-LIN/main/databases/ralstonia-lin.taxonomy.GCA-GCF.csv +mv ralstonia-lin.taxonomy.GCA-GCF.csv ./databases + +# lingroup csv +curl -JLO https://raw.githubusercontent.com/bluegenes/2023-demo-sourmash-LIN/main/inputs/ralstonia.lingroups.csv +mv ralstonia.lingroups.csv ./databases + +ls databases # look at the database files +``` + +### Next, download pre-made sourmash signatures made from the input metagenomes + +``` +# download barcode 1 sig +curl -JLO https://osf.io/ujntr/download +mv barcode1_22142.sig.zip ./inputs/ + +# download barcode 3 signature +curl -JLO https://osf.io/2h9wx/download +mv barcode3_31543.sig.zip ./inputs + +# download barcode 5 signature +curl -JLO https://osf.io/k8nw5/download +mv barcode5_36481.sig.zip ./inputs + +# look at available input files +ls inputs +``` + +## Start
with the `barcode1` (highest spike-in) sample + +### First, let's look at the metagenome signature. + +By running `sourmash sig fileinfo`, we can see information on the signatures available within the zip file. + +Here, you can see I've generated the metagenome signature with `scaled=1000` and built two ksizes, `k=31` and `k=51`. + +Run: +``` +sourmash sig fileinfo ./inputs/barcode1_22142.sig.zip +``` + +In the output, you should see: +``` +** loading from './inputs/barcode1_22142.sig.zip' +path filetype: ZipFileLinearIndex +location: /home/jovyan/smash_lin/inputs/barcode1_22142.sig.zip +is database? yes +has manifest? yes +num signatures: 2 +total hashes: 914328 +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000, abund 426673 total hashes + 1 sketches with DNA, k=51, scaled=1000, abund 487655 total hashes +``` + +### We can also look at the database + +Here, you can see I've generated the database with `scaled=1000` and built three ksizes, `k=21`, `k=31` and `k=51`. + +Run: +``` +sourmash sig fileinfo ./databases/ralstonia.zip +``` + +In the output, you should see: + +``` +** loading from './databases/ralstonia.zip' +path filetype: ZipFileLinearIndex +location: /home/jovyan/databases/ralstonia.zip +is database? yes +has manifest? yes +num signatures: 81 +** examining manifest... +total hashes: 445041 +summary of sketches: + 27 sketches with DNA, k=21, scaled=1000, abund 148324 total hashes + 27 sketches with DNA, k=31, scaled=1000, abund 148111 total hashes + 27 sketches with DNA, k=51, scaled=1000, abund 148606 total hashes +``` +There are a lot of things to digest in this output, but the two main ones are: +* there are 27 genomes represented in this database, each of which is sketched at k=21, k=31, and k=51 +* this database represents ~445 *million* k-mers (multiply number of hashes by the scaled number) + + +## Run sourmash gather using ksize 51 + +Now let's run `sourmash gather` to find the closest reference genome(s) in the database.
+If you want to read more about what, exactly, sourmash is doing, please see [Lightweight compositional analysis of metagenomes with FracMinHash and minimum metagenome covers](https://www.biorxiv.org/content/10.1101/2022.01.11.475838v2), Irber et al., 2022. + +Run: +``` +query="inputs/barcode1_22142.sig.zip" +database="databases/ralstonia.zip" + +gather_csv_output="barcode1_22141.k51.gather.csv" + +sourmash gather $query $database -k 51 -o $gather_csv_output +``` + +You should see the following output: +``` +selecting specified query k=51 +loaded query: barcode1_22142... (k=51, DNA) +--ading from 'databases/ralstonia.zip'... +loaded 81 total signatures from 1 locations. +after selecting signatures compatible with search, 27 remain. +Starting prefetch sweep across databases. + +Found 7 signatures via prefetch; now doing gather. + +overlap p_query p_match avg_abund +--------- ------- ------- --------- +105.0 kbp 0.0% 2.0% 1.0 GCA_002251655.1 Ralstonia solanacear... +found less than 50.0 kbp in common. => exiting + +found 1 matches total; +the recovered matches hit 0.0% of the abundance-weighted query. +the recovered matches hit 0.0% of the query k-mers (unweighted). +``` + +We only had one match, and it was a very small percentage of the total dataset. This is expected, +since the dataset is a plant metagenome with a small `Ralstonia` spike-in. + +## Add taxonomic information and summarize up lingroups + +`sourmash gather` finds the smallest set of reference genomes that contains all the known information (k-mers) in the metagenome. +In most cases, `gather` will find many metagenome matches. Here, we're only looking for `Ralstonia` matches and we only have a +single match. Regardless, let's use `sourmash tax metagenome` to add taxonomic information and see if we've correctly assigned the pathogenic sequence. + +### First, let's look at the relevant taxonomy files. + +These commands will show the first few lines of each file. 
If you prefer, you can look at a more human-friendly view by opening the files in a spreadsheet program. + +- **taxonomy_csv:** `databases/ralstonia-lin.taxonomy.GCA-GCF.csv` + - the essential columns are `lin` (`14;1;0;...`) and `ident` (`GCF_00`...) +- **lingroups information:** `databases/ralstonia.lingroups.csv` + - both columns are essential (`lingroup_name`, `lingroup_prefix`) + + +Look at the taxonomy file: +``` +head -n 5 databases/ralstonia-lin.taxonomy.GCA-GCF.csv +``` + +You should see: +``` +lin,species,strain,filename,accession,ident +14;1;0;0;0;0;0;0;0;0;6;0;1;0;1;0;0;0;0;0,Ralstonia solanacearum,OE1_1,GCF_001879565.1_ASM187956v1_genomic.fna,GCF_001879565.1,GCF_001879565.1 +14;1;0;0;0;0;0;0;0;0;6;0;1;0;0;0;0;0;0;0,Ralstonia solanacearum,PSS1308,GCF_001870805.1_ASM187080v1_genomic.fna,GCF_001870805.1,GCF_001870805.1 +14;1;0;0;0;0;0;0;0;0;2;1;0;0;0;0;0;0;0;0,Ralstonia solanacearum,FJAT_1458,GCF_001887535.1_ASM188753v1_genomic.fna,GCF_001887535.1,GCF_001887535.1 +14;1;0;0;0;0;0;0;0;0;2;0;0;4;4;0;0;0;0;0,Ralstonia solanacearum,Pe_13,GCF_012062595.1_ASM1206259v1_genomic.fna,GCF_012062595.1,GCF_012062595.1 +``` +> The key columns are: +> - `ident`, containing identifiers matching the database sketches +> - `lin`, containing the LIN assignment for each genome. + +Now, let's look at the lingroups file: +``` +head -n5 databases/ralstonia.lingroups.csv +``` + +You should see: +``` +lingroup_name,lingroup_prefix +Phyl II,14;1;0;0;0;3;0 +Phyl IIA,14;1;0;0;0;3;0;1;0;0 +Phyl IIB,14;1;0;0;0;3;0;0 +Phyl IIB seq1 and seq2,14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 +``` +> Here, we have two columns: +> - `lingroup_name` - the name for each lingroup. +> - `lingroup_prefix` - the LIN prefix corresponding to each group. + + +### Now, run `sourmash tax metagenome` to integrate taxonomic information into `gather` results + +Using the `gather` output we generated above, we can integrate taxonomic information and summarize up "ranks" (LIN positions).
We can produce several different types of outputs, including a `lingroup_report`. + +`lingroup_report` format summarizes the taxonomic information at the provided `lingroup` levels, and produces a report with 4 columns: +- `lingroup_name` (from lingroups file) +- `lingroup_prefix` (from lingroups file) +- `percent_containment` - total % of the file matched to this lingroup +- `num_bp_contained` - estimated number of bp matched to this lingroup + +> Since sourmash assigns all k-mers to individual genomes, no reads/base pairs are "assigned" to higher taxonomic ranks or lingroups (as with Kraken-style LCA). Here, "percent_containment" and "num_bp_contained" is calculated by summarizing the assignments made to all genomes in a lingroup. This is akin to the "contained" information in Kraken-style reports. + +Run `tax metagenome`: +``` +gather_csv_output="barcode1_22141.k51.gather.csv" +taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" +lingroups_csv="databases/ralstonia.lingroups.csv" + +sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ + --lins --lingroups $lingroups_csv \ + -F lingroup_report +``` + +You should see: +``` +loaded 1 gather results from 'barcode1_22141.k51.gather.csv'. +loaded results for 1 queries from 1 gather CSVs +Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +Read 11 lingroup rows and found 11 distinct lingroup prefixes. +``` + +and the results: +``` +lingroup_name lingroup_prefix percent_containment num_bp_contained +Phyl II 14;1;0;0;0;3;0 0.02 108000 +Phyl IIB 14;1;0;0;0;3;0;0 0.02 108000 +Phyl IIB seq1 and seq2 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 0.02 108000 +IIB seq1 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 0.02 108000 +``` +:::info +Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is actually the pathogenic lingroup that was spiked in, YAY! Note that the other groups in the output all contain this group. 
+::: + + +#### Now output the lingroup_report to a file (instead of to the terminal) + +use `-o` to provide an output basename for taxonomic output. + +``` +gather_csv_output="barcode1_22141.k51.gather.csv" +taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" +lingroups_csv="databases/ralstonia.lingroups.csv" + +sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ + --lins --lingroups $lingroups_csv \ + -F lingroup_report -o "barcode1" +``` + +> You should see `saving 'lingroup_report' output to 'barcode1.lingroup_report.tsv'` in the output. + +#### Optionally, output multiple output formats + +You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. Note that `lingroup_report` will be generated automatically if you specify the `--lingroups` file. + +Run: +``` +gather_csv_output="barcode1_22141.k51.gather.csv" +taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" +lingroups_csv="databases/ralstonia.lingroups.csv" + +sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ + --lins --lingroups $lingroups_csv \ + -F lingroup_report csv_summary -o "barcode1" +``` + + +You should see the following in the output: + +``` +saving 'csv_summary' output to 'barcode1.summarized.csv'. +saving 'lingroup_report' output to 'barcode1.lingroup_report.txt'. +``` + +The `csv_summary` format is the **full** summary of this sample, e.g. the summary at each taxonomic rank (LIN position). It also includes an entry with the `unclassified` portion at each rank. + +> Note: Multiple output formats require the `-o` `--output-base` to be specified, as each must be written to a file. 
+ +Abbreviated Results, `barcode1`: + +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | +| **barcode1** | 51 | 1000 | 105 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **barcode1** | 31 | 1000 | 173 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | + + +### Now run with `barcode3` sample + +#### sourmash gather +Run: +``` +query="inputs/barcode3_31543.sig.zip" +database="databases/ralstonia.zip" + +gather_csv_output="barcode3_31543.dna.k51.gather.csv" + +sourmash gather $query $database -k 51 -o $gather_csv_output +``` + +#### we found no matches! But, we can lower the detection threshold: + +``` +query="inputs/barcode3_31543.sig.zip" +database="databases/ralstonia.zip" +gather_csv_output="barcode3_31543.k51.gather.csv" + +# use a 10kb detection threshold +sourmash gather $query $database -k 51 --threshold-bp 10000 -o $gather_csv_output +``` + +We have a match but it's not the right one! If you run `sourmash tax metagenome` on this output, you'll see that this genome belongs to the `Phyl IIB seq2` group, which is a sister group to the correct `Phyl IIB seq1` group that we expected. + + +### Dig in a bit to see what might have happened + +`sourmash gather` has two steps: first, it runs a `prefetch` to find ALL genome matches, and then uses a greedy approach to select the smallest set of genomes that contain ('cover') all known sequence content. Let's run `prefetch` independently so we can look at the results of the first step. Here, let's use `--threshold-bp 0` to get all possible matches. 
+ +Run: +``` +query="inputs/barcode3_31543.sig.zip" +prefetch_csv_output="barcode3_31543.k51.prefetch.csv" +database="databases/ralstonia.zip" + +sourmash prefetch $query $database -k 51 --threshold-bp 0 -o $prefetch_csv_output +``` + +You should see: +``` +selecting specified query k=51 +loaded query: barcode3_31543... (k=51, DNA) +query sketch has scaled=1000; will be dynamically downsampled as needed. +--tal of 10 matching signatures so far.tonia.zip' +loaded 81 total signatures from 1 locations. +after selecting signatures compatible with search, 27 remain. +-- +total of 15 matching signatures. +saved 15 matches to CSV file 'barcode3_31543.k51.prefetch.csv' +of 487043 distinct query hashes, 12 were found in matches above threshold. +a total of 487031 query hashes remain unmatched. +final scaled value (max across query and all matches) is 1000 +``` + +#### Open the `barcode3_31543.k51.prefetch.csv` file to see what it looks like + +> Use a spreadsheet program on your computer or use `less -S barcode3_31543.k51.prefetch.csv` to see the file on the terminal. If using `less`, hit `q` when you want to exit and return to your terminal prompt. + +The first column contains the estimated number of base pairs matched between our query and each matching reference genomes. You'll notice there are four genomes that match 12kb of sequence, one of which is the "correct" genome (with the lineage we were expecting). + +**What is happening here?** + +When faced with equally good matches, `sourmash gather` makes a random choice about which genome to assign these k-mers to. This happens primarily with highly similar genomes and/or very small sequence matches. If this happens and you need to distinguish between these genomes, we recommend trying a lower scaled value. + +To see if we could robustly assign the correct sequevar for `barcode3` using a higher resolution sketch, I also ran `gather` using scaled=100. 
+ +Abbreviated results, `barcode3`: + + +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | +| **barcode3** | 51 | 1000 | 12kb | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | +| **barcode3** | 31 | 1000 | 28 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **barcode3** | 51 | 100 | 14.8 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **barcode3** | 31 | 100 | 21.1 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | + + + +## barcode5 + +You can also run the `barcode5` file using the same commands as above and see that no matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. + +I then ran this file at higher resolution to see how the results changed. In each case, very few k-mers matched and we could not robustly identify the Ralstonia genome or lingroup. As it turns out, `barcode5` does not have a `Ralstonia` spike-in, so this is a good thing! 
+ +Abbreviated results, `barcode5`: + +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | +| **barcode5** | 51 | 1000 | 1 kbp | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | +| **barcode5** | 31 | 1000 | 0 | N/A | | | +| **barcode5** | 51 | 100 | 300bp | all | | | +| **barcode5** | 31 | 100 | 1.2 kb | all | | | +| **barcode5** | 51 | 10 | 120 bp | all | | | +| **barcode5** | 31 | 10 | 670 bp | all | | | +| **barcode5** | 51 | 5 | 150 bp | all | | | +| **barcode5** | 31 | 5 | 500 bp | all | | | + + +**Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** + +## Summary and concluding thoughts + +The LIN taxonomic framework may be useful distinguishing groups below the species level. +We can now use LINs and lingroups with `sourmash tax metagenome`. For low level matches, the gather greedy +approach can struggle. We are working on ways to better warn users about this behavior and welcome +feedback, issues, or suggestions on our [issue tracker](https://github.com/sourmash-bio/sourmash/issues/new). \ No newline at end of file diff --git a/doc/tutorials.md b/doc/tutorials.md index bb201fbca4..c089822574 100644 --- a/doc/tutorials.md +++ b/doc/tutorials.md @@ -13,7 +13,7 @@ X and Linux. They require about 5 GB of disk space and 5 GB of RAM. ## Background and details -These next three tutorials are all notebooks that you can view, run +These next four tutorials are all notebooks that you can view, run yourself, or run interactively online via the [binder](https://mybinder.org) service. 
@@ -23,6 +23,8 @@ yourself, or run interactively online via the * [Working with private collections of signatures.](sourmash-collections.md) +* [Using `sourmash taxonomy` with the LIN taxonomic framework.](tutorial-lin-taxonomy.md) + ## More information For more information on analyzing sequencing data with sourmash, check out our [longer tutorial](tutorial-long.md). From 68e9afa91172a8fa6b4673e417c198c8fe435b28 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 16:53:12 -0800 Subject: [PATCH 51/78] add data ref --- doc/tutorial-lin-taxonomy.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 40e367dcfc..9df4ccbd69 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -3,7 +3,8 @@ Tessa Pierce Ward March 2023 -_Requires sourmash v4.8+_ + +requires sourmash v4.8+ --- @@ -20,7 +21,9 @@ all known pathogenic strains. - `barcode3` - lower spike-in (7.5 picogram/microliter pathogen DNA) - `barcode5` - no spike-in -The pathogen is `Ralstonia solanacearum` in the `Phylum IIB sequevar 1` group. +The pathogen is `Ralstonia solanacearum` in the `Phylum IIB sequevar 1` group. + +This data is courtesy of [The Laboratory of Plant & Atmospheric Microbiology & (Meta)Genomics](https://sites.google.com/vt.edu/lab-vinatzer/home). ## Install sourmash From b89f826be3d05b34a500cc8bedd09b6581431bfa Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 17:27:22 -0800 Subject: [PATCH 52/78] fix typo --- doc/tutorial-lin-taxonomy.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 9df4ccbd69..e4d24f084a 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -288,9 +288,9 @@ Phyl IIB 14;1;0;0;0;3;0;0 0.02 108000 Phyl IIB seq1 and seq2 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 0.02 108000 IIB seq1 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 0.02 108000 ``` -:::info -Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is actually the pathogenic lingroup that was spiked in, YAY! Note that the other groups in the output all contain this group. -::: + +Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is the pathogenic lingroup that was spiked in, YAY! Note that the other groups in the output all contain this group. + #### Now output the lingroup_report to a file (instead of to the terminal) @@ -444,7 +444,7 @@ Abbreviated results, `barcode5`: | **barcode5** | 31 | 5 | 500 bp | all | | | -**Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** +**Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do not typically trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** ## Summary and concluding thoughts From 9617bea338924e6f29ce69700335502ceb7e228d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 18:06:23 -0800 Subject: [PATCH 53/78] fix typo --- doc/tutorial-lin-taxonomy.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index e4d24f084a..86e155b593 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -404,7 +404,7 @@ final scaled value (max across query and all matches) is 1000 > Use a spreadsheet program on your computer or use `less -S barcode3_31543.k51.prefetch.csv` to see the file on the terminal. If using `less`, hit `q` when you want to exit and return to your terminal prompt. -The first column contains the estimated number of base pairs matched between our query and each matching reference genomes. You'll notice there are four genomes that match 12kb of sequence, one of which is the "correct" genome (with the lineage we were expecting). +The first column contains the estimated number of base pairs matched between our query and each matching reference genome. You'll notice there are four genomes that match 12kb of sequence, one of which is the "correct" genome (with the lineage we were expecting). **What is happening here?** @@ -428,7 +428,7 @@ Abbreviated results, `barcode3`: You can also run the `barcode5` file using the same commands as above and see that no matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. -I then ran this file at higher resolution to see how the results changed. In each case, very few k-mers matched and we could not robustly identify the Ralstonia genome or lingroup. As it turns out, `barcode5` does not have a `Ralstonia` spike-in, so this is a good thing! +I then ran this file at higher resolution to see how the results changed. 
In each case, very few k-mers matched and we could not robustly identify a specific `Ralstonia` genome or lingroup. As it turns out, `barcode5` does not have a `Ralstonia` spike-in, so this is a good thing! Abbreviated results, `barcode5`: From 035e4b2a23de0f2bc9338f24d1c48933cd9dc689 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 18:13:47 -0800 Subject: [PATCH 54/78] better content headers --- doc/tutorial-lin-taxonomy.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 86e155b593..10b2779fcd 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -102,7 +102,9 @@ mv barcode5_36481.sig.zip ./inputs ls inputs ``` -## Start with the `barcode1` (highest spike-in) sample +## Look at the signatures + +Let's start with the `barcode1` (highest spike-in) sample ### First, let's look at the metagenome signature. @@ -309,7 +311,7 @@ sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ > You should see `saving 'lingroup_report' output to 'barcode1.lingroup_report.tsv'` in the output. -#### Optionally, output multiple output formats +#### Optionally, write multiple output formats You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. Note that `lingroup_report` will be generated automatically if you specify the `--lingroups` file. @@ -424,7 +426,7 @@ Abbreviated results, `barcode3`: -## barcode5 +### Now try barcode5 You can also run the `barcode5` file using the same commands as above and see that no matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. From 39b6010374c99fcf89eb1df921b91cb4430cfc90 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Mon, 6 Mar 2023 18:20:16 -0800 Subject: [PATCH 55/78] add refs for sourmash tax --- doc/tutorial-lin-taxonomy.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 10b2779fcd..5b645cd42e 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -12,6 +12,10 @@ requires sourmash v4.8+ :depth: 2 ``` +This tutorial uses the `sourmash taxonomy` module, which was introduced via a [blog post](https://bluegenes.github.io/sourmash-tax/) +and was recently shown to perform well for taxonomic profiling of long (and short!) reads in [Evaluation of taxonomic classification and profiling methods for long-read shotgun metagenomic sequencing datasets](https://link.springer.com/article/10.1186/s12859-022-05103-0), Portik et al., 2022. + + In this tutorial, we'll use sourmash gather to analyze metagenomes using the [LIN taxonomic framework](https://dl.acm.org/doi/pdf/10.1145/3535508.3545546). Specifically, we will analyze plant metagenomes with a low-level pathogen spike-in. The goal is to see if we can correctly assign the pathogen sequence to its LINgroup, which includes From 8f8d9a635fa7d1396f021a4549aa2884872c8e4d Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 09:04:30 -0800 Subject: [PATCH 56/78] simplify lingroup file colnames and lingroup report name --- doc/command-line.md | 6 ++-- doc/databases.md | 2 +- doc/tutorial-lin-taxonomy.md | 46 +++++++++++++++--------------- src/sourmash/cli/tax/metagenome.py | 8 +++--- src/sourmash/cli/utils.py | 14 ++++----- src/sourmash/tax/__main__.py | 10 +++---- src/sourmash/tax/tax_utils.py | 14 ++++----- tests/test_tax.py | 21 +++++++------- tests/test_tax_utils.py | 18 ++++++------ 9 files changed, 69 insertions(+), 70 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 006cee050b..ea43df3844 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -714,13 +714,13 @@ example sourmash `{output-name}.kreport.txt`: ``` -#### `lingroup_report` output format +#### `lingroup` output format -When using LIN taxonomic information, you can optionally also provide a `lingroups` with `lingroup_name` and `lingroup_prefix` columns. If provided, we will output a `lingroup_report` of the format `{base}.lingroup_report.tsv`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output includes just the subset of LIN positions that match the provided prefixes (selected from the full summary). The output will the `lingroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this lingroup and all descendents, and `num_bp_contained`, the estimated number of base pairs contained in this lingroup and all descendents. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. +When using LIN taxonomic information, you can optionally also provide a `lingroups` with `name` and `lin` columns. 
If provided, we will output a `lingroup` report of the format `{base}.lingroup.tsv`, where `{base}` is the name provided via the `-o`, `--output-base` option. This output includes just the subset of LIN positions that match the provided prefixes (selected from the full summary). The output will contain the `lingroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this lingroup and all descendents, and `num_bp_contained`, the estimated number of base pairs contained in this lingroup and all descendents. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. example output: ``` -lingroup_name lingroup_prefix percent_containment num_bp_contained +name lin percent_containment num_bp_contained lg1 0;0;0 5.82 714000 lg2 1;0;0 5.05 620000 lg3 2;0;0 1.56 192000 diff --git a/doc/databases.md b/doc/databases.md index da77f84de8..38d9430764 100644 --- a/doc/databases.md +++ b/doc/databases.md @@ -20,7 +20,7 @@ Note that the SBT and LCA databases can be used with sourmash v3.5 and later, wh For each prepared database, we have also made taxonomic information available linking each genome with its assigned lineage (`GTDB` or `NCBI` as appropriate). For private databases, users can create their own `taxonomy` files: the critical columns are `ident`, containing the genome accession (e.g. `GCA_1234567.1`) and a column for each taxonomic rank, `superkingdom` to `species`. If a `strain` column is provided, it will also be used. -As of v4.8, we can also use LIN taxonomic information in tax commands that accept the `--lins` flag. If used, `sourmash tax` commands will require a `lin` column in the taxonomy file which should contain `;`-separated LINs, preferably with a standard number of positions (e.g. all 20 positions in length or all 10 positions in length). 
Some taxonomy commands also accept a `lingroups` file, which is a two-column file (`lingroup_name`, `lingroup_prefix`) describing the name and LIN prefix of LINgroups to be used for taxonomic summarization. +As of v4.8, we can also use LIN taxonomic information in tax commands that accept the `--lins` flag. If used, `sourmash tax` commands will require a `lin` column in the taxonomy file which should contain `;`-separated LINs, preferably with a standard number of positions (e.g. all 20 positions in length or all 10 positions in length). Some taxonomy commands also accept a `lingroups` file, which is a two-column file (`name`, `lin`) describing the name and LIN prefix of LINgroups to be used for taxonomic summarization. ## Downloading and using the databases diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 5b645cd42e..ff216d6f1e 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -217,7 +217,7 @@ These commands will show the first few lines of each file. If you prefer, you ca - **taxonomy_csv:** `databases/ralstonia-lin.taxonomy.GCA-GCF.csv` - the essential columns are `lin` (`14;1;0;...`) and `ident` (`GCF_00`...) - **lingroups information:** `databases/ralstonia.lingroups.csv` - - both columns are essential (`lingroup_name`, `lingroup_prefix`) + - both columns are essential (`name`, `lin`) Look at the taxonomy file: @@ -244,24 +244,24 @@ head -n5 databases/ralstonia.lingroups.csv You should see: ``` -lingroup_name,lingroup_prefix +name,lin Phyl II,14;1;0;0;0;3;0 Phyl IIA,14;1;0;0;0;3;0;1;0;0 Phyl IIB,14;1;0;0;0;3;0;0 Phyl IIB seq1 and seq2,14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 ``` > Here, we have two columns: -> - `lingroup_name` - the name for each lingroup. -> - `lingroup_prefix` - the LIN prefix corresponding to each group. +> - `name` - the name for each lingroup. +> - `lin` - the LIN prefix corresponding to each group. 
### Now, run `sourmash tax metagenome` to integrate taxonomic information into `gather` results -Using the `gather` output we generated above, we can integrate taxonomic information and summarize up "ranks" (LIN positions). We can produce several different types of outputs, including a `lingroup_report`. +Using the `gather` output we generated above, we can integrate taxonomic information and summarize up "ranks" (LIN positions). We can produce several different types of outputs, including a `lingroup` report. -`lingroup_report` format summarizes the taxonomic information at the provided `lingroup` levels, and produces a report with 4 columns: -- `lingroup_name` (from lingroups file) -- `lingroup_prefix` (from lingroups file) +`lingroup` format summarizes the taxonomic information at the provided `lingroup` levels, and produces a report with 4 columns: +- `name` (from lingroups file) +- `lin` (from lingroups file) - `percent_containment` - total % of the file matched to this lingroup - `num_bp_contained` - estimated number of bp matched to this lingroup @@ -274,8 +274,8 @@ taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ - --lins --lingroups $lingroups_csv \ - -F lingroup_report + --lins --lingroup $lingroups_csv \ + -F lingroup ``` You should see: @@ -288,7 +288,7 @@ Read 11 lingroup rows and found 11 distinct lingroup prefixes. 
and the results: ``` -lingroup_name lingroup_prefix percent_containment num_bp_contained +name lin percent_containment num_bp_contained Phyl II 14;1;0;0;0;3;0 0.02 108000 Phyl IIB 14;1;0;0;0;3;0;0 0.02 108000 Phyl IIB seq1 and seq2 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 0.02 108000 @@ -299,7 +299,7 @@ Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is the p -#### Now output the lingroup_report to a file (instead of to the terminal) +#### Now output the lingroup report to a file (instead of to the terminal) use `-o` to provide an output basename for taxonomic output. @@ -309,25 +309,25 @@ taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ - --lins --lingroups $lingroups_csv \ - -F lingroup_report -o "barcode1" + --lins --lingroup $lingroups_csv \ + -F lingroup -o "barcode1" ``` -> You should see `saving 'lingroup_report' output to 'barcode1.lingroup_report.tsv'` in the output. +> You should see `saving 'lingroup' output to 'barcode1.lingroup.tsv'` in the output. #### Optionally, write multiple output formats -You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. Note that `lingroup_report` will be generated automatically if you specify the `--lingroups` file. +You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. Note that `lingroup` will be generated automatically if you specify the `--lingroup` file. 
Run: ``` gather_csv_output="barcode1_22141.k51.gather.csv" taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" -lingroups_csv="databases/ralstonia.lingroups.csv" +lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ - --lins --lingroups $lingroups_csv \ - -F lingroup_report csv_summary -o "barcode1" + --lins --lingroup $lingroups_csv \ + -F lingroup csv_summary -o "barcode1" ``` @@ -335,7 +335,7 @@ You should see the following in the output: ``` saving 'csv_summary' output to 'barcode1.summarized.csv'. -saving 'lingroup_report' output to 'barcode1.lingroup_report.txt'. +saving 'lingroup' output to 'barcode1.lingroup.tsv'. ``` The `csv_summary` format is the **full** summary of this sample, e.g. the summary at each taxonomic rank (LIN position). It also includes an entry with the `unclassified` portion at each rank. @@ -344,7 +344,7 @@ The `csv_summary` format is the **full** summary of this sample, e.g. the summar Abbreviated Results, `barcode1`: -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | | ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | | **barcode1** | 51 | 1000 | 105 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | | **barcode1** | 31 | 1000 | 173 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | @@ -421,7 +421,7 @@ To see if we could robustly assign the correct sequevar for `barcode3` using a h Abbreviated results, `barcode3`: -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | | ------------ | --------- | ---------- | ---------------- | 
-------------------- | ------------ | ---------------------------------- | | **barcode3** | 51 | 1000 | 12kb | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | | **barcode3** | 31 | 1000 | 28 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | @@ -438,7 +438,7 @@ I then ran this file at higher resolution to see how the results changed. In eac Abbreviated results, `barcode5`: -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lingroup_prefix** | +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | | ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | | **barcode5** | 51 | 1000 | 1 kbp | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | | **barcode5** | 31 | 1000 | 0 | N/A | | | diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 31b2bab2e5..709e4510e5 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -69,7 +69,7 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup_report"], + choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup"], help='choose output format(s)', ) subparser.add_argument( @@ -78,11 +78,11 @@ def subparser(subparsers): ) subparser.add_argument( '--lins', '--lin-taxonomy', action='store_true', default=False, - help='use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain LIN lineage information.' + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." 
) subparser.add_argument( - '--lingroups', metavar='FILE', default=None, - help='CSV containing lingroup_name, lingroup_prefix. Will produce a "lingroup_report" file containing taxonomic summarization for each lingroup.' + '--lingroup', metavar='FILE', default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group." ) add_rank_arg(subparser) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index f550c0e343..22c0c4844c 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -164,14 +164,14 @@ def check_tax_outputs(args, rank_required = ["krona"]): if any(x in rank_required for x in args.output_format): raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.") - # check that `--lins` is specified and `--lingroups` file exists if needed + # check that `--lins` is specified and `--lingroup` file exists if needed if args.lins: - if args.lingroups: - if "lingroup_report" not in args.output_format: - args.output_format.append("lingroup_report") - elif "lingroup_report" in args.output_format: - raise ValueError(f"Must provide lingroups csv via '--lingroups' in order to output a lingroup_report.") - elif args.lingroups or "lingroup_report" in args.output_format: + if args.lingroup: + if "lingroup" not in args.output_format: + args.output_format.append("lingroup") + elif "lingroup" in args.output_format: + raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.") + elif args.lingroup or "lingroup" in args.output_format: raise ValueError(f"Must enable LIN taxonomy via '--lins' in order to use lingroups.") # check that only one output format is specified if writing to stdout diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 9efafe6862..dc01abaded 100644 --- a/src/sourmash/tax/__main__.py +++ 
b/src/sourmash/tax/__main__.py @@ -42,7 +42,7 @@ 'human': '.human.txt', 'lineage_csv': '.lineage.csv', 'kreport': ".kreport.txt", - 'lingroup_report': ".lingroup_report.tsv" + 'lingroup': ".lingroup.tsv" } def make_outfile(base, output_type, *, output_dir = ""): @@ -168,16 +168,16 @@ def metagenome(args): tax_utils.write_output(header, kreport_results, out_fp, sep="\t", write_header=False) # write summarized --> LINgroup output tsv - if "lingroup_report" in args.output_format: + if "lingroup" in args.output_format: try: - lingroups = tax_utils.read_lingroups(args.lingroups) + lingroups = tax_utils.read_lingroups(args.lingroup) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) - lingroup_reportfile, limit_float = make_outfile(args.output_base, "lingroup_report", output_dir=args.output_dir) + lingroupfile, limit_float = make_outfile(args.output_base, "lingroup", output_dir=args.output_dir) - with FileOutputCSV(lingroup_reportfile) as out_fp: + with FileOutputCSV(lingroupfile) as out_fp: header, lgreport_results = single_query_results.make_lingroup_results(LINgroupsD = lingroups) tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index be869bbb58..6cb83f5eda 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -620,10 +620,10 @@ def read_lingroups(lingroup_csv): # check for empty file if not header: raise ValueError(f"Cannot read lingroups from '{lingroup_csv}'. 
Is file empty?") - if "lingroup_prefix" not in header or "lingroup_name" not in header: - raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'lingroup_prefix', 'lingroup_name'.") + if "lin" not in header or "name" not in header: + raise ValueError(f"'{lingroup_csv}' must contain the following columns: 'name', 'lin'.") for n, row in enumerate(r): - lingroupD[row['lingroup_prefix']] = row['lingroup_name'] + lingroupD[row['lin']] = row['name'] if n is None: raise ValueError(f'No lingroups loaded from {lingroup_csv}.') @@ -1731,14 +1731,14 @@ def as_kreport_dict(self, query_info): def as_lingroup_dict(self, query_info, lg_name): """ - Produce LINgroup report dict for LINgroups. + Produce lingroup report dict for lingroups. """ sD = {} # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - sD["lingroup_prefix"] = self.lineage.display_lineage() - sD["lingroup_name"] = lg_name + sD["lin"] = self.lineage.display_lineage() + sD["name"] = lg_name return sD @@ -2131,7 +2131,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref Keep LCA paths in order as much as possible. 
""" self.check_summarization() - header = ["lingroup_name", "lingroup_prefix", "percent_containment", "num_bp_contained"] + header = ["name", "lin", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") diff --git a/tests/test_tax.py b/tests/test_tax.py index 086eee8c70..7e5e14fe96 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3412,7 +3412,7 @@ def test_metagenome_LIN_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('lingroup_prefix,lingroup_name\n') + out.write('lin,name\n') out.write('0;0;0,lg1\n') out.write('1;0;0,lg2\n') out.write('2;0;0,lg3\n') @@ -3421,7 +3421,7 @@ def test_metagenome_LIN_lingroups(runtmp): out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroups', lg_file) + '--lins', '--lingroup', lg_file) print(c.last_result.status) print(c.last_result.out) @@ -3430,7 +3430,7 @@ def test_metagenome_LIN_lingroups(runtmp): assert c.last_result.status == 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err assert "Read 5 lingroup rows and found 5 distinct lingroup prefixes." 
in c.last_result.err - assert "lingroup_name lingroup_prefix percent_containment num_bp_contained" in c.last_result.out + assert "name lin percent_containment num_bp_contained" in c.last_result.out assert "lg1 0;0;0 5.82 714000" in c.last_result.out assert "lg2 1;0;0 5.05 620000" in c.last_result.out assert "lg3 2;0;0 1.56 192000" in c.last_result.out @@ -3521,7 +3521,7 @@ def test_metagenome_LIN_lingroups_empty_lg_file(runtmp): with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroups', lg_file) + '--lins', '--lingroup', lg_file) print(c.last_result.status) print(c.last_result.out) @@ -3544,24 +3544,23 @@ def test_metagenome_LIN_lingroups_bad_cli_inputs(runtmp): with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '-F', "lingroup_report") + '--lins', '-F', "lingroup") print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must provide lingroups csv via '--lingroups' in order to output a lingroup_report." in c.last_result.err + assert "Must provide lingroup csv via '--lingroup' in order to output a lingroup report." in c.last_result.err with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "lingroup_report") + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-F', "lingroup") print(c.last_result.err) assert c.last_result.status != 0 assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err - # assert "Must enable LIN taxonomy via '--lins' in order to output a lingroup_report." 
in c.last_result.err with pytest.raises(SourmashCommandFailed): - c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroups', lg_file) + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroup', lg_file) print(c.last_result.err) assert c.last_result.status != 0 assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." in c.last_result.err @@ -3605,11 +3604,11 @@ def test_metagenome_LIN_lingroups_lg_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('lingroup_prefix,lingroup_name\n') + out.write('lin,name\n') with pytest.raises(SourmashCommandFailed): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, - '--lins', '--lingroups', lg_file) + '--lins', '--lingroup', lg_file) print(c.last_result.status) print(c.last_result.out) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index b613880184..412340ae37 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -170,11 +170,11 @@ def test_SummarizedGatherResult_LINs(): lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'lingroup_name': "lg_name", "lingroup_prefix": "0;0;1", + assert lgD == {'name': "lg_name", "lin": "0;0;1", 'percent_containment': '30.00', 'num_bp_contained': "600"} lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'lingroup_name': "lg_name", "lingroup_prefix": "0;0;1", + assert lgD == {'name': "lg_name", "lin": "0;0;1", 'percent_containment': '30.00', 'num_bp_contained': "600"} with pytest.raises(ValueError) as exc: sgr.as_kreport_dict(query_info=qInf) @@ -2791,14 +2791,14 @@ def test_make_lingroup_results(): header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) print(header) - assert header == ['lingroup_name', 'lingroup_prefix', 'percent_containment', 'num_bp_contained'] + assert header == ['name', 'lin', 'percent_containment', 
'num_bp_contained'] # order may change, just check that each lg entry is present in list of results lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', - 'lingroup_prefix': '1', 'lingroup_name': 'lg1'} + 'lin': '1', 'name': 'lg1'} lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', - 'lingroup_prefix': '1;0', 'lingroup_name': 'lg2'} + 'lin': '1;0', 'name': 'lg2'} lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', - 'lingroup_prefix': '1;1', 'lingroup_name': 'lg3'} + 'lin': '1;1', 'name': 'lg3'} assert lg1 in lgD assert lg2 in lgD assert lg3 in lgD @@ -2818,7 +2818,7 @@ def test_make_lingroup_results_fail_pre_v450(): def test_read_lingroups(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('lingroup_prefix,lingroup_name\n') + out.write('lin,name\n') out.write('1,lg1\n') out.write('1;0,lg2\n') out.write('1;1,lg3\n') @@ -2839,7 +2839,7 @@ def test_read_lingroups_empty_file(runtmp): def test_read_lingroups_only_header(runtmp): lg_file = runtmp.output("test.lg.csv") with open(lg_file, 'w') as out: - out.write('lingroup_prefix,lingroup_name\n') + out.write('lin,name\n') with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) @@ -2853,7 +2853,7 @@ def test_read_lingroups_bad_header(runtmp): with pytest.raises(ValueError) as exc: read_lingroups(lg_file) print(str(exc)) - assert f"'{lg_file}' must contain the following columns: 'lingroup_prefix', 'lingroup_name'." in str(exc) + assert f"'{lg_file}' must contain the following columns: 'name', 'lin'." in str(exc) def test_LineageTree_init(): From 10e5dda5fa72abc51ecb86dddb6eae6475709c7b Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 09:09:13 -0800 Subject: [PATCH 57/78] flex --- src/sourmash/cli/tax/metagenome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 709e4510e5..cbcca18fad 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -81,7 +81,7 @@ def subparser(subparsers): help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." ) subparser.add_argument( - '--lingroup', metavar='FILE', default=None, + '--lingroup', '--lingroups', metavar='FILE', default=None, help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will produce a 'lingroup' report containing taxonomic summarization for each group." ) add_rank_arg(subparser) From aac866965606d9530bdcdf698f9c7c514d82fdca Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 10:59:35 -0800 Subject: [PATCH 58/78] more description --- doc/tutorial-lin-taxonomy.md | 131 ++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 47 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index ff216d6f1e..0d80ac53c0 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -13,7 +13,7 @@ requires sourmash v4.8+ ``` This tutorial uses the `sourmash taxonomy` module, which was introduced via [blog post](https://bluegenes.github.io/sourmash-tax/) -and was recently shown to perfom well for taxonomic profiling of long (and short!) reads in [Evaluation of taxonomic classification and profiling methods for long-read shotgun metagenomic sequencing datasets](https://link.springer.com/article/10.1186/s12859-022-05103-0), Portik et al., 2022. 
+and was recently shown to perform well for taxonomic profiling of long (and short) reads in [Evaluation of taxonomic classification and profiling methods for long-read shotgun metagenomic sequencing datasets](https://link.springer.com/article/10.1186/s12859-022-05103-0), Portik et al., 2022. In this tutorial, we'll use sourmash gather to analyze metagenomes using the [LIN taxonomic framework](https://dl.acm.org/doi/pdf/10.1145/3535508.3545546). @@ -27,7 +27,7 @@ all known pathogenic strains. The pathogen is `Ralstonia solanacearum` in the `Phylum IIB sequevar 1` group. -This data is courtesy of [The Laboratory of Plant & Atmospheric Microbiology & (Meta)Genomics](https://sites.google.com/vt.edu/lab-vinatzer/home). +This data is courtesy of [The Laboratory of Plant & Atmospheric Microbiology & (Meta)Genomics](https://sites.google.com/vt.edu/lab-vinatzer/home) in collaboration with USDA APHIS. ## Install sourmash @@ -47,7 +47,7 @@ conda activate smash ``` > Victory conditions: your prompt should start with -> `(smash) ` +> `(smash) ` > and you should now be able to run `sourmash` and have it output usage information!! ## Create a working subdirectory @@ -168,7 +168,7 @@ There's a lot of things to digest in this output but the two main ones are: ## Run sourmash gather using ksize 51 Now let's run `sourmash gather` to find the closest reference genome(s) in the database. -If you want to read more about what, exactly, sourmash is doing, please see [Lightweight compositional analysis of metagenomes with FracMinHash and minimum metagenome covers](https://www.biorxiv.org/content/10.1101/2022.01.11.475838v2), Irber et al., 2022. +If you want to read more about what sourmash is doing, please see [Lightweight compositional analysis of metagenomes with FracMinHash and minimum metagenome covers](https://www.biorxiv.org/content/10.1101/2022.01.11.475838v2), Irber et al., 2022. Run: ``` @@ -201,14 +201,11 @@ the recovered matches hit 0.0% of the abundance-weighted query.
the recovered matches hit 0.0% of the query k-mers (unweighted). ``` -We only had one match, and it was a very small percentage of the total dataset. This is expected, -since the dataset is a plant metagenome with a small `Ralstonia` spike-in. +The first step of gather found all potential matches (7), and the greedy algorithm narrowed this to a single best match, `GCA_002251655.1` which shared an estimated 105 kbp with the metagenome (a very small percentage of the total dataset.) This is expected, though, since the dataset is a plant metagenome with a small `Ralstonia` spike-in. ## Add taxonomic information and summarize up lingroups -`sourmash gather` finds the smallest set of reference genomes that contains all the known information (k-mers) in the metagenome. -In most cases, `gather` will find many metagenome matches. Here, we're only looking for `Ralstonia` matches and we only have a -single match. Regardless, let's use `sourmash tax metagenome` to add taxonomic information and see if we've correctly assigned the pathogenic sequence. +`sourmash gather` finds the smallest set of reference genomes that contains all the known information (k-mers) in the metagenome. In most cases, `gather` will find many metagenome matches. Here, we're only looking for `Ralstonia` matches and we only have a single gather result. Regardless, let's use `sourmash tax metagenome` to add taxonomic information and see if we've correctly assigned the pathogenic sequence. ### First, let's look at the relevant taxonomy files. @@ -257,9 +254,9 @@ Phyl IIB seq1 and seq2,14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 ### Now, run `sourmash tax metagenome` to integrate taxonomic information into `gather` results -Using the `gather` output we generated above, we can integrate taxonomic information and summarize up "ranks" (LIN positions). We can produce several different types of outputs, including a `lingroup` report. 
+Using the `gather` output we generated above, we can integrate taxonomic information and summarize up "ranks" (lin positions). We can produce several different types of outputs, including a `lingroup` report. -`lingroup` format summarizes the taxonomic information at the provided `lingroup` levels, and produces a report with 4 columns: +`lingroup` format summarizes the taxonomic information at each `lingroup`, and produces a report with 4 columns: - `name` (from lingroups file) - `lin` (from lingroups file) - `percent_containment` - total % of the file matched to this lingroup @@ -274,8 +271,7 @@ taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ - --lins --lingroup $lingroups_csv \ - -F lingroup + --lins --lingroup $lingroups_csv ``` You should see: @@ -295,8 +291,7 @@ Phyl IIB seq1 and seq2 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0 0.02 108000 IIB seq1 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 0.02 108000 ``` -Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is the pathogenic lingroup that was spiked in, YAY! Note that the other groups in the output all contain this group. - +Here, the most specific lingroup we assign to is `Phyl IIB seq1`, which is the pathogenic lingroup that was spiked in, yay! Note that the other groups in the output all contain this group. #### Now output the lingroup report to a file (instead of to the terminal) @@ -310,14 +305,14 @@ lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ --lins --lingroup $lingroups_csv \ - -F lingroup -o "barcode1" + -o "barcode1" ``` > You should see `saving 'lingroup' output to 'barcode1.lingroup.tsv'` in the output. #### Optionally, write multiple output formats -You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. 
Note that `lingroup` will be generated automatically if you specify the `--lingroup` file. +You can use `-F` to specify additional output formats. Here, I've added `csv_summary`. Note that while the `lingroup` format will be generated automatically if you specify the `--lingroup` file, you can also specify it with `-F lingroup` if you want, as I've done here. Run: ``` @@ -342,12 +337,12 @@ The `csv_summary` format is the **full** summary of this sample, e.g. the summar > Note: Multiple output formats require the `-o` `--output-base` to be specified, as each must be written to a file. -Abbreviated Results, `barcode1`: +Here's an abbreviated version of the `gather` results for `barcode1`, with lingroup information added: -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | -| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | -| **barcode1** | 51 | 1000 | 105 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | -| **barcode1** | 31 | 1000 | 173 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | +| ------- | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | +| **bc1** | 51 | 1000 | 105 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **bc1** | 31 | 1000 | 173 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | ### Now run with `barcode3` sample @@ -363,7 +358,23 @@ gather_csv_output="barcode3_31543.dna.k51.gather.csv" sourmash gather $query $database -k 51 -o $gather_csv_output ``` -#### we found no matches! But, we can lower the detection threshold: +You should see: +``` +selecting specified query k=51 +loaded query: barcode3_31543... (k=51, DNA) +loading from 'databases/ralstonia.zip'... 
+loaded 81 total signatures from 1 locations. +after selecting signatures compatible with search, 27 remain. +Starting prefetch sweep across databases. +Found 0 signatures via prefetch; now doing gather. +found less than 50.0 kbp in common. => exiting + + +found 0 matches total; +the recovered matches hit 0.0% of the query k-mers (unweighted). +``` + +#### gather found no sequence matches! But, we can lower the detection threshold: ``` query="inputs/barcode3_31543.sig.zip" @@ -374,10 +385,30 @@ gather_csv_output="barcode3_31543.k51.gather.csv" sourmash gather $query $database -k 51 --threshold-bp 10000 -o $gather_csv_output ``` -We have a match but it's not the right one! If you run `sourmash tax metagenome` on this output, you'll see that this genome belongs to `Phyl IIB seq 2` group, which is a sister group to the correct `Phyl IIB seq` group that we expected. +This time, you should see: +``` +selecting specified query k=51 +loaded query: barcode3_31543... (k=51, DNA) +loading from 'databases/ralstonia.zip'... +loaded 81 total signatures from 1 locations. +after selecting signatures compatible with search, 27 remain. +Starting prefetch sweep across databases. +Found 6 signatures via prefetch; now doing gather. + +overlap p_query p_match avg_abund +--------- ------- ------- --------- +12.0 kbp 0.0% 0.2% 1.0 GCA_000750575.1 Ralstonia solanacear... -### Dig in a bit to see what might have happened +found 1 matches total; +the recovered matches hit 0.0% of the abundance-weighted query. +the recovered matches hit 0.0% of the query k-mers (unweighted). + +``` + +You'll notice that while we have an estimated ~12kbp overlap, the matched genome (`GCA_000750575.1`) is different from the one matched above for `barcode1`. If you run `sourmash tax metagenome` on this output, you'll see that this genome belongs to the `Phyl IIB seq 2` group, which is a sister group to the correct `Phyl IIB seq 1` group that we expected. So we have a match but it's not the right one -- why not?
+ +### What happened? Use `prefetch` to investigate `sourmash gather` has two steps: first, it runs a `prefetch` to find ALL genome matches, and then uses a greedy approach to select the smallest set of genomes that contain ('cover') all known sequence content. Let's run `prefetch` independently so we can look at the results of the first step. Here, let's use `--threshold-bp 0` to get all possible matches. @@ -406,11 +437,13 @@ a total of 487031 query hashes remain unmatched. final scaled value (max across query and all matches) is 1000 ``` -#### Open the `barcode3_31543.k51.prefetch.csv` file to see what it looks like +Here, the output is telling us we found matches to 15 of the 27 Ralstonia genomes. But only **12 k-mers** were shared between the metagenome sample and the genomes. Remember that sourmash uses a representative subsample of all k-mers, so here these 12 k-mers represent ~ 12kb of sequence (12 * scaled). We've found that this is sufficient to detect presence of an organism, but at this low level, it can be hard to distinguish between closely-related genomes. Let's open the prefetch output to see how those 12 k-mers matched between different genomes. + +#### Look at the `barcode3_31543.k51.prefetch.csv` file > Use a spreadsheet program on your computer or use `less -S barcode3_31543.k51.prefetch.csv` to see the file on the terminal. If using `less`, hit `q` when you want to exit and return to your terminal prompt. -The first column contains the estimated number of base pairs matched between our query and each matching reference genome. You'll notice there are four genomes that match 12kb of sequence, one of which is the "correct" genome (with the lineage we were expecting). +The first column contains the estimated number of base pairs matched between our query and each matching reference genome. You'll notice there are four genomes that match 12kb of sequence, one of which is the "correct" genome (`GCA_002251655.1`, which is in the `IIB seq1` lingroup). 
**What is happening here?** @@ -418,36 +451,40 @@ When faced with equally good matches, `sourmash gather` makes a random choice ab To see if we could robustly assign the correct sequevar for `barcode3` using a higher resolution sketch, I also ran `gather` using scaled=100. -Abbreviated results, `barcode3`: +Here's an abbreviated version of the `gather` results for `barcode3`, with lingroup information added: + + +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | +| ------- | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | +| **bc3** | 51 | 1000 | 12kb | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | +| **bc3** | 31 | 1000 | 28 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **bc3** | 51 | 100 | 14.8 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +| **bc3** | 31 | 100 | 21.1 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | -| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | -| **barcode3** | 51 | 1000 | 12kb | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | -| **barcode3** | 31 | 1000 | 28 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | -| **barcode3** | 51 | 100 | 14.8 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | -| **barcode3** | 31 | 100 | 21.1 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | +We typically use k=51 for strain-level matching and k=31 for species-level matching. Notice that running at k=31 with scaled 1000 found the right match. 
However, if you run prefetch for this sample, you see there are three matches with `28kb` overlap, so we just got lucky that `gather` selected the "right" one for this case. +In contrast, running at `scaled=100` had sufficient information to correctly assign the sequence to the `IIB seq1` lingroup. -### Now try barcode5 +### Now try the `barcode5` sample You can also run the `barcode5` file using the same commands as above and see that no matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. I then ran this file at higher resolution to see how the results changed. In each case, very few k-mers matched and we could not robustly identify a specific `Ralstonia` genome or lingroup. As it turns out, `barcode5` does not have a `Ralstonia` spike-in, so this is a good thing! -Abbreviated results, `barcode5`: +Here's an abbreviated version of the `gather` results for `barcode5`, with lingroup information added in cases with a single gather match: -| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | -| ------------ | --------- | ---------- | ---------------- | -------------------- | ------------ | ---------------------------------- | -| **barcode5** | 51 | 1000 | 1 kbp | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | -| **barcode5** | 31 | 1000 | 0 | N/A | | | -| **barcode5** | 51 | 100 | 300bp | all | | | -| **barcode5** | 31 | 100 | 1.2 kb | all | | | -| **barcode5** | 51 | 10 | 120 bp | all | | | -| **barcode5** | 31 | 10 | 670 bp | all | | | -| **barcode5** | 51 | 5 | 150 bp | all | | | -| **barcode5** | 31 | 5 | 500 bp | all | | | +| | **ksize** | **scaled** | **best overlap** | **gather match(es)** | **lingroup** | **lin** | +| ------- | --------- | ---------- | ---------------- | -------------------- | ------------ | 
---------------------------------- | +| **bc5** | 51 | 1000 | 1 kbp | GCA_000750575.1 | IIB seq2 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;1;0 | +| **bc5** | 31 | 1000 | 0 | N/A | | | +| **bc5** | 51 | 100 | 300bp | all | | | +| **bc5** | 31 | 100 | 1.2 kb | all | | | +| **bc5** | 51 | 10 | 120 bp | all | | | +| **bc5** | 31 | 10 | 670 bp | all | | | +| **bc5** | 51 | 5 | 150 bp | all | | | +| **bc5** | 31 | 5 | 500 bp | all | | | **Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do not typically trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** @@ -457,4 +494,4 @@ Abbreviated results, `barcode5`: The LIN taxonomic framework may be useful distinguishing groups below the species level. We can now use LINs and lingroups with `sourmash tax metagenome`. For low level matches, the gather greedy approach can struggle. We are working on ways to better warn users about this behavior and welcome -feedback, issues, or suggestions on our [issue tracker](https://github.com/sourmash-bio/sourmash/issues/new). \ No newline at end of file +feedback and suggestions on our [issue tracker](https://github.com/sourmash-bio/sourmash/issues/new). \ No newline at end of file From 2751ebe6e3de980a8ae064bdc752d475db45ae60 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 11:35:15 -0800 Subject: [PATCH 59/78] more description for tutorial --- doc/tutorial-lin-taxonomy.md | 45 ++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/doc/tutorial-lin-taxonomy.md b/doc/tutorial-lin-taxonomy.md index 0d80ac53c0..5512a6115f 100644 --- a/doc/tutorial-lin-taxonomy.md +++ b/doc/tutorial-lin-taxonomy.md @@ -318,7 +318,7 @@ Run: ``` gather_csv_output="barcode1_22141.k51.gather.csv" taxonomy_csv="databases/ralstonia-lin.taxonomy.GCA-GCF.csv" -lingroups_csv="databases/ralstonia.lingroup.csv" +lingroups_csv="databases/ralstonia.lingroups.csv" sourmash tax metagenome -g $gather_csv_output -t $taxonomy_csv \ --lins --lingroup $lingroups_csv \ @@ -447,9 +447,9 @@ The first column contains the estimated number of base pairs matched between our **What is happening here?** -When faced with equally good matches, `sourmash gather` makes a random choice about which genome to assign these k-mers to. This happens primarily with highly similar genomes and/or very small sequence matches. If this happens and you need to distinguish between these genomes, we recommend trying a lower scaled value. +When faced with equally good matches, `sourmash gather` makes a random choice about which genome to assign these k-mers to. This happens primarily with highly similar genomes and/or very small sequence matches. If this happens and you need to distinguish between these genomes, we recommend trying a lower scaled value (higher resolution). "scaled" refers to the systematic downsampling: we keep roughly 1/scaled k-mers (`scaled=1000` keeps ~1 of every 1000 unique k-mers). `scaled=1` keeps all k-mers, but our signature storage is not optimized for this use case. -To see if we could robustly assign the correct sequevar for `barcode3` using a higher resolution sketch, I also ran `gather` using scaled=100.
+To see if we could robustly assign the correct sequevar for `barcode3` using a higher resolution sketch, I also ran `gather` using `scaled=100`. Here's an abbreviated version of the `gather` results for `barcode3`, with lingroup information added: @@ -462,14 +462,43 @@ Here's an abbreviated version of the `gather` results for `barcode3`, with ling | **bc3** | 51 | 100 | 14.8 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | | **bc3** | 31 | 100 | 21.1 kb | GCA_002251655.1 | IIB seq1 | 14;1;0;0;0;3;0;0;0;0;1;0;0;0;0;0;0 | -We typically use k=51 for strain-level matching and k=31 for species-level matching. Notice that running at k=31 with scaled 1000 found the right match. However, if you run prefetch for this sample, you see there are three matches with `28kb` overlap, so we just got lucky that `gather` selected the "right" one for this case. +We typically use k=51 for strain-level matching and k=31 for species-level matching. Notice that running at k=31 with scaled 1000 found the right match. However, if you run prefetch for `k=31`, you see there are three matches with `28kb` overlap, so we just got lucky that `gather` selected the right one for this test case. -In contrast, running at `scaled=100` had sufficient information to correctly assign the sequence to the `IIB seq1` lingroup. +In contrast, by sketching the `Ralstonia` genomes and metagenome at higher resolution (`scaled=100`), we had sufficient information to correctly assign the sequence to the `IIB seq1` lingroup at either ksize. ### Now try the `barcode5` sample -You can also run the `barcode5` file using the same commands as above and see that no matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. 
+You can also run the `barcode5` file using the same commands as above: + +``` +query="inputs/barcode5_36481.sig.zip" +database="databases/ralstonia.zip" + +gather_csv_output="barcode5_36481.dna.k51.gather.csv" + +sourmash gather $query $database -k 51 -o $gather_csv_output +``` + +You should see: + +``` +selecting specified query k=51 +loaded query: barcode5_36481... (k=51, DNA) +-- +loaded 81 total signatures from 1 locations. +after selecting signatures compatible with search, 27 remain. + +Starting prefetch sweep across databases. +Found 0 signatures via prefetch; now doing gather. +found less than 50.0 kbp in common. => exiting + +found 0 matches total; +the recovered matches hit 0.0% of the query k-mers (unweighted). +``` + + +No matches are found. If you drop the threshold-bp to 0 (`--threshold-bp 0`), you can find ~1kbp overlap (a single k-mer match!). **Note, we do not recommend trusting/using results with fewer than 3 k-mer matches (3kbp at scaled=1000)**. Especially in larger databases (e.g. NCBI/GTDB), a single k-mer match might actually be from contamination in the reference genome rather than true genome content, so you may end up assigning the wrong lineage. Requiring 3 k-mers (representing ~3kb of matching sequence) makes it more likely your matches represent true genome content. I then ran this file at higher resolution to see how the results changed. In each case, very few k-mers matched and we could not robustly identify a specific `Ralstonia` genome or lingroup. As it turns out, `barcode5` does not have a `Ralstonia` spike-in, so this is a good thing! 
@@ -487,7 +516,9 @@ Here's an abbreviated version of the `gather` results for `barcode5`, with ling | **bc5** | 31 | 5 | 500 bp | all | | | -**Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do not typically trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** +**Again, while I've used a threshold-bp of 0 to get the gather match at scaled=1000, we do not typically trust gather matches with less than `3*scaled` overlap (< 3 k-mers matched).** Even at very high resolution (scaled=5), we matched nearly all Ralstonia genomes and could not distinguish a single lingroup. + +We typically recommend running at `scaled=1000` (our default), as this works for most microbial use cases. You can run at higher resolution (lower scaled) if you need to, but higher resolution signatures are larger and can take significantly longer to build and search - use at your own risk :). ## Summary and concluding thoughts From e8cb7a0b83e59f7726c7a1bf8d093fcc459cec2b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 15:14:32 -0800 Subject: [PATCH 60/78] better lingroup output documentation --- doc/command-line.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index ea43df3844..e98e337d8b 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -716,7 +716,14 @@ example sourmash `{output-name}.kreport.txt`: #### `lingroup` output format -When using LIN taxonomic information, you can optionally also provide a `lingroups` with `name` and `lin` columns. If provided, we will output a `lingroup` of the format `{base}.lingroups.tsv`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output includes just the subset of LIN positions that match the provided prefixes (selected from the full summary). 
The output will the `lingroup` info and two additional columns: `percent_containment`, the total percent of the dataset contained in this lingroup and all descendents, and `num_bp_contained`, the estimated number of base pairs contained in this lingroup and all descendents. Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions simply summarize this information. +When using LIN taxonomic information, you can optionally also provide a `lingroup` file with two required columns: `name` and `lin`. If provided, we will produce a file, `{base}.lingroup.tsv`, where `{base}` is the name provided via the `-o`, `--output-base` option. This output selects the information from the full summary that matches the LIN prefixes provided as groups. + +This output format consists of four columns: +- `name`, `lin` columns are taken directly from the `--lingroup` file +- `percent_containment`, the total percent of the dataset contained in this lingroup and all descendents +- `num_bp_contained`, the estimated number of base pairs contained in this lingroup and all descendents. + +Similar to `kreport` above, we use the wording "contained" rather than "assigned," because `sourmash` assigns matches at the genome level, and the `tax` functions summarize this information. example output: ``` @@ -728,7 +735,7 @@ lg3 1;0;1 0.65 80000 lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 ``` -lingroup subpaths will be grouped in output, but exact ordering may change between runs. +Related lingroup subpaths will be grouped in output, but exact ordering may change between runs. ### `sourmash tax genome` - classify a genome using `gather` results From 907b74cceec77a69cd8d54ef48c2cb2e2dc3e2af Mon Sep 17 00:00:00 2001 From: "N.
Tessa Pierce-Ward" Date: Tue, 7 Mar 2023 15:47:04 -0800 Subject: [PATCH 61/78] rank arg tests --- src/sourmash/cli/utils.py | 8 ++++---- tests/test_tax.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 22c0c4844c..4e1e2d9ff6 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -135,17 +135,17 @@ def add_num_arg(parser, default=0): def check_rank(args): - """ Check `--rank`/`--position`/`--lin-position` argument matches selected taxonomy.""" + """ Check '--rank'/'--position'/'--lin-position' argument matches selected taxonomy.""" standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] if args.lins: if args.rank.isdigit(): #if isinstance(args.rank, int): return str(args.rank) - raise argparse.ArgumentTypeError(f"Invalid `--rank`/`--position` input: {args.rank}. `--lins` is specified. Rank must be an integer corresponding to a LIN position.") + raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position.") elif args.rank in standard_ranks: return args.rank else: - raise argparse.ArgumentTypeError(f"Invalid `--rank`/`--position` input: {args.rank}. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'") + raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. 
Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'") def add_rank_arg(parser): @@ -164,7 +164,7 @@ def check_tax_outputs(args, rank_required = ["krona"]): if any(x in rank_required for x in args.output_format): raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.") - # check that `--lins` is specified and `--lingroup` file exists if needed + # check that '--lins' is specified and '--lingroup' file exists if needed if args.lins: if args.lingroup: if "lingroup" not in args.output_format: diff --git a/tests/test_tax.py b/tests/test_tax.py index 7e5e14fe96..b5314bac3c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -420,6 +420,22 @@ def test_metagenome_no_rank_krona(runtmp): assert "Rank (--rank) is required for krona, lineage_summary output formats." in str(exc.value) +def test_metagenome_bad_rank_krona(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', 'NotARank') + print(str(exc.value)) + assert "Invalid '--rank'/'--position' input: 'NotARank'. Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err + + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', '5') + print(str(exc.value)) + assert "Invalid '--rank'/'--position' input: '5'. 
Please choose: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'" in runtmp.last_result.err + + def test_genome_no_rank_krona(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -3509,6 +3525,25 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp): assert "0.7957718388512166 unclassified unclassified unclassified unclassified unclassified unclassified" in c.last_result.out +def test_metagenome_LIN_krona_bad_rank(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, + '--lins', '-F', "krona", '--lin-position', 'strain') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status != 0 + assert "Invalid '--rank'/'--position' input: 'strain'. '--lins' is specified. Rank must be an integer corresponding to a LIN position." in c.last_result.err + + + def test_metagenome_LIN_lingroups_empty_lg_file(runtmp): c = runtmp From 6b2d15cb82d5734d327122a411ba848c84a7f5b5 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 11:41:06 -0800 Subject: [PATCH 62/78] mod annotate for flexibility --- src/sourmash/tax/__main__.py | 93 +++++++++++++++++++--------- src/sourmash/tax/tax_utils.py | 111 ++++++++++++++++++++++------------ tests/test_tax.py | 10 +-- 3 files changed, 141 insertions(+), 73 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index dc01abaded..cc9bf6a279 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -9,11 +9,11 @@ import re import sourmash -from ..sourmash_args import FileOutputCSV, FileOutput +from ..sourmash_args import FileOutputCSV, FileInputCSV, FileOutput from sourmash.logging import set_quiet, error, notify, print_results from . import tax_utils -from .tax_utils import MultiLineageDB, GatherRow, RankLineageInfo, LINLineageInfo +from .tax_utils import MultiLineageDB, RankLineageInfo, LINLineageInfo, AnnotateTaxResult usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. @@ -285,12 +285,13 @@ def annotate(args): set_quiet(args.quiet) - # first, load taxonomic_assignments try: + # first, load taxonomic_assignments tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins) + except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -299,36 +300,68 @@ def annotate(args): error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. 
Exiting.') sys.exit(-1) - # get gather_csvs from args - gather_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) + # get csv from args + input_csvs = tax_utils.collect_gather_csvs(args.gather_csv, from_file=args.from_file) # handle each gather csv separately - for n, g_csv in enumerate(gather_csvs): - query_gather_results = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, - fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, - keep_full_identifiers=args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - lins=args.lins) - - if not query_gather_results: - continue - - out_base = os.path.basename(g_csv.rsplit('.csv')[0]) - this_outfile, limit_float = make_outfile(out_base, "annotate", output_dir=args.output_dir) - - header = [field.name for field in fields(GatherRow)] - with FileOutputCSV(this_outfile) as out_fp: - header.append("lineage") - w = csv.DictWriter(out_fp, header, delimiter=',') - w.writeheader() - - for gather_res in query_gather_results: - for taxres in gather_res.raw_taxresults: - gr = asdict(taxres.raw) - write_gr = {key: gr[key] for key in gr if key in header} - write_gr['lineage'] = taxres.lineageInfo.display_lineage(truncate_empty=True) - w.writerow(write_gr) + for n, in_csv in enumerate(input_csvs): + try: + # Check for a column we can use to find lineage information: + with FileInputCSV(in_csv) as r: + header = r.fieldnames + # check for empty file + if not header: + raise ValueError(f"Cannot read from '{in_csv}'. Is file empty?") + + # look for the column to match with taxonomic identifier + # search: 'name'; prefetch: 'match_name'; gather: 'name'. Also allow: 'ident', 'accession' + id_col = None + col_options = ['name', 'match_name', 'ident', 'accession'] + for colname in col_options: + if colname in header: + id_col = colname + break + + if not id_col: + raise ValueError(f"Cannot find taxonomic identifier column in '{in_csv}'. 
Tried: {', '.join(col_options)}") + + notify(f"Starting annotation on '{in_csv}'. Using ID column: '{id_col}'") + + # make output file for this input + out_base = os.path.basename(in_csv.rsplit('.csv')[0]) + this_outfile, _ = make_outfile(out_base, "annotate", output_dir=args.output_dir) + + out_header = header + ['lineage'] + + with FileOutputCSV(this_outfile) as out_fp: + w = csv.DictWriter(out_fp, out_header) + w.writeheader() + + n = 0 + n_missed = 0 + for n, row in enumerate(r): + # find lineage and write annotated row + taxres = AnnotateTaxResult(raw=row, id_col=id_col, lins=args.lins, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions) + taxres.get_match_lineage(tax_assignments=tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) + + if taxres.missed_ident: + n_missed+=1 + w.writerow(taxres.row_with_lineages()) + + if not n: + raise ValueError(f"Could not annotate any rows from '{in_csv}'.") + else: + notify(f"Annotated {(n+1) - n_missed} of {n+1} total rows from '{in_csv}'.") + except ValueError as exc: + if args.force: + notify(str(exc)) + notify('--force is set. Attempting to continue to next file.') + else: + error(f"ERROR: {str(exc)}") + sys.exit(-1) def prepare(args): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6cb83f5eda..42bf09de95 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1540,8 +1540,78 @@ def __post_init__(self): def total_weighted_bp(self): return self.total_weighted_hashes * self.scaled + +@dataclass +class BaseTaxResult: + """ + Base class for sourmash taxonomic annotation. 
+ """ + raw: dict # csv row + keep_full_identifiers: bool = False + keep_identifier_versions: bool = False + match_ident: str = field(init=False) + skipped_ident: bool = False + missed_ident: bool = False + match_lineage_attempted: bool = False + lins: bool = False + + def get_ident(self, id_col=None): + # split identifiers = split on whitespace + # keep identifiers = don't split .[12] from assembly accessions + "Hack and slash identifiers." + if id_col: + self.match_ident = self.raw[id_col] + else: + self.match_ident = self.raw.name + if not self.keep_full_identifiers: + self.match_ident = self.match_ident.split(' ')[0] + else: + #overrides version bc can't keep full without keeping version + self.keep_identifier_versions = True + if not self.keep_identifier_versions: + self.match_ident = self.match_ident.split('.')[0] + + + def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): + if skip_idents and self.match_ident in skip_idents: + self.skipped_ident = True + else: + lin = tax_assignments.get(self.match_ident) + if lin: + if self.lins: + self.lineageInfo = LINLineageInfo(lineage = lin) + else: + self.lineageInfo = RankLineageInfo(lineage = lin) + else: + self.missed_ident=True + self.match_lineage_attempted = True + if self.missed_ident and fail_on_missing_taxonomy: + raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") + + @dataclass -class TaxResult: +class AnnotateTaxResult(BaseTaxResult): + """ + Class to enable taxonomic annotation of any sourmash CSV. 
+ """ + id_col: str = 'name' + + def __post_init__(self): + self.get_ident(id_col=self.id_col) + if self.lins: + self.lineageInfo = LINLineageInfo() + else: + self.lineageInfo = RankLineageInfo() + + def row_with_lineages(self): + lineage = self.lineageInfo.display_lineage(truncate_empty=True) + rl = {"lineage": lineage} + rl.update(self.raw) + return rl + + +@dataclass +class TaxResult(BaseTaxResult): """ Class to store taxonomic result of a single row from a gather CSV, including accessible query information (QueryInfo) and matched taxonomic lineage. TaxResult tracks whether @@ -1563,19 +1633,11 @@ class TaxResult: # get match lineage tax_res.get_match_lineage(taxD=taxonomic_assignments) - Uses RankLineageInfo to store lineage information; this may need to be modified in the future. + Use RankLineageInfo or LINLineageInfo to store lineage information. """ raw: GatherRow - keep_full_identifiers: bool = False - keep_identifier_versions: bool = False - query_name: str = field(init=False) query_info: QueryInfo = field(init=False) - match_ident: str = field(init=False) - skipped_ident: bool = False - missed_ident: bool = False - match_lineage_attempted: bool = False - lins: bool = False def __post_init__(self): self.get_ident() @@ -1598,35 +1660,6 @@ def __post_init__(self): else: self.lineageInfo = RankLineageInfo() - def get_ident(self): - # split identifiers = split on whitespace - # keep identifiers = don't split .[12] from assembly accessions - "Hack and slash identifiers." 
- self.match_ident = self.raw.name - if not self.keep_full_identifiers: - self.match_ident = self.raw.name.split(' ')[0] - else: - #overrides version bc can't keep full without keeping version - self.keep_identifier_versions = True - if not self.keep_identifier_versions: - self.match_ident = self.match_ident.split('.')[0] - - - def get_match_lineage(self, tax_assignments, skip_idents=None, fail_on_missing_taxonomy=False): - if skip_idents and self.match_ident in skip_idents: - self.skipped_ident = True - else: - lin = tax_assignments.get(self.match_ident) - if lin: - if self.lins: - self.lineageInfo = LINLineageInfo(lineage = lin) - else: - self.lineageInfo = RankLineageInfo(lineage = lin) - else: - self.missed_ident=True - self.match_lineage_attempted = True - if self.missed_ident and fail_on_missing_taxonomy: - raise ValueError(f"Error: ident '{self.match_ident}' is not in the taxonomy database. Failing, as requested via --fail-on-missing-taxonomy") @dataclass class SummarizedGatherResult: diff --git a/tests/test_tax.py b/tests/test_tax.py index b5314bac3c..8e230448c5 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2316,7 +2316,7 @@ def test_annotate_empty_gather_results(runtmp): with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'annotate', '-g', g_csv, '--taxonomy-csv', tax) - assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value) + assert f"Cannot read from '{g_csv}'. Is file empty?" 
in str(exc.value) assert runtmp.last_result.status == -1 @@ -2327,17 +2327,19 @@ def test_annotate_bad_gather_header(runtmp): bad_g_csv = runtmp.output('g.csv') #creates bad gather result - bad_g = [x.replace("query_name", "nope") for x in open(g_csv, 'r')] + bad_g = [x.replace("name", "nope") for x in open(g_csv, 'r')] with open(bad_g_csv, 'w') as fp: for line in bad_g: fp.write(line) - print("bad_gather_results: \n", bad_g) + # print("bad_gather_results: \n", bad_g) with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) - assert 'is missing columns needed for taxonomic summarization.' in str(exc.value) + assert f"ERROR: Cannot find taxonomic identifier column in '{bad_g_csv}'. Tried: name, match_name, ident, accession" in str(exc.value) assert runtmp.last_result.status == -1 + print(runtmp.last_result.out) + print(runtmp.last_result.err) def test_annotate_empty_tax_lineage_input(runtmp): From 4831ffe83884f9058df2bbc2b20ac2aa25c38976 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 12:28:45 -0800 Subject: [PATCH 63/78] add tests --- src/sourmash/tax/__main__.py | 8 ++-- src/sourmash/tax/tax_utils.py | 2 + tests/test_tax.py | 76 ++++++++++++++++++++++++++++++++++- tests/test_tax_utils.py | 64 ++++++++++++++++++++++++++++- 4 files changed, 144 insertions(+), 6 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index cc9bf6a279..a3a9bfe29b 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -314,7 +314,6 @@ def annotate(args): raise ValueError(f"Cannot read from '{in_csv}'. Is file empty?") # look for the column to match with taxonomic identifier - # search: 'name'; prefetch: 'match_name'; gather: 'name'. 
Also allow: 'ident', 'accession' id_col = None col_options = ['name', 'match_name', 'ident', 'accession'] for colname in col_options: @@ -346,14 +345,15 @@ def annotate(args): keep_identifier_versions=args.keep_identifier_versions) taxres.get_match_lineage(tax_assignments=tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) - if taxres.missed_ident: + if taxres.missed_ident: # could not assign taxonomy n_missed+=1 w.writerow(taxres.row_with_lineages()) - if not n: + rows_annotated = (n+1) - n_missed + if not rows_annotated: raise ValueError(f"Could not annotate any rows from '{in_csv}'.") else: - notify(f"Annotated {(n+1) - n_missed} of {n+1} total rows from '{in_csv}'.") + notify(f"Annotated {rows_annotated} of {n+1} total rows from '{in_csv}'.") except ValueError as exc: if args.force: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 42bf09de95..8eaf40f931 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1597,6 +1597,8 @@ class AnnotateTaxResult(BaseTaxResult): id_col: str = 'name' def __post_init__(self): + if self.id_col not in self.raw.keys(): + raise ValueError(f"ID column '{self.id_col}' not found.") self.get_ident(id_col=self.id_col) if self.lins: self.lineageInfo = LINLineageInfo() diff --git a/tests/test_tax.py b/tests/test_tax.py index 8e230448c5..a6317ca5ac 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2320,7 +2320,28 @@ def test_annotate_empty_gather_results(runtmp): assert runtmp.last_result.status == -1 -def test_annotate_bad_gather_header(runtmp): +def test_annotate_prefetch_or_other_header(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + alt_csv = runtmp.output('g.csv') + for alt_col in ['match_name', 'ident', 'accession']: + #modify 'name' to other acceptable id_columns result + alt_g = [x.replace("name", alt_col) for x in open(g_csv, 'r')] + with open(alt_csv, 'w') as fp: + for line 
in alt_g: + fp.write(line) + + runtmp.run_sourmash('tax', 'annotate', '-g', alt_csv, '--taxonomy-csv', tax) + + assert runtmp.last_result.status == 0 + print(runtmp.last_result.out) + print(runtmp.last_result.err) + assert f"Starting annotation on '{alt_csv}'. Using ID column: '{alt_col}'" in runtmp.last_result.err + assert f"Annotated 4 of 4 total rows from '{alt_csv}'" in runtmp.last_result.err + + +def test_annotate_bad_header(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -2342,6 +2363,59 @@ def test_annotate_bad_gather_header(runtmp): print(runtmp.last_result.err) +def test_annotate_no_tax_matches(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + #mess up tax idents + bad_g = [x.replace("GCF_", "GGG_") for x in open(g_csv, 'r')] + with open(bad_g_csv, 'w') as fp: + for line in bad_g: + fp.write(line) + # print("bad_gather_results: \n", bad_g) + + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + + assert f"ERROR: Could not annotate any rows from '{bad_g_csv}'" in str(exc.value) + assert runtmp.last_result.status == -1 + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax, '--force') + + assert runtmp.last_result.status == 0 + assert f"Could not annotate any rows from '{bad_g_csv}'" in runtmp.last_result.err + assert f"--force is set. Attempting to continue to next file." 
in runtmp.last_result.err + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + +def test_annotate_missed_tax_matches(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + with open(g_csv, 'r') as gather_lines, open(bad_g_csv, 'w') as fp: + for n, line in enumerate(gather_lines): + if n > 2: + # mess up tax idents of lines 3, 4 + line = line.replace("GCF_", "GGG_") + fp.write(line) + # print("bad_gather_results: \n", bad_g) + + runtmp.run_sourmash('tax', 'annotate', '-g', bad_g_csv, '--taxonomy-csv', tax) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert f"Annotated 2 of 4 total rows from '{bad_g_csv}'." in runtmp.last_result.err + + def test_annotate_empty_tax_lineage_input(runtmp): tax_empty = runtmp.output('t.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 412340ae37..92feb7f2ba 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -13,7 +13,7 @@ from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, collect_gather_csvs, check_and_load_gather_csvs, LineagePair, QueryInfo, GatherRow, TaxResult, QueryTaxResult, - SummarizedGatherResult, ClassificationResult, + SummarizedGatherResult, ClassificationResult, AnnotateTaxResult, BaseLineageInfo, RankLineageInfo, LINLineageInfo, aggregate_by_lineage_at_rank, format_for_krona, write_krona, write_lineage_sample_frac, read_lingroups, @@ -367,6 +367,37 @@ def test_TaxResult_get_ident_default(): assert taxres.match_ident == "GCF_001881345" +def test_AnnotateTaxResult_get_ident_default(): + gA = {"name": "GCF_001881345.1"} # gather result with match name as GCF_001881345.1 + taxres = AnnotateTaxResult(raw=gA) + print(taxres.match_ident) + assert taxres.match_ident == "GCF_001881345" + + +def 
test_AnnotateTaxResult_get_ident_idcol(): + gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + taxres = AnnotateTaxResult(raw=gA) + print(taxres.match_ident) + assert taxres.match_ident == "n1" + taxres = AnnotateTaxResult(raw=gA, id_col="match_name") + print(taxres.match_ident) + assert taxres.match_ident == "n2" + taxres = AnnotateTaxResult(raw=gA, id_col="ident") + print(taxres.match_ident) + assert taxres.match_ident == "n3" + taxres = AnnotateTaxResult(raw=gA, id_col="accession") + print(taxres.match_ident) + assert taxres.match_ident == "n4" + + +def test_AnnotateTaxResult_get_ident_idcol_fail(): + gA = {"name": "n1", "match_name": "n2", "ident": "n3", "accession": "n4"} # gather result with match name as GCF_001881345.1 + with pytest.raises(ValueError) as exc: + AnnotateTaxResult(raw=gA, id_col="NotACol") + print(str(exc)) + assert "ID column 'NotACol' not found." in str(exc) + + def test_get_ident_split_but_keep_version(): ident = "GCF_001881345.1 secondname" n_id = get_ident(ident, keep_identifier_versions=True) @@ -383,6 +414,16 @@ def test_TaxResult_get_ident_split_but_keep_version(): assert taxres.match_ident == "GCF_001881345.1" +def test_AnnotateTaxResult_get_ident_split_but_keep_version(): + gA = {"name": "GCF_001881345.1 secondname"} + taxres = AnnotateTaxResult(gA, keep_identifier_versions=True) + print("raw ident: ", taxres.raw['name']) + print("keep_full?: ", taxres.keep_full_identifiers) + print("keep_version?: ",taxres.keep_identifier_versions) + print("final ident: ", taxres.match_ident) + assert taxres.match_ident == "GCF_001881345.1" + + def test_get_ident_no_split(): ident = "GCF_001881345.1 secondname" n_id = get_ident(ident, keep_full_identifiers=True) @@ -399,6 +440,16 @@ def test_TaxResult_get_ident_keep_full(): assert taxres.match_ident == "GCF_001881345.1 secondname" +def test_AnnotateTaxResult_get_ident_keep_full(): + gA = {"name": "GCF_001881345.1 
secondname"} + taxres = AnnotateTaxResult(gA, keep_full_identifiers=True) + print("raw ident: ", taxres.raw['name']) + print("keep_full?: ", taxres.keep_full_identifiers) + print("keep_version?: ",taxres.keep_identifier_versions) + print("final ident: ", taxres.match_ident) + assert taxres.match_ident == "GCF_001881345.1 secondname" + + def test_collect_gather_csvs(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') from_file = runtmp.output("tmp-from-file.txt") @@ -1702,6 +1753,17 @@ def test_TaxResult_get_match_lineage_1(): assert taxres.lineageInfo.display_lineage() == "a;b;c" +def test_AnnotateTaxResult_get_match_lineage_1(): + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + gA = {"name": "gA.1 name"} + taxres = AnnotateTaxResult(gA) + taxres.get_match_lineage(tax_assignments=taxD) + assert taxres.lineageInfo.display_lineage() == "a;b;c" + assert taxres.row_with_lineages() == {"name": "gA.1 name", "lineage": "a;b;c"} + + def test_TaxResult_get_match_lineage_skip_ident(): gA_tax = ("gA", "a;b;c") taxD = make_mini_taxonomy([gA_tax]) From eca0f873f8aa818e53358087aeed79a1ce78e528 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 15:04:36 -0800 Subject: [PATCH 64/78] add tests; output bioboxes from tax metagenome --- src/sourmash/cli/tax/metagenome.py | 4 +- src/sourmash/cli/utils.py | 8 ++- src/sourmash/tax/__main__.py | 11 +++- src/sourmash/tax/tax_utils.py | 62 +++++++++++++------ tests/test_tax.py | 62 +++++++++++++++++++ tests/test_tax_utils.py | 98 +++++++++++++++++++++++++++++- 6 files changed, 221 insertions(+), 24 deletions(-) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index cbcca18fad..40aa62adfe 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -69,7 +69,7 @@ def subparser(subparsers): ) subparser.add_argument( '-F', '--output-format', default=[], nargs='*', action="extend", - choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup"], + choices=["human", "csv_summary", "krona", "lineage_summary", "kreport", "lingroup", "bioboxes"], help='choose output format(s)', ) subparser.add_argument( @@ -93,7 +93,7 @@ def main(args): raise ValueError(f"No gather CSVs found! 
Please input via '-g' or '--from-file'.") if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary']) + args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport']) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 4e1e2d9ff6..f54b85f4f7 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -157,15 +157,19 @@ def add_rank_arg(parser): Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" ) -def check_tax_outputs(args, rank_required = ["krona"]): +def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None): "Handle ouput format combinations" # check that rank is passed for formats requiring rank. if not args.rank: if any(x in rank_required for x in args.output_format): raise ValueError(f"Rank (--rank) is required for {', '.join(rank_required)} output formats.") - # check that '--lins' is specified and '--lingroup' file exists if needed if args.lins: + # check for outputs incompatible with lins + if incompatible_with_lins: + if any(x in args.output_format for x in incompatible_with_lins): + raise ValueError(f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}") + # check that lingroup file exists if needed if args.lingroup: if "lingroup" not in args.output_format: args.output_format.append("lingroup") diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index dc01abaded..d69454162f 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -42,7 +42,8 @@ 'human': '.human.txt', 'lineage_csv': '.lineage.csv', 'kreport': ".kreport.txt", - 'lingroup': ".lingroup.tsv" + 'lingroup': ".lingroup.tsv", + 'bioboxes': '.bioboxes.profile' } def 
make_outfile(base, output_type, *, output_dir = ""): @@ -181,6 +182,14 @@ def metagenome(args): header, lgreport_results = single_query_results.make_lingroup_results(LINgroupsD = lingroups) tax_utils.write_output(header, lgreport_results, out_fp, sep="\t", write_header=True) + # write cami bioboxes format + if "bioboxes" in args.output_format: + bbfile, limit_float = make_outfile(args.output_base, "bioboxes", output_dir=args.output_dir) + + with FileOutputCSV(bbfile) as out_fp: + header_lines, bb_results = single_query_results.make_cami_bioboxes() + tax_utils.write_bioboxes(header_lines, bb_results, out_fp, sep="\t") + def genome(args): """ diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index f435f16a4b..665d47d99a 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -19,7 +19,7 @@ __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs' 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' - 'format_for_krona', + 'format_for_krona', 'write_output', 'write_bioboxes', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', 'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo'] @@ -856,7 +856,19 @@ def write_output(header, results, out_fp, *, sep=',', write_header=True): if write_header: output.writeheader() for res in results: - output.writerow(res) + output.writerow(res) + + +def write_bioboxes(header_lines, results, out_fp, *, sep='\t'): + """ + write pre-generated results list of rows, with each + row being list. 
+ """ + for inf in header_lines: + out_fp.write(inf + '\n') + for res in results: + res = sep.join(res) + '\n' + out_fp.write(res) def write_summary(query_gather_results, csv_fp, *, sep=',', limit_float_decimals=False, classification=False): @@ -1707,6 +1719,7 @@ def as_kreport_dict(self, query_info): lowest_assignment_rank = 'species' sD = {} sD['num_bp_assigned'] = str(0) + sD['ncbi_taxid'] = None # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) @@ -1716,7 +1729,9 @@ def as_kreport_dict(self, query_info): this_rank = self.lineage.lowest_rank sD['rank_code'] = RANKCODE[this_rank] sD['sci_name'] = self.lineage.lowest_lineage_name - sD['ncbi_taxid'] = self.lineage.lowest_lineage_taxid + taxid = self.lineage.lowest_lineage_taxid + if taxid: + sD['ncbi_taxid'] = str(taxid) # the number of bp actually 'assigned' at this rank. Sourmash assigns everything # at genome level, but since kreport traditionally doesn't include 'strain' or genome, # it is reasonable to state that sourmash assigns at 'species' level for this. @@ -1741,20 +1756,26 @@ def as_lingroup_dict(self, query_info, lg_name): sD["name"] = lg_name return sD -def as_cami_bioboxes(self): - ''' + def as_cami_bioboxes(self): + """ Format taxonomy-summarized gather results as CAMI profiling Bioboxes format. Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE - - ''' - # if this is filled (should always be true here, right? So don't actually need to check this?) 
- taxid = self.lineage.lowest_lineage_taxid - taxpath = self.lineage.display_taxid(sep="|") - taxpathsn = self.lineage.display_lineage(sep="|") - percentage = f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points - return [taxid, self.rank, taxpath, taxpathsn, percentage] + """ + if isinstance(self.lineage, LINLineageInfo): + raise ValueError("Cannot produce 'cami' results with LIN taxonomy.") + if self.lineage != RankLineageInfo(): # if not unassigned + taxid = self.lineage.lowest_lineage_taxid + if taxid: + taxpath = self.lineage.display_taxid(sep="|") + taxid = str(taxid) + else: + taxpath = None + taxpathsn = self.lineage.display_lineage(sep="|") + percentage = f"{(self.f_weighted_at_rank * 100):.2f}" # fix at 2 decimal points + return [taxid, self.rank, taxpath, taxpathsn, percentage] + return [] @dataclass @@ -2189,7 +2210,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref return header, lingroup_results - def make_cami_bioboxes(self): + def make_cami_bioboxes(self): """ info: https://github.com/CAMI-challenge/contest_information/blob/master/file_formats/CAMI_TP_specification.mkd @@ -2224,7 +2245,7 @@ def make_cami_bioboxes(self): # see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py # starting from https://github.com/sourmash-bio/sourmash/pull/1606/files - cami_results = [] + bioboxes_results = [] # build CAMI header info header_title = "# Taxonomic Profiling Output" version_info = "@Version:0.10.0" @@ -2235,15 +2256,20 @@ def make_cami_bioboxes(self): # if 'strain' in ranks: # ranks.remove('strain') rank_info = f"@Ranks:{'|'.join(ranks)}" + header_lines = [header_title, sample_info, version_info, rank_info, program] + header_lines.append("") # blank line + colnames = ["@@TAXID","RANK","TAXPATH","TAXPATHSN","PERCENTAGE"] + header_lines.append('\t'.join(colnames)) # now build results in CAMI format # order results by rank (descending), then percentage for rank in ranks: rank_results 
= self.summarized_lineage_results[rank] for res in rank_results: - cami_info = res.as_cami_bioboxes() - cami_results.append(cami_info) + bb_info = res.as_cami_bioboxes() + if bb_info: + bioboxes_results.append(bb_info) - return header_lines, cami_results + return header_lines, bioboxes_results diff --git a/tests/test_tax.py b/tests/test_tax.py index b5314bac3c..708f6104d6 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -295,6 +295,68 @@ def test_metagenome_kreport_out_fail(runtmp): assert "ERROR: cannot produce 'kreport' format from gather results before sourmash v4.5.0" in runtmp.last_result.err +def test_metagenome_bioboxes_stdout(runtmp): + # test CAMI bioboxes format output + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + + runtmp.run_sourmash('tax', 'metagenome', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-F', "bioboxes") + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + + assert "# Taxonomic Profiling Output" in runtmp.last_result.out + assert "@SampleID:test1" in runtmp.last_result.out + assert "@Version:0.10.0" in runtmp.last_result.out + assert "@Ranks:superkingdom|phylum|class|order|family|genus|species|strain" in runtmp.last_result.out + assert "@__program__:sourmash" in runtmp.last_result.out + assert "2 superkingdom 2 Bacteria 13.08" in runtmp.last_result.out + assert "976 phylum 2|976 Bacteria|Bacteroidota 7.27" in runtmp.last_result.out + assert "1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82" in runtmp.last_result.out + assert "200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27" in runtmp.last_result.out + assert "1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82" in runtmp.last_result.out + assert "171549 order 2|976|200643|171549 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27" in runtmp.last_result.out + assert "91347 order 
2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82" in runtmp.last_result.out + assert "171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70" in runtmp.last_result.out + assert "543 family 2|1224|1236|91347|543 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82" in runtmp.last_result.out + assert "815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56" in runtmp.last_result.out + assert "838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70" in runtmp.last_result.out + assert "561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82" in runtmp.last_result.out + assert "909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56" in runtmp.last_result.out + assert "165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70" in runtmp.last_result.out + assert "562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82" in runtmp.last_result.out + assert "821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56" in runtmp.last_result.out + + +def test_metagenome_bioboxes_outfile(runtmp): + # test CAMI bioboxes format output + g_csv = utils.get_test_data('tax/test1.gather.v450.csv') + tax = utils.get_test_data('tax/test.ncbi-taxonomy.csv') + csv_base = "out" + sum_csv = csv_base + ".bioboxes.profile" + csvout = runtmp.output(sum_csv) + outdir = os.path.dirname(csvout) + + runtmp.run_sourmash('tax', 'metagenome', 
'--gather-csv', g_csv, '--taxonomy-csv', tax, '-F', "bioboxes", '-o', csv_base, '--output-dir', outdir,) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + + bb_results = [x.rstrip().split('\t') for x in open(csvout)] + assert f"saving 'bioboxes' output to '{csvout}'" in runtmp.last_result.err + print(bb_results) + assert ['# Taxonomic Profiling Output'] == bb_results[0] + assert ['@SampleID:test1'] == bb_results[1] + assert ['2', 'superkingdom', '2', 'Bacteria', '13.08'] == bb_results[7] + assert ['838', 'genus', '2|976|200643|171549|171552|838', 'Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella', '5.70'] == bb_results[17] + + def test_metagenome_krona_tsv_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 412340ae37..555d6b8e06 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -31,6 +31,24 @@ def make_mini_taxonomy(tax_info, LIN=False): taxD[name] = lineage.filled_lineage return taxD +def make_mini_taxonomy_with_taxids(tax_info, LIN=False): + taxD = {} + for (name, lin, taxids) in tax_info: + if LIN: + lineage = LINLineageInfo(lineage_str=lin) + else: + ranks = RankLineageInfo.ranks + txs = taxids.split(';') + lns = lin.split(';') + lineage_tups = [] + for n, taxname in enumerate(lns): + rk = ranks[n] + tx = txs[n] + this_lineage = LineagePair(rk, name=taxname, taxid=tx) + lineage_tups.append(this_lineage) + lineage = RankLineageInfo(lineage=lineage_tups) + taxD[name] = lineage.filled_lineage + return taxD def make_GatherRow(gather_dict=None, exclude_cols=[]): """Load artificial gather row (dict) into GatherRow class""" @@ -159,6 +177,41 @@ def test_SummarizedGatherResult(): print(lD) assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', 'family': '', 
'genus': '', 'species': '', 'strain': ''} + cami = sgr.as_cami_bioboxes() + print(cami) + assert cami == [None, 'phylum', None, 'a|b', '30.00'] + + +def test_SummarizedGatherResult_withtaxids(): + "basic functionality of SummarizedGatherResult dataclass" + qInf = QueryInfo(query_name='q1', query_md5='md5', query_filename='f1',query_bp='100', + query_n_hashes='10',ksize='31',scaled='10', total_weighted_hashes='200') + lin = [LineagePair(rank='superkingdom', name='a', taxid='1'), LineagePair(rank='phylum', name='b', taxid=2)] + sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=RankLineageInfo(lineage=lin), + f_weighted_at_rank=0.3, bp_match_at_rank=30) + print(sgr) + assert sgr.rank=='phylum' + sumD = sgr.as_summary_dict(query_info=qInf) + print(sumD) + assert sumD == {'rank': 'phylum', 'fraction': "0.2", 'lineage': 'a;b', 'f_weighted_at_rank': "0.3", + 'bp_match_at_rank': "30", 'query_ani_at_rank': None, 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + hD = sgr.as_human_friendly_dict(query_info=qInf) + print(hD) + assert hD == {'rank': 'phylum', 'fraction': '0.200', 'lineage': 'a;b', 'f_weighted_at_rank': '30.0%', + 'bp_match_at_rank': "30", 'query_ani_at_rank': '- ', 'query_name': 'q1', + 'query_md5': 'md5', 'query_filename': 'f1', 'total_weighted_hashes': "200"} + krD = sgr.as_kreport_dict(query_info=qInf) + print(krD) + assert krD == {'ncbi_taxid': '2', 'sci_name': 'b', 'rank_code': 'P', 'num_bp_assigned': "0", + 'percent_containment': '30.00', 'num_bp_contained': "600"} + lD = sgr.as_lineage_dict(ranks = RankLineageInfo().ranks, query_info=qInf) + print(lD) + assert lD == {'ident': 'q1', 'superkingdom': 'a', 'phylum': 'b', 'class': '', 'order': '', + 'family': '', 'genus': '', 'species': '', 'strain': ''} + cami = sgr.as_cami_bioboxes() + print(cami) + assert cami == ['2', 'phylum', '1|2', 'a|b', '30.00'] def test_SummarizedGatherResult_LINs(): @@ -2743,7 +2796,7 @@ def 
test_make_kreport_results(): assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': None}, {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', - 'sci_name': 'unclassified', 'rank_code': 'U'}, + 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': None}, {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', @@ -2758,6 +2811,32 @@ def test_make_kreport_results(): 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': None}] +def test_make_kreport_results_with_taxids(): + taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + print(taxD) + #need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + header, krepD = q_res.make_kreport_results() + print(krepD) + assert krepD == [{'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'D', 'sci_name': 'a', 'ncbi_taxid': '1'}, + {'num_bp_assigned': '60', 'percent_containment': '60.00', 'num_bp_contained': '60', + 'sci_name': 'unclassified', 'rank_code': 'U', 'ncbi_taxid': None}, + {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'P', 'sci_name': 'b', 'ncbi_taxid': '2'}, + {'num_bp_assigned': '0', 'percent_containment': '40.00', 'num_bp_contained': '40', + 'rank_code': 'C', 'sci_name': 'c', 'ncbi_taxid': '3'}, + {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'O', 'sci_name': 'd', 'ncbi_taxid': '4'}, + 
{'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'F', 'sci_name': 'e', 'ncbi_taxid': '5'}, + {'num_bp_assigned': '0', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'G', 'sci_name': 'f', 'ncbi_taxid': '6'}, + {'num_bp_assigned': '20', 'percent_containment': '20.00', 'num_bp_contained': '20', + 'rank_code': 'S', 'sci_name': 'g', 'ncbi_taxid': '7'}] + + def test_make_kreport_results_fail(): taxD = make_mini_taxonomy([("gA", "a;b;c"), ("gB", "a;b;d")]) gather_results = [{}, {"name": 'gB'}] @@ -2778,6 +2857,23 @@ def test_make_kreport_results_fail_pre_v450(): assert "cannot produce 'kreport' format from gather results before sourmash v4.5.0" in str(exc) +def test_make_cami_results_with_taxids(): + taxD = make_mini_taxonomy_with_taxids([("gA", "a;b;c", "1;2;3"), ("gB", "a;b;c;d;e;f;g", "1;2;3;4;5;6;7")]) + print(taxD) + #need to go down to species to check that `num_bp_assigned` is happening correctly + gather_results = [{"total_weighted_hashes":100}, {"name": 'gB', "total_weighted_hashes":100}] + q_res = make_QueryTaxResults(gather_info=gather_results, taxD=taxD, single_query=True, summarize=True) + header, camires = q_res.make_cami_bioboxes() + print(camires) + assert camires == [['1', 'superkingdom', '1', 'a', '40.00'], + ['2', 'phylum', '1|2', 'a|b', '40.00'], + ['3', 'class', '1|2|3', 'a|b|c', '40.00'], + ['4', 'order', '1|2|3|4', 'a|b|c|d', '20.00'], + ['5', 'family', '1|2|3|4|5', 'a|b|c|d|e', '20.00'], + ['6', 'genus', '1|2|3|4|5|6', 'a|b|c|d|e|f', '20.00'], + ['7', 'species', '1|2|3|4|5|6|7', 'a|b|c|d|e|f|g', '20.00']] + + def test_make_lingroup_results(): taxD = make_mini_taxonomy([("gA", "1;0;0"), ("gB", "1;0;1"), ("gC", "1;1;0")], LIN=True) print(taxD) From d59fdc77df2cea0b6bb06a23c1a6e7cae3345247 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 15:07:52 -0800 Subject: [PATCH 65/78] rm blank --- src/sourmash/tax/tax_utils.py | 1 - tests/test_tax.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 665d47d99a..c4d871fdfd 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -2258,7 +2258,6 @@ def make_cami_bioboxes(self): rank_info = f"@Ranks:{'|'.join(ranks)}" header_lines = [header_title, sample_info, version_info, rank_info, program] - header_lines.append("") # blank line colnames = ["@@TAXID","RANK","TAXPATH","TAXPATHSN","PERCENTAGE"] header_lines.append('\t'.join(colnames)) diff --git a/tests/test_tax.py b/tests/test_tax.py index 708f6104d6..53678d986c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -353,8 +353,8 @@ def test_metagenome_bioboxes_outfile(runtmp): print(bb_results) assert ['# Taxonomic Profiling Output'] == bb_results[0] assert ['@SampleID:test1'] == bb_results[1] - assert ['2', 'superkingdom', '2', 'Bacteria', '13.08'] == bb_results[7] - assert ['838', 'genus', '2|976|200643|171549|171552|838', 'Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella', '5.70'] == bb_results[17] + assert ['2', 'superkingdom', '2', 'Bacteria', '13.08'] == bb_results[6] + assert ['838', 'genus', '2|976|200643|171549|171552|838', 'Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella', '5.70'] == bb_results[16] def test_metagenome_krona_tsv_out(runtmp): From 78fbbe0452cc74837ca1c707a61ab9931be94daf Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 15:16:37 -0800 Subject: [PATCH 66/78] rm comment --- src/sourmash/tax/tax_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index c4d871fdfd..6f1202dc55 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -2242,10 +2242,6 @@ def make_cami_bioboxes(self): 204455 order 2|1224|28211|204455 Bacteria|Proteobacteria|Alphaproteobacteria|Rhodobacterales 8.42263 2158 order 2157|28890|183925|2158 Archaea|Euryarchaeotes|Methanobacteria|Methanobacteriales 1.18789 """ - # see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py - - # starting from https://github.com/sourmash-bio/sourmash/pull/1606/files - bioboxes_results = [] # build CAMI header info header_title = "# Taxonomic Profiling Output" version_info = "@Version:0.10.0" @@ -2262,6 +2258,7 @@ def make_cami_bioboxes(self): header_lines.append('\t'.join(colnames)) # now build results in CAMI format + bioboxes_results = [] # order results by rank (descending), then percentage for rank in ranks: rank_results = self.summarized_lineage_results[rank] From 512f237198ab4854c9a3d68341c4094cff3757c1 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 15:21:19 -0800 Subject: [PATCH 67/78] add bioboxes lins test --- src/sourmash/tax/tax_utils.py | 2 +- tests/test_tax_utils.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6f1202dc55..eacc7844e2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1764,7 +1764,7 @@ def as_cami_bioboxes(self): Columns are: TAXID RANK TAXPATH TAXPATHSN PERCENTAGE """ if isinstance(self.lineage, LINLineageInfo): - raise ValueError("Cannot produce 'cami' results with LIN taxonomy.") + raise ValueError("Cannot produce 'bioboxes' with LIN taxonomy.") if self.lineage != RankLineageInfo(): # if not unassigned taxid = self.lineage.lowest_lineage_taxid if taxid: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 555d6b8e06..bec97ad13a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -233,6 +233,10 @@ def test_SummarizedGatherResult_LINs(): sgr.as_kreport_dict(query_info=qInf) print(str(exc)) assert "Cannot produce 'kreport' with LIN taxonomy." in str(exc) + with pytest.raises(ValueError) as exc: + sgr.as_cami_bioboxes() + print(str(exc)) + assert "Cannot produce 'bioboxes' with LIN taxonomy." in str(exc) def test_SummarizedGatherResult_set_query_ani(): From 22ac224dd8b2a5ed21ad43a6d47b5b0032881e63 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 15:51:41 -0800 Subject: [PATCH 68/78] add bioboxes to docs --- doc/command-line.md | 48 ++++++++++++++++++++++++++++++ src/sourmash/cli/tax/metagenome.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/doc/command-line.md b/doc/command-line.md index e98e337d8b..654d5a63c2 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -737,6 +737,54 @@ lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 Related lingroup subpaths will be grouped in output, but exact ordering may change between runs. 
+#### `bioboxes` output format + +When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profilie, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output fits the [bioboxes profile](https://github.com/bioboxes/rfc/tree/master/data-format) which can be used as input for CAMI challenges. + +This output format starts with some header information: +``` +#CAMI Submission for Taxonomic Profiling +@Version:0.9.3 +@SampleID:SAMPLEID +@Ranks:superkingdom|phylum|class|order|family|genus|species|strain +@__program__:sourmash +@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE +``` +and then provides taxonomic profiling information in the tab-separated columns described by the last header line: + +- `TAXID` - specifies a unique alphanumeric ID for a node in a reference tree such as the NCBI taxonomy +- `RANK` - superkingdom --> strain +- `TAXPATH` - the path from the root of the reference taxonomy to the respective taxon +- `TAXPATHSN` - scientific names of taxpath +- `PERCENTAGE` (0-100) - field specifies what percentage of the sample was assigned to the respective TAXID + +example output (using small test data): +``` +# Taxonomic Profiling Output +@SampleID:test1 +@Version:0.10.0 +@Ranks:superkingdom|phylum|class|order|family|genus|species +@__program__:sourmash +@@TAXID RANK TAXPATH TAXPATHSN PERCENTAGE +2 superkingdom 2 Bacteria 13.08 +976 phylum 2|976 Bacteria|Bacteroidota 7.27 +1224 phylum 2|1224 Bacteria|Pseudomonadota 5.82 +200643 class 2|976|200643 Bacteria|Bacteroidota|Bacteroidia 7.27 +1236 class 2|1224|1236 Bacteria|Pseudomonadota|Gammaproteobacteria 5.82 +171549 order 2|976|200643|171549 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales 7.27 +91347 order 2|1224|1236|91347 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales 5.82 +171552 family 2|976|200643|171549|171552 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae 5.70 +543 family 2|1224|1236|91347|543 
Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 5.82 +815 family 2|976|200643|171549|815 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae 1.56 +838 genus 2|976|200643|171549|171552|838 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella 5.70 +561 genus 2|1224|1236|91347|543|561 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 5.82 +909656 genus 2|976|200643|171549|815|909656 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola 1.56 +165179 species 2|976|200643|171549|171552|838|165179 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Prevotellaceae|Prevotella|Prevotella copri 5.70 +562 species 2|1224|1236|91347|543|561|562 Bacteria|Pseudomonadota|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia|Escherichia coli 5.82 +821 species 2|976|200643|171549|815|909656|821 Bacteria|Bacteroidota|Bacteroidia|Bacteroidales|Bacteroidaceae|Phocaeicola|Phocaeicola vulgatus 1.56 +``` + + ### `sourmash tax genome` - classify a genome using `gather` results `sourmash tax genome` reports likely classification for each query, diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index 40aa62adfe..423b008d33 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -93,7 +93,7 @@ def main(args): raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport']) + args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport']) except ValueError as exc: error(f"ERROR: {str(exc)}") From 10c67e51735d3bab9efe3243fa72d81cde5f69d7 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 16:00:56 -0800 Subject: [PATCH 69/78] fix typo --- doc/command-line.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/command-line.md b/doc/command-line.md index 654d5a63c2..79c29353e4 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -739,7 +739,7 @@ Related lingroup subpaths will be grouped in output, but exact ordering may chan #### `bioboxes` output format -When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profilie, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output fits the [bioboxes profile](https://github.com/bioboxes/rfc/tree/master/data-format) which can be used as input for CAMI challenges. +When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges. This output format starts with some header information: ``` From 2a2856991e452399b8cf92f96b921840c232007e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 16:14:39 -0800 Subject: [PATCH 70/78] one more cli test --- tests/test_tax.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 53678d986c..053a6adb62 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3660,7 +3660,12 @@ def test_metagenome_LIN_lingroups_bad_cli_inputs(runtmp): c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lingroup', lg_file) print(c.last_result.err) assert c.last_result.status != 0 - assert "Must enable LIN taxonomy via '--lins' in order to use lingroups." 
in c.last_result.err + + with pytest.raises(SourmashCommandFailed): + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '-F', 'bioboxes') + print(c.last_result.err) + assert c.last_result.status != 0 + assert "ERROR: The following outputs are incompatible with '--lins': : bioboxes, kreport" in c.last_result.err def test_metagenome_mult_outputs_stdout_fail(runtmp): From aaadb3faa89bf3c6fe886480958d5d21c2a58d0b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 16:58:13 -0800 Subject: [PATCH 71/78] init genome lins --- src/sourmash/cli/tax/genome.py | 32 +++++++++++------------ src/sourmash/tax/__main__.py | 4 +-- tests/test_tax.py | 47 ++++++++++++++++++++++++++++++++-- 3 files changed, 63 insertions(+), 20 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 555f812a25..fd6793c2b7 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -34,7 +34,7 @@ import argparse import sourmash from sourmash.logging import notify, print_results, error -from sourmash.cli.utils import add_tax_threshold_arg +from sourmash.cli.utils import add_tax_threshold_arg, check_rank, check_tax_outputs, add_rank_arg def subparser(subparsers): subparser = subparsers.add_parser('genome', @@ -65,10 +65,6 @@ def subparser(subparsers): '--output-dir', default= "", help='directory for output files' ) - subparser.add_argument( - '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], - help='Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.' 
- ) subparser.add_argument( '--keep-full-identifiers', action='store_true', help='do not split identifiers on whitespace' @@ -90,25 +86,29 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past survivable errors in loading taxonomy database or gather results', ) + subparser.add_argument( + '--lins', '--lin-taxonomy', action='store_true', default=False, + help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information." + ) + subparser.add_argument( + '--lingroup', '--lingroups', metavar='FILE', default=None, + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will classify to these groups." + ) add_tax_threshold_arg(subparser, 0.1) + add_rank_arg(subparser) def main(args): import sourmash try: + if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") - # handle output formats - print(args.output_format) - if not args.rank: - if any(x in ["krona"] for x in args.output_format): - raise ValueError(f"Rank (--rank) is required for krona output format.") - if len(args.output_format) > 1: - if args.output_base == "-": - raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") - elif not args.output_format: - # change to "human" for 5.0 - args.output_format = ["csv_summary"] + if args.rank: + args.rank = check_rank(args) + + # args.lingroup=None # if we don't have lingroup arg, above, need this. 
+ args.output_format = check_tax_outputs(args, rank_required = ['krona']) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index dc01abaded..1baf76a445 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -193,7 +193,7 @@ def genome(args): tax_assign = MultiLineageDB.load(args.taxonomy_csv, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, - force=args.force) + force=args.force, lins=args.lins) available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") @@ -215,7 +215,7 @@ def genome(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - ) + lins=args.lins) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/tests/test_tax.py b/tests/test_tax.py index b5314bac3c..326b434789 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -443,7 +443,8 @@ def test_genome_no_rank_krona(runtmp): with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') - assert "Rank (--rank) is required for krona output format." in str(exc.value) + # assert "Rank (--rank) is required for krona output format." in str(exc.value) + assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value) def test_metagenome_rank_not_available(runtmp): @@ -1019,8 +1020,10 @@ def test_genome_empty_gather_results(runtmp): with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax) - assert f"Cannot read gather results from '{g_csv}'. Is file empty?" 
in str(exc.value) assert runtmp.last_result.status == -1 + print(runtmp.last_result.err) + print(runtmp.last_result.out) + assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value) def test_genome_bad_gather_header(runtmp): @@ -1916,6 +1919,7 @@ def test_genome_empty_gather_results_with_csv_force(runtmp): assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out + def test_genome_containment_threshold_bounds(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -2150,6 +2154,45 @@ def test_annotate_no_gather_csv(runtmp): print(runtmp.last_result.err) +def test_genome_LIN(runtmp): + # test basic genome with LIN taxonomy + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.93') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out + assert "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925" in c.last_result.out + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.924') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out + assert 
"test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--rank', '4') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out + assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out + + + def test_annotate_0(runtmp): # test annotate basics c = runtmp From 3ba7865a435a057d08d8d78ff77dd7511e7b6ce8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Wed, 8 Mar 2023 18:25:10 -0800 Subject: [PATCH 72/78] lingroups --- src/sourmash/cli/tax/metagenome.py | 2 +- src/sourmash/cli/utils.py | 4 +-- src/sourmash/tax/__main__.py | 10 +++++++- src/sourmash/tax/tax_utils.py | 41 ++++++++++++++++++++---------- tests/test_tax.py | 38 +++++++++++++++++++++++++++ tests/test_tax_utils.py | 2 +- 6 files changed, 79 insertions(+), 18 deletions(-) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index cbcca18fad..84b0f21be7 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -93,7 +93,7 @@ def main(args): raise ValueError(f"No gather CSVs found! 
Please input via '-g' or '--from-file'.") if args.rank: args.rank = check_rank(args) - args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary']) + args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], use_lingroup_format=True) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 4e1e2d9ff6..f7833e2f58 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -157,7 +157,7 @@ def add_rank_arg(parser): Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" ) -def check_tax_outputs(args, rank_required = ["krona"]): +def check_tax_outputs(args, rank_required = ["krona"], use_lingroup_format=False): "Handle ouput format combinations" # check that rank is passed for formats requiring rank. if not args.rank: @@ -167,7 +167,7 @@ def check_tax_outputs(args, rank_required = ["krona"]): # check that '--lins' is specified and '--lingroup' file exists if needed if args.lins: if args.lingroup: - if "lingroup" not in args.output_format: + if use_lingroup_format and "lingroup" not in args.output_format: args.output_format.append("lingroup") elif "lingroup" in args.output_format: raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.") diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 1baf76a445..5b2def9280 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -195,6 +195,13 @@ def genome(args): keep_identifier_versions=args.keep_identifier_versions, force=args.force, lins=args.lins) available_ranks = tax_assign.available_ranks + + lg_ranks=None + all_lgs=None + if args.lingroup: + lingroups = tax_utils.read_lingroups(args.lingroup) + lg_ranks, all_lgs = tax_utils.parse_lingroups(lingroups) + except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ 
-230,7 +237,8 @@ def genome(args): try: queryResult.build_classification_result(rank=args.rank, ani_threshold=args.ani_threshold, - containment_threshold=args.containment_threshold) + containment_threshold=args.containment_threshold, + lingroup_ranks=lg_ranks, lingroups=all_lgs) except ValueError as exc: error(f"ERROR: {str(exc)}") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6cb83f5eda..035165db06 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -19,7 +19,7 @@ __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs' 'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank' - 'format_for_krona', + 'format_for_krona', 'parse_lingroups', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', 'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo'] @@ -632,6 +632,20 @@ def read_lingroups(lingroup_csv): return lingroupD +def parse_lingroups(lingroupD): + # find the ranks we need to consider + all_lgs = set() + lg_ranks = set() + for lg_prefix in lingroupD.keys(): + # store lineage info for LCA pathfinding + lg_info = LINLineageInfo(lineage_str=lg_prefix) + all_lgs.add(lg_info) + # store rank so we only go through summarized results at these ranks + lg_rank = str(lg_info.lowest_rank) + lg_ranks.add(lg_rank) + return lg_ranks, all_lgs + + def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False, skip_idents = None, fail_on_missing_taxonomy=False, keep_full_identifiers=False, keep_identifier_versions=False, @@ -1944,7 +1958,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False): fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani) self.summarized_lineage_results[rank].append(sres) - def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False): + def 
build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False, lingroup_ranks=None, lingroups=None): if containment_threshold is not None and not 0 <= containment_threshold <= 1: raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).") if ani_threshold is not None and not 0 <= ani_threshold <= 1: @@ -1960,7 +1974,13 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}") else: self.classified_ranks = [rank] + if lingroup_ranks: + notify("Restricting classification to lingroups.") + self.classified_ranks = [x for x in self.classified_ranks if x in lingroup_ranks] + if not self.classified_ranks: + raise ValueError(f"Error: no ranks remain for classification.") # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold + classif = None for this_rank in self.classified_ranks: # ascending order or just single rank # reset for this rank f_weighted=0.0 @@ -1972,6 +1992,10 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment sorted_sum_uniq_to_query.sort(key = lambda x: -x[1]) # select best-at-rank only this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0] + # if in desired lineage groups, continue (or??) 
+ if lingroups and this_lineage not in lingroups: + # ignore this lineage and continue up + continue bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage] f_weighted = self.sum_uniq_weighted[this_rank][this_lineage] @@ -2134,23 +2158,14 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref header = ["name", "lin", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: - raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") + raise ValueError("ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0") # find the ranks we need to consider - all_lgs = set() - lg_ranks = set() - for lg_prefix in LINgroupsD.keys(): - # store lineage info for LCA pathfinding - lg_info = LINLineageInfo(lineage_str=lg_prefix) - all_lgs.add(lg_info) - # store rank so we only go through summarized results at these ranks - lg_rank = int(lg_info.lowest_rank) - lg_ranks.add(lg_rank) + lg_ranks, all_lgs = parse_lingroups(LINgroupsD) # grab summarized results matching LINgroup prefixes lg_results = {} for rank in lg_ranks: - rank = str(rank) rank_results = self.summarized_lineage_results[rank] for res in rank_results: if res.lineage in all_lgs:# is this lineage in the list of LINgroups? 
diff --git a/tests/test_tax.py b/tests/test_tax.py index 326b434789..84f79b58ea 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2192,6 +2192,44 @@ def test_genome_LIN(runtmp): assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out +def test_genome_LIN_lingroups(runtmp): + # test basic genome with LIN taxonomy + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.LIN-taxonomy.csv') + + lg_file = runtmp.output("test.lg.csv") + + with open(lg_file, 'w') as out: + out.write('lin,name\n') + out.write('0;0;0,lg1\n') + out.write('1;0;0,lg2\n') + out.write('2;0;0,lg3\n') + out.write('1;0;1,lg3\n') + # write a 19 so we can check the end + out.write('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out + assert "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file, '--ani-threshold', '0.924') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out + assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out + def test_annotate_0(runtmp): # test annotate basics diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 412340ae37..d01ce6f404 100644 --- 
a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2812,7 +2812,7 @@ def test_make_lingroup_results_fail_pre_v450(): with pytest.raises(ValueError) as exc: q_res.make_lingroup_results(lingroupD) print(str(exc)) - assert "cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0" in str(exc) + assert "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" in str(exc) def test_read_lingroups(runtmp): From 6ecef9cb7347af3dc99c817e1a78d56362591474 Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Thu, 9 Mar 2023 08:31:57 -0800 Subject: [PATCH 73/78] Update doc/command-line.md Co-authored-by: C. Titus Brown --- doc/command-line.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/command-line.md b/doc/command-line.md index 79c29353e4..9be4a68e2a 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -739,7 +739,7 @@ Related lingroup subpaths will be grouped in output, but exact ordering may chan #### `bioboxes` output format -When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o`,` --output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges. +When using standard taxonomic ranks (not lins), you can choose to output a 'bioboxes' profile, `{base}.bioboxes.profile`, where `{base}` is the name provided via the `-o/--output-base` option. This output is organized according to the [bioboxes profile specifications](https://github.com/bioboxes/rfc/tree/master/data-format) so that this file can be used for CAMI challenges. This output format starts with some header information: ``` From fb4d213ba9b1cecbc5bde71f41fe9709be9043b3 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce-Ward" Date: Thu, 9 Mar 2023 10:10:12 -0800 Subject: [PATCH 74/78] clean up --- src/sourmash/cli/tax/genome.py | 5 +---- src/sourmash/cli/utils.py | 5 ++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index fd6793c2b7..3f3ee41578 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -92,7 +92,7 @@ def subparser(subparsers): ) subparser.add_argument( '--lingroup', '--lingroups', metavar='FILE', default=None, - help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will classify to these groups." + help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups." ) add_tax_threshold_arg(subparser, 0.1) add_rank_arg(subparser) @@ -101,13 +101,10 @@ def subparser(subparsers): def main(args): import sourmash try: - if not args.gather_csv and not args.from_file: raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.") if args.rank: args.rank = check_rank(args) - - # args.lingroup=None # if we don't have lingroup arg, above, need this. args.output_format = check_tax_outputs(args, rank_required = ['krona']) except ValueError as exc: diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index f7833e2f58..7d86706ca1 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -139,7 +139,6 @@ def check_rank(args): standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] if args.lins: if args.rank.isdigit(): - #if isinstance(args.rank, int): return str(args.rank) raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. 
Rank must be an integer corresponding to a LIN position.") elif args.rank in standard_ranks: @@ -152,8 +151,8 @@ def add_rank_arg(parser): parser.add_argument( '-r', '--rank', '--position', '--lin-position', - help="For non-default output formats: Summarize genome taxonomy at this rank (or LIN position) and above. \ - Note that the taxonomy CSV must contain lineage information at this rank (or LIN position). \ + help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \ + Note that the taxonomy CSV must contain lineage information at this rank. \ Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" ) From fd60be66b25af7efc7e827b3077a50346111e073 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Mar 2023 10:16:23 -0800 Subject: [PATCH 75/78] add lins/lingroup to docs --- doc/command-line.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/command-line.md b/doc/command-line.md index e98e337d8b..b81d821f73 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -755,6 +755,13 @@ Related lingroup subpaths will be grouped in output, but exact ordering may chan Optionally, `genome` can instead report classifications at a desired `rank`, regardless of match threshold (`--rank` argument, e.g. `--rank species`). +If using `--lins` taxonomy, you can also provide a `--lingroup` file containing two +columns, `name`, and `lin`, which provide a series of lin prefixes of interest. +If provided, genome classification will be restricted to provided lingroups only. +All other options (`--rank`, `--ani-threshold`, etc) should continue to function. +However, use caution with `--rank` here, as if you specify a `--rank` that does +not have an associated lingroup, you eliminate all classification options. + Note that these thresholds and strategies are under active testing. 
To illustrate the utility of `genome`, let's consider a signature consisting From 521bc1124670f1c55e241d09711093e1af909722 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Mar 2023 10:39:51 -0800 Subject: [PATCH 76/78] less forbidding :) --- doc/command-line.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index b81d821f73..2ffc70bf2f 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -759,8 +759,8 @@ If using `--lins` taxonomy, you can also provide a `--lingroup` file containing columns, `name`, and `lin`, which provide a series of lin prefixes of interest. If provided, genome classification will be restricted to provided lingroups only. All other options (`--rank`, `--ani-threshold`, etc) should continue to function. -However, use caution with `--rank` here, as if you specify a `--rank` that does -not have an associated lingroup, you eliminate all classification options. +If you specify a `--rank` that does not have an associated lingroup, sourmash will +notify you that you eliminated all classification options. Note that these thresholds and strategies are under active testing. From a04a0606932c6a38fe2cb7e995075c340c61e694 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Mar 2023 10:45:00 -0800 Subject: [PATCH 77/78] rm irrelevant comment --- tests/test_tax.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 84f79b58ea..140d468afe 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -443,7 +443,6 @@ def test_genome_no_rank_krona(runtmp): with pytest.raises(SourmashCommandFailed) as exc: runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') - # assert "Rank (--rank) is required for krona output format." 
in str(exc.value) assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value) From e606a3094b593eb00bc0c58c5d1076c69e4daf70 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Thu, 9 Mar 2023 10:53:42 -0800 Subject: [PATCH 78/78] add note about lin positions starting at 0 --- src/sourmash/cli/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 7d86706ca1..779e02c6ca 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -152,7 +152,7 @@ def add_rank_arg(parser): '-r', '--rank', '--position', '--lin-position', help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \ - Note that the taxonomy CSV must contain lineage information at this rank. \ + Note that the taxonomy CSV must contain lineage information at this rank, and that LIN positions start at 0. \ Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position" )