From e35db9194b458298a16389394239888d6ae5457b Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce-Ward" Date: Fri, 3 Mar 2023 10:29:21 -0800 Subject: [PATCH] rm num_bp_assigned in LINgroup report bc doesnt mean anything --- src/sourmash/tax/tax_utils.py | 17 +++-------------- tests/test_tax.py | 14 +++++++------- tests/test_tax_utils.py | 16 ++++++++-------- 3 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 60f0b07950..2bd9050d40 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1729,7 +1729,7 @@ def as_kreport_dict(self, query_info): sD["num_bp_assigned"] = sD["num_bp_contained"] return sD - def as_lingroup_dict(self, query_info, lg_name, lowest_rank): + def as_lingroup_dict(self, query_info, lg_name): """ Produce LINgroup report dict for LINgroups. """ @@ -1737,13 +1737,6 @@ def as_lingroup_dict(self, query_info, lg_name, lowest_rank): # total percent containment, weighted to include abundance info sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}' sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp)) - sD["num_bp_assigned"] = str(0) - if self.lineage.n_lin_positions != 0: #empty lineage - # the number of bp actually 'assigned' at this rank. Sourmash assigns everything - # at genome level - not sure how we want to handle 'num_bp_assigned' here.. - if self.lineage.lowest_rank == lowest_rank: - sD["num_bp_assigned"] = sD["num_bp_contained"] - sD["LINgroup_prefix"] = self.lineage.display_lineage() sD["LINgroup_name"] = lg_name return sD @@ -2138,7 +2131,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref Keep LCA paths in order as much as possible. """ self.check_summarization() - header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"] + header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained"] if self.query_info.total_weighted_hashes == 0: raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0") @@ -2154,10 +2147,6 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref lg_rank = int(lg_info.lowest_rank) lg_ranks.add(lg_rank) - # find lowest rank, for "assignment" column [do we even want this???] - ordered_lg_ranks = sorted(lg_ranks) - lowest_rank = str(ordered_lg_ranks[-1]) - # grab summarized results matching LINgroup prefixes lg_results = {} for rank in lg_ranks: @@ -2166,7 +2155,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref for res in rank_results: if res.lineage in all_lgs:# is this lineage in the list of LINgroups? this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)] - lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank) + lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name) lg_results[res.lineage] = lg_resD # We want to return in ~ depth order: descending each specific path in order diff --git a/tests/test_tax.py b/tests/test_tax.py index 199ceffec4..9f4b1ebe13 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3415,7 +3415,7 @@ def test_metagenome_LIN_LINgroups(runtmp): out.write('1;0;0,lg2\n') out.write('2;0;0,lg3\n') out.write('1;0;1,lg3\n') - # write a 19 so we can check 'num_bp_assigned' + # write a 19 so we can check the end out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n') c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax, @@ -3428,12 +3428,12 @@ def test_metagenome_LIN_LINgroups(runtmp): assert c.last_result.status == 0 assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." in c.last_result.err - assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned" in c.last_result.out - assert "lg1 0;0;0 5.82 714000 0" in c.last_result.out - assert "lg2 1;0;0 5.05 620000 0" in c.last_result.out - assert "lg3 2;0;0 1.56 192000 0" in c.last_result.out - assert "lg3 1;0;1 0.65 80000 0" in c.last_result.out - assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out + assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained" in c.last_result.out + assert "lg1 0;0;0 5.82 714000" in c.last_result.out + assert "lg2 1;0;0 5.05 620000" in c.last_result.out + assert "lg3 2;0;0 1.56 192000" in c.last_result.out + assert "lg3 1;0;1 0.65 80000" in c.last_result.out + assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" in c.last_result.out def test_metagenome_LIN_human_summary_no_lin_position(runtmp): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index ff26fefd41..8d24f44382 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -168,14 +168,14 @@ def test_SummarizedGatherResult_LINs(): sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"), f_weighted_at_rank=0.3, bp_match_at_rank=30) - lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="4") + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) - assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "0", + assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'percent_containment': '30.00', 'num_bp_contained': "600"} - lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3") + lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name") print(lgD) assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", - 'num_bp_assigned': "0",'percent_containment': '30.00', 'num_bp_contained': "600"} + 'percent_containment': '30.00', 'num_bp_contained': "600"} with pytest.raises(ValueError) as exc: sgr.as_kreport_dict(query_info=qInf) print(str(exc)) @@ -2791,13 +2791,13 @@ def test_make_lingroup_results(): header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD) print(header) - assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained', 'num_bp_assigned'] + assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained'] # order may change, just check that each lg entry is present in list of results - lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0', + lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', 'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'} - lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '40', + lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', 'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'} - lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '20', + lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', 'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'} assert lg1 in lgD assert lg2 in lgD