Remove num_bp_assigned from the LINgroup report because it doesn't mean anything

sourmash-bio · Mar 3, 2023 · e35db91 · e35db91
1 parent 55ca620
commit e35db91
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 29 deletions.
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -1729,21 +1729,14 @@ def as_kreport_dict(self, query_info):
             sD["num_bp_assigned"] = sD["num_bp_contained"]
         return sD
 
-    def as_lingroup_dict(self, query_info, lg_name, lowest_rank):
+    def as_lingroup_dict(self, query_info, lg_name):
         """
         Produce LINgroup report dict for LINgroups.
         """
         sD = {}
         # total percent containment, weighted to include abundance info
         sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
         sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp))
-        sD["num_bp_assigned"] = str(0)
-        if self.lineage.n_lin_positions != 0: #empty lineage
-            # the number of bp actually 'assigned' at this rank. Sourmash assigns everything
-            # at genome level - not sure how we want to handle 'num_bp_assigned' here..
-            if self.lineage.lowest_rank == lowest_rank:
-                sD["num_bp_assigned"] = sD["num_bp_contained"]
-
         sD["LINgroup_prefix"] = self.lineage.display_lineage()
         sD["LINgroup_name"] = lg_name
         return sD
@@ -2138,7 +2131,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
         Keep LCA paths in order as much as possible.
         """
         self.check_summarization()
-        header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"]
+        header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained"]
 
         if self.query_info.total_weighted_hashes == 0:
             raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0")
@@ -2154,10 +2147,6 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
             lg_rank = int(lg_info.lowest_rank)
             lg_ranks.add(lg_rank)
 
-       # find lowest rank, for "assignment" column [do we even want this???]
-        ordered_lg_ranks = sorted(lg_ranks)
-        lowest_rank = str(ordered_lg_ranks[-1])
-
         # grab summarized results matching LINgroup prefixes
         lg_results = {}
         for rank in lg_ranks:
@@ -2166,7 +2155,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
             for res in rank_results:
                 if res.lineage in all_lgs:# is this lineage in the list of LINgroups?
                     this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)]
-                    lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank)
+                    lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name)
                     lg_results[res.lineage] = lg_resD
 
         # We want to return in ~ depth order: descending each specific path in order

diff --git a/tests/test_tax.py b/tests/test_tax.py
@@ -3415,7 +3415,7 @@ def test_metagenome_LIN_LINgroups(runtmp):
         out.write('1;0;0,lg2\n')
         out.write('2;0;0,lg3\n')
         out.write('1;0;1,lg3\n')
-        # write a 19 so we can check 'num_bp_assigned'
+        # write a 19 so we can check the end
         out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')
 
     c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
@@ -3428,12 +3428,12 @@ def test_metagenome_LIN_LINgroups(runtmp):
     assert c.last_result.status == 0
     assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
     assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." in c.last_result.err
-    assert "LINgroup_name	LINgroup_prefix	percent_containment	num_bp_contained	num_bp_assigned" in c.last_result.out
-    assert "lg1	0;0;0	5.82	714000	0" in c.last_result.out
-    assert "lg2	1;0;0	5.05	620000	0" in c.last_result.out
-    assert "lg3	2;0;0	1.56	192000	0" in c.last_result.out
-    assert "lg3	1;0;1	0.65	80000	0" in c.last_result.out
-    assert "lg4	1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0	0.65	80000	80000" in c.last_result.out
+    assert "LINgroup_name	LINgroup_prefix	percent_containment	num_bp_contained" in c.last_result.out
+    assert "lg1	0;0;0	5.82	714000" in c.last_result.out
+    assert "lg2	1;0;0	5.05	620000" in c.last_result.out
+    assert "lg3	2;0;0	1.56	192000" in c.last_result.out
+    assert "lg3	1;0;1	0.65	80000" in c.last_result.out
+    assert "lg4	1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0	0.65	80000" in c.last_result.out
 
 
 def test_metagenome_LIN_human_summary_no_lin_position(runtmp):

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
@@ -168,14 +168,14 @@ def test_SummarizedGatherResult_LINs():
     sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"),
                                  f_weighted_at_rank=0.3, bp_match_at_rank=30)
 
-    lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="4")
+    lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
     print(lgD)
-    assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "0",
+    assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1",
                    'percent_containment': '30.00', 'num_bp_contained': "600"}
-    lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3")
+    lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
     print(lgD)
     assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1",
-                   'num_bp_assigned': "0",'percent_containment': '30.00', 'num_bp_contained': "600"}
+                   'percent_containment': '30.00', 'num_bp_contained': "600"}
     with pytest.raises(ValueError) as exc:
         sgr.as_kreport_dict(query_info=qInf)
     print(str(exc))
@@ -2791,13 +2791,13 @@ def test_make_lingroup_results():
 
     header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD)
     print(header)
-    assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained', 'num_bp_assigned']
+    assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained']
     # order may change, just check that each lg entry is present in list of results
-    lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0',
+    lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60',
                     'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'}
-    lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '40',
+    lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40',
                     'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}
-    lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '20',
+    lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20',
                     'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}
     assert lg1 in lgD
     assert lg2 in lgD