Skip to content

Commit

Permalink
Remove num_bp_assigned from the LINgroup report because it doesn't mean anything
Browse files Browse the repository at this point in the history
  • Loading branch information
bluegenes committed Mar 3, 2023
1 parent 55ca620 commit e35db91
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 29 deletions.
17 changes: 3 additions & 14 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1729,21 +1729,14 @@ def as_kreport_dict(self, query_info):
sD["num_bp_assigned"] = sD["num_bp_contained"]
return sD

def as_lingroup_dict(self, query_info, lg_name, lowest_rank):
def as_lingroup_dict(self, query_info, lg_name):
"""
Produce LINgroup report dict for LINgroups.
"""
sD = {}
# total percent containment, weighted to include abundance info
sD['percent_containment'] = f'{self.f_weighted_at_rank * 100:.2f}'
sD["num_bp_contained"] = str(int(self.f_weighted_at_rank * query_info.total_weighted_bp))
sD["num_bp_assigned"] = str(0)
if self.lineage.n_lin_positions != 0: #empty lineage
# the number of bp actually 'assigned' at this rank. Sourmash assigns everything
# at genome level - not sure how we want to handle 'num_bp_assigned' here..
if self.lineage.lowest_rank == lowest_rank:
sD["num_bp_assigned"] = sD["num_bp_contained"]

sD["LINgroup_prefix"] = self.lineage.display_lineage()
sD["LINgroup_name"] = lg_name
return sD
Expand Down Expand Up @@ -2138,7 +2131,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
Keep LCA paths in order as much as possible.
"""
self.check_summarization()
header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained", "num_bp_assigned"]
header = ["LINgroup_name", "LINgroup_prefix", "percent_containment", "num_bp_contained"]

if self.query_info.total_weighted_hashes == 0:
raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0")
Expand All @@ -2154,10 +2147,6 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
lg_rank = int(lg_info.lowest_rank)
lg_ranks.add(lg_rank)

# find lowest rank, for "assignment" column [do we even want this???]
ordered_lg_ranks = sorted(lg_ranks)
lowest_rank = str(ordered_lg_ranks[-1])

# grab summarized results matching LINgroup prefixes
lg_results = {}
for rank in lg_ranks:
Expand All @@ -2166,7 +2155,7 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
for res in rank_results:
if res.lineage in all_lgs:# is this lineage in the list of LINgroups?
this_lingroup_name = LINgroupsD[res.lineage.display_lineage(truncate_empty=True)]
lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name, lowest_rank)
lg_resD = res.as_lingroup_dict(self.query_info, this_lingroup_name)
lg_results[res.lineage] = lg_resD

# We want to return in ~ depth order: descending each specific path in order
Expand Down
14 changes: 7 additions & 7 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -3415,7 +3415,7 @@ def test_metagenome_LIN_LINgroups(runtmp):
out.write('1;0;0,lg2\n')
out.write('2;0;0,lg3\n')
out.write('1;0;1,lg3\n')
# write a 19 so we can check 'num_bp_assigned'
# write a 19 so we can check the end
out.write('1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')

c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax,
Expand All @@ -3428,12 +3428,12 @@ def test_metagenome_LIN_LINgroups(runtmp):
assert c.last_result.status == 0
assert "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0" in c.last_result.err
assert "Read 5 LINgroup rows and found 5 distinct LINgroup prefixes." in c.last_result.err
assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained num_bp_assigned" in c.last_result.out
assert "lg1 0;0;0 5.82 714000 0" in c.last_result.out
assert "lg2 1;0;0 5.05 620000 0" in c.last_result.out
assert "lg3 2;0;0 1.56 192000 0" in c.last_result.out
assert "lg3 1;0;1 0.65 80000 0" in c.last_result.out
assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000 80000" in c.last_result.out
assert "LINgroup_name LINgroup_prefix percent_containment num_bp_contained" in c.last_result.out
assert "lg1 0;0;0 5.82 714000" in c.last_result.out
assert "lg2 1;0;0 5.05 620000" in c.last_result.out
assert "lg3 2;0;0 1.56 192000" in c.last_result.out
assert "lg3 1;0;1 0.65 80000" in c.last_result.out
assert "lg4 1;0;1;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 0.65 80000" in c.last_result.out


def test_metagenome_LIN_human_summary_no_lin_position(runtmp):
Expand Down
16 changes: 8 additions & 8 deletions tests/test_tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ def test_SummarizedGatherResult_LINs():
sgr = SummarizedGatherResult(rank="phylum", fraction=0.2, lineage=LINLineageInfo(lineage_str="0;0;1"),
f_weighted_at_rank=0.3, bp_match_at_rank=30)

lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="4")
lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
print(lgD)
assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1", 'num_bp_assigned': "0",
assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1",
'percent_containment': '30.00', 'num_bp_contained': "600"}
lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name", lowest_rank="3")
lgD = sgr.as_lingroup_dict(query_info=qInf, lg_name="lg_name")
print(lgD)
assert lgD == {'LINgroup_name': "lg_name", "LINgroup_prefix": "0;0;1",
'num_bp_assigned': "0",'percent_containment': '30.00', 'num_bp_contained': "600"}
'percent_containment': '30.00', 'num_bp_contained': "600"}
with pytest.raises(ValueError) as exc:
sgr.as_kreport_dict(query_info=qInf)
print(str(exc))
Expand Down Expand Up @@ -2791,13 +2791,13 @@ def test_make_lingroup_results():

header, lgD = q_res.make_lingroup_results(LINgroupsD = lingroupD)
print(header)
assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained', 'num_bp_assigned']
assert header == ['LINgroup_name', 'LINgroup_prefix', 'percent_containment', 'num_bp_contained']
# order may change, just check that each lg entry is present in list of results
lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60', 'num_bp_assigned': '0',
lg1 = {'percent_containment': '60.00', 'num_bp_contained': '60',
'LINgroup_prefix': '1', 'LINgroup_name': 'lg1'}
lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40', 'num_bp_assigned': '40',
lg2 = {'percent_containment': '40.00', 'num_bp_contained': '40',
'LINgroup_prefix': '1;0', 'LINgroup_name': 'lg2'}
lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20', 'num_bp_assigned': '20',
lg3 = {'percent_containment': '20.00', 'num_bp_contained': '20',
'LINgroup_prefix': '1;1', 'LINgroup_name': 'lg3'}
assert lg1 in lgD
assert lg2 in lgD
Expand Down

0 comments on commit e35db91

Please sign in to comment.