Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: Fix tax metagenome to work on gather output created with --estimate-ani-ci #2952

Merged
merged 3 commits into from
Jan 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1540,6 +1540,10 @@ class GatherRow:
n_unique_weighted_found: int = None
sum_weighted_found: int = None
total_weighted_hashes: int = None
query_containment_ani_low: float = None
query_containment_ani_high: float = None
match_containment_ani_low: float = None
match_containment_ani_high: float = None


@dataclass
Expand Down
43 changes: 43 additions & 0 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -2201,6 +2201,49 @@ def test_genome_ani_lemonade_classify(runtmp):
assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output


def test_genome_ani_lemonade_classify_estimate_ani_ci(runtmp):
    # test a complete MAG classification with lemonade MAG from STAMPS 2022
    # (real data!), where the gather CSV is produced with ANI confidence
    # intervals requested; 'tax genome' must still be able to read it.
    c = runtmp

    ## first run gather
    genome = utils.get_test_data('tax/lemonade-MAG3.sig.gz')
    matches = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.zip')

    # Spell the option out in full: the shortened '--estimate-ani' is only
    # accepted through argparse prefix matching, which would stop working
    # as soon as another option sharing that prefix is added.
    c.run_sourmash('gather', genome, matches,
                   '--threshold-bp=5000', '-o', 'gather.csv',
                   '--estimate-ani-ci')

    # echo the gather run so a failure is diagnosable from captured output
    print(c.last_result.status)
    print(c.last_result.out)
    print(c.last_result.err)

    assert c.last_result.status == 0

    this_gather_file = c.output('gather.csv')
    this_gather = Path(this_gather_file).read_text().splitlines()

    # presumably one header line plus three gather matches
    assert len(this_gather) == 4

    ## now run 'tax genome' with human output
    taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
                   '--ani', '0.8', '-F', 'human')

    output = c.last_result.out
    assert 'MAG3_1 match 5.3% 91.0% d__Bacteria;p__Bacteroidota;c__Chlorobia;o__Chlorobiales;f__Chlorobiaceae;g__Prosthecochloris;s__Prosthecochloris vibrioformis' in output

    # aaand classify to lineage_csv
    c.run_sourmash('tax', 'genome', '-g', this_gather_file, '-t', taxonomy_file,
                   '--ani', '0.8', '-F', 'lineage_csv')

    print(c.last_result.status)
    print(c.last_result.out)
    print(c.last_result.err)
    output = c.last_result.out
    assert 'ident,superkingdom,phylum,class,order,family,genus,species' in output
    assert 'MAG3_1,d__Bacteria,p__Bacteroidota,c__Chlorobia,o__Chlorobiales,f__Chlorobiaceae,g__Prosthecochloris,s__Prosthecochloris vibrioformis' in output


def test_metagenome_no_gather_csv(runtmp):
# test tax metagenome with no -g
taxonomy_file = utils.get_test_data('tax/lemonade-MAG3.x.gtdb.matches.tax.csv')
Expand Down
Loading