Skip to content

Commit

Permalink
cleanup; add test for multiple
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Nov 13, 2022
1 parent 5e7bdef commit 43e094d
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 37 deletions.
13 changes: 3 additions & 10 deletions src/sourmash/tax/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,8 +453,9 @@ def summarize(args):

notify(f"...loaded {len(tax_assign)} entries.")

print_results(f"num idents: {len(tax_assign)}")
print_results(f"number of distinct taxonomic lineages: {len(tax_assign)}")

# count the number of distinct lineage names seen
rank_counts = defaultdict(int)
name_seen = set()
for v in tax_assign.values():
Expand All @@ -464,19 +465,11 @@ def summarize(args):
rank_counts[rank] += 1
name_seen.add(name)

if 0:
# @CTB
# check duplicates?
sofar.append(name)
tup = tuple(sofar)
seen.add(tuple(sofar))
#break

rank_count_items = list(rank_counts.items())
rank_count_items.sort(key=lambda x: x[1])
for rank, count in rank_count_items:
rank_name_str = f"{rank}:"
print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers")
print_results(f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages")

if args.output_lineage_information:
notify("now calculating detailed lineage counts...")
Expand Down
74 changes: 47 additions & 27 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -2982,14 +2982,34 @@ def test_tax_summarize(runtmp):
out = runtmp.last_result.out
err = runtmp.last_result.err

assert "num idents: 6" in out
assert "rank superkingdom: 1 distinct identifiers" in out
assert "rank phylum: 2 distinct identifiers" in out
assert "rank class: 2 distinct identifiers" in out
assert "rank order: 2 distinct identifiers" in out
assert "rank family: 3 distinct identifiers" in out
assert "rank genus: 4 distinct identifiers" in out
assert "rank species: 4 distinct identifiers" in out
assert "number of distinct taxonomic lineages: 6" in out
assert "rank superkingdom: 1 distinct taxonomic lineages" in out
assert "rank phylum: 2 distinct taxonomic lineages" in out
assert "rank class: 2 distinct taxonomic lineages" in out
assert "rank order: 2 distinct taxonomic lineages" in out
assert "rank family: 3 distinct taxonomic lineages" in out
assert "rank genus: 4 distinct taxonomic lineages" in out
assert "rank species: 4 distinct taxonomic lineages" in out


def test_tax_summarize_multiple(runtmp):
    """Check `tax summarize` output when given two lineage CSV files.

    Runs the `tax summarize` subcommand on the bacteria RefSeq and protozoa
    GenBank test lineage files together, then checks the overall lineage
    count and the per-rank counts reported on stdout.
    """
    # test basic operation with summarize on multiple files
    tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv')
    tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv')

    runtmp.sourmash('tax', 'summarize', tax1, tax2)

    out = runtmp.last_result.out

    assert "number of distinct taxonomic lineages: 6" in out

    # expected number of distinct lineages reported at each rank
    expected_rank_counts = {
        "superkingdom": 2,
        "phylum": 3,
        "class": 4,
        "order": 4,
        "family": 5,
        "genus": 5,
        "species": 5,
    }

    for rank, count in expected_rank_counts.items():
        # summarize prints the "<rank>:" label left-justified in a
        # 20-character column, so build the expected line with the same
        # format spec rather than hard-coding the amount of padding.
        rank_name_str = f"{rank}:"
        expected = f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages"
        assert expected in out, expected


def test_tax_summarize_empty_line(runtmp):
Expand All @@ -3001,14 +3021,14 @@ def test_tax_summarize_empty_line(runtmp):
out = runtmp.last_result.out
err = runtmp.last_result.err

assert "num idents: 6" in out
assert "rank superkingdom: 1 distinct identifiers" in out
assert "rank phylum: 2 distinct identifiers" in out
assert "rank class: 2 distinct identifiers" in out
assert "rank order: 2 distinct identifiers" in out
assert "rank family: 3 distinct identifiers" in out
assert "rank genus: 4 distinct identifiers" in out
assert "rank species: 4 distinct identifiers" in out
assert "number of distinct taxonomic lineages: 6" in out
assert "rank superkingdom: 1 distinct taxonomic lineages" in out
assert "rank phylum: 2 distinct taxonomic lineages" in out
assert "rank class: 2 distinct taxonomic lineages" in out
assert "rank order: 2 distinct taxonomic lineages" in out
assert "rank family: 3 distinct taxonomic lineages" in out
assert "rank genus: 4 distinct taxonomic lineages" in out
assert "rank species: 4 distinct taxonomic lineages" in out


def test_tax_summarize_empty(runtmp):
Expand All @@ -3032,7 +3052,7 @@ def test_tax_summarize_csv(runtmp):
out = runtmp.last_result.out
err = runtmp.last_result.err

assert "num idents: 6" in out
assert "number of distinct taxonomic lineages: 6" in out
assert "saved 18 lineage counts to 'ranks.csv'" in err

csv_out = runtmp.output('ranks.csv')
Expand Down Expand Up @@ -3074,14 +3094,14 @@ def test_tax_summarize_on_annotate(runtmp):
print(out)
print(err)

assert "num idents: 4" in out
assert "rank superkingdom: 1 distinct identifiers" in out
assert "rank phylum: 2 distinct identifiers" in out
assert "rank class: 2 distinct identifiers" in out
assert "rank order: 2 distinct identifiers" in out
assert "rank family: 2 distinct identifiers" in out
assert "rank genus: 3 distinct identifiers" in out
assert "rank species: 3 distinct identifiers" in out
assert "number of distinct taxonomic lineages: 4" in out
assert "rank superkingdom: 1 distinct taxonomic lineages" in out
assert "rank phylum: 2 distinct taxonomic lineages" in out
assert "rank class: 2 distinct taxonomic lineages" in out
assert "rank order: 2 distinct taxonomic lineages" in out
assert "rank family: 2 distinct taxonomic lineages" in out
assert "rank genus: 3 distinct taxonomic lineages" in out
assert "rank species: 3 distinct taxonomic lineages" in out


def test_tax_summarize_strain_csv(runtmp):
Expand All @@ -3093,7 +3113,7 @@ def test_tax_summarize_strain_csv(runtmp):
out = runtmp.last_result.out
err = runtmp.last_result.err

assert "num idents: 6" in out
assert "number of distinct taxonomic lineages: 6" in out
assert "saved 24 lineage counts to 'ranks.csv'" in err

csv_out = runtmp.output('ranks.csv')
Expand Down Expand Up @@ -3132,7 +3152,7 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp):
out = runtmp.last_result.out
err = runtmp.last_result.err

assert "num idents: 6" in out
assert "number of distinct taxonomic lineages: 6" in out
assert "saved 24 lineage counts to" in err

csv_out = runtmp.output('ranks.csv')
Expand Down

0 comments on commit 43e094d

Please sign in to comment.