From 6164f880c2fd460244ac446e81634372bc486e35 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 09:46:25 -0700 Subject: [PATCH 01/15] initial addition of tax summarize --- src/sourmash/cli/tax/__init__.py | 1 + src/sourmash/cli/tax/summarize.py | 53 +++++++++++++++++++++++++++++++ src/sourmash/tax/__main__.py | 42 +++++++++++++++++++++++- 3 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 src/sourmash/cli/tax/summarize.py diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 3cdd006215..b8bf95f8d8 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -9,6 +9,7 @@ from . import annotate from . import prepare from . import grep +from . import summarize from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py new file mode 100644 index 0000000000..6e3a0a3e8c --- /dev/null +++ b/src/sourmash/cli/tax/summarize.py @@ -0,0 +1,53 @@ +"""summarize taxonomy/lineage information""" + +usage=""" + + sourmash tax summarize [ ... ] + +The 'tax summarize' command reads in one or more taxonomy databases +or lineage files (produced by 'tax annotate'), combines them, +and produces a human readable summary. + +Please see the 'tax summarize' documentation for more details: + https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-summarize +@CTB check URL :) +""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('summarize', + usage=usage) + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + 'taxonomy_files', + metavar='FILE', + nargs="+", action="extend", + help='database lineages' + ) + subparser.add_argument( + '-o', '--output', + help='output file', + # @CTB implement + ) + subparser.add_argument( + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='after splitting identifiers, do not remove accession versions' + ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in file and taxonomy loading', + ) + +def main(args): + import sourmash + return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index f9a274d5c6..b8c3d8e8d5 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -9,7 +9,7 @@ import sourmash from ..sourmash_args import FileOutputCSV, FileOutput -from sourmash.logging import set_quiet, error, notify +from sourmash.logging import set_quiet, error, notify, print_results from sourmash.lca.lca_utils import display_lineage from . import tax_utils @@ -438,6 +438,46 @@ def search_pattern(l, r): notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'") +def summarize(args): + "Summarize multiple taxonomy databases." + notify("loading taxonomies...") + try: + tax_assign = MultiLineageDB.load(args.taxonomy_files, + force=args.force, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions) + except ValueError as exc: + error("ERROR while loading taxonomies!") + error(str(exc)) + sys.exit(-1) + + notify(f"...loaded {len(tax_assign)} entries.") + + print_results(f"num idents: {len(tax_assign)}") + + rank_counts = defaultdict(int) + name_seen = set() + for v in tax_assign.values(): + sofar = [] + for rank, name in v: + if name not in name_seen: + rank_counts[rank] += 1 + name_seen.add(name) + + if 0: + # check duplicates? + sofar.append(name) + tup = tuple(sofar) + seen.add(tuple(sofar)) + #break + + rank_count_items = list(rank_counts.items()) + rank_count_items.sort(key=lambda x: x[1]) + for rank, count in rank_count_items: + rank_name_str = f"{rank}:" + print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers") + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) From 06159c1456bf2def8043efd720e4a35bfa42092e Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 09:46:35 -0700 Subject: [PATCH 02/15] report oddball CSV headers --- src/sourmash/tax/tax_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 5dc85ceb63..0bf9038345 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -732,7 +732,8 @@ def load(cls, filename, *, delimiter=',', force=False, identifier = 'accession' header = ["ident" if "accession" == x else x for x in header] else: - raise ValueError('No taxonomic identifiers found.') + header_str = ",".join([repr(x) for x in header]) + raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') # is "strain" an available rank? if "strain" in header: include_strain=True From 2dad65ecfad735522c02127eabef4c82411d1979 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 10:17:27 -0700 Subject: [PATCH 03/15] add provisional support for loading gather-with-lineages files as native taxonomies --- src/sourmash/tax/tax_utils.py | 64 ++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0bf9038345..a832678987 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -731,6 +731,9 @@ def load(cls, filename, *, delimiter=',', force=False, elif 'accession' in header: identifier = 'accession' header = ["ident" if "accession" == x else x for x in header] + elif 'name' in header and 'lineage' in header: + return cls.load_from_gather_with_lineages(filename, + force=force) else: header_str = ",".join([repr(x) for x in header]) raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') @@ -738,7 +741,7 @@ def load(cls, filename, *, delimiter=',', force=False, if "strain" in header: include_strain=True - # check that all ranks are in header + # check that all ranks are in header ranks = list(lca_utils.taxlist(include_strain=include_strain)) if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. @@ -763,6 +766,7 @@ def load(cls, filename, *, delimiter=',', force=False, ident = row[identifier] # fold, spindle, and mutilate ident? + # @CTB: replace with get_ident? if not keep_full_identifiers: ident = ident.split(' ')[0] @@ -796,6 +800,64 @@ def load(cls, filename, *, delimiter=',', force=False, return LineageDB(assignments, ranks) + @classmethod + def load_from_gather_with_lineages(cls, filename, *, force=False): + """ + Load an annotated gather-with-lineages CSV file produced by + 'tax annotate' into a LineageDB. + """ + include_strain = False + + if not os.path.exists(filename): + raise ValueError(f"'{filename}' does not exist") + + if os.path.isdir(filename): + raise ValueError(f"'{filename}' is a directory") + + with sourmash_args.FileInputCSV(filename) as r: + header = r.fieldnames + if not header: + raise ValueError(f'cannot read taxonomy assignments from {filename}') + + if "name" not in header or "lineage" not in header: + raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?") + + ranks = list(lca_utils.taxlist(include_strain=include_strain)) + assignments = {} + num_rows = 0 + n_species = 0 + n_strains = 0 + + # now parse and load lineages + for n, row in enumerate(r): + if row: + num_rows += 1 + + name = row['name'] + ident = get_ident(name) + lineage = row['lineage'] + lineage = lca_utils.make_lineage(lineage) + + print(ident, row['lineage']) + + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + # @CTB what to do here... + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 + + return LineageDB(assignments, ranks) + + class LineageDB_Sqlite(abc.Mapping): """ A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table. From 07e11b7ddb5698227e61489be4a694a5a3341c83 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 10:18:26 -0700 Subject: [PATCH 04/15] remove print --- src/sourmash/tax/tax_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index a832678987..0605d3ce9d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -838,8 +838,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): lineage = row['lineage'] lineage = lca_utils.make_lineage(lineage) - print(ident, row['lineage']) - # check duplicates if ident in assignments: if assignments[ident] != tuple(lineage): From 15ce0ebbb77982680414137d7f69a7c1beae638a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 16:56:25 -0700 Subject: [PATCH 05/15] add per-rank lineage info --- src/sourmash/cli/tax/summarize.py | 4 ++-- src/sourmash/tax/__main__.py | 28 ++++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 6e3a0a3e8c..771164172c 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -31,8 +31,8 @@ def subparser(subparsers): help='database lineages' ) subparser.add_argument( - '-o', '--output', - help='output file', + '-o', '--output-lineage-information', + help='output a CSV file containing individual lineage counts', # @CTB implement ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index b8c3d8e8d5..754f1a58e4 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -4,13 +4,13 @@ import sys import csv import os -from collections import defaultdict +from collections import defaultdict, Counter import re import sourmash from ..sourmash_args import FileOutputCSV, FileOutput from sourmash.logging import set_quiet, error, notify, print_results -from sourmash.lca.lca_utils import display_lineage +from sourmash.lca.lca_utils import display_lineage, zip_lineage from . import tax_utils from .tax_utils import ClassificationResult, MultiLineageDB @@ -478,6 +478,30 @@ def summarize(args): print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers") + if args.output_lineage_information: + notify("now calculating detailed lineage counts...") + lineage_counts = Counter() + for v in tax_assign.values(): + tup = v + while tup: + lineage_counts[tup] += 1 + tup = tup[:-1] + notify("...done!") + + with FileOutputCSV(args.output_lineage_information) as fp: + w = csv.writer(fp) + w.writerow(['rank', 'count', 'lineage']) + + # output in order of most common + for lineage, count in lineage_counts.most_common(): + rank = lineage[-1].rank + lin = ";".join(zip_lineage(lineage, truncate_empty=True)) + w.writerow([rank, str(count), lin]) + + n = len(lineage_counts) + notify(f"saved {n} lineage counts to '{args.output_lineage_information}'") + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) From b5ac05cddfacdcb51aeba0f21bf5d93ffab859a6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 15 Oct 2022 21:16:42 -0700 Subject: [PATCH 06/15] add some basic tests for summarize --- tests/test_tax.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index b9ac15c171..83fb4e299e 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -5,6 +5,7 @@ import csv import pytest import gzip +from collections import Counter import sourmash import sourmash_tst_utils as utils @@ -13,6 +14,7 @@ from sourmash import sqlite_utils from sourmash.exceptions import IndexNotSupported +from sourmash import sourmash_args ## command line tests def test_run_sourmash_tax(): @@ -2902,3 +2904,84 @@ def test_tax_grep_duplicate_csv(runtmp): assert 'GCF_000017325.1' in names assert 'GCF_000021665.1' in names assert 'GCF_001881345.1' in names + + +def test_tax_summarize(runtmp): + # test basic operation + taxfile = utils.get_test_data('tax/test.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "num idents: 6" in out + assert "rank superkingdom: 1 distinct identifiers" in out + assert "rank phylum: 2 distinct identifiers" in out + assert "rank class: 2 distinct identifiers" in out + assert "rank order: 2 distinct identifiers" in out + assert "rank family: 3 distinct identifiers" in out + assert "rank genus: 4 distinct identifiers" in out + assert "rank species: 4 distinct identifiers" in out + + +def test_tax_summarize_csv(runtmp): + # test basic operation w/csv output + taxfile = utils.get_test_data('tax/test.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "num idents: 6" in out + assert "saved 18 lineage counts to 'ranks.csv'" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + val = row['count'] + c[val] += 1 + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['1'] == 5 + + +def test_summarize_on_annotate(runtmp): + # test summarize on output of annotate basics + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + runtmp.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + # so far so good - now see if we can run summarize! + + runtmp.run_sourmash('tax', 'summarize', csvout) + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + assert "num idents: 4" in out + assert "rank superkingdom: 1 distinct identifiers" in out + assert "rank phylum: 2 distinct identifiers" in out + assert "rank class: 2 distinct identifiers" in out + assert "rank order: 2 distinct identifiers" in out + assert "rank family: 2 distinct identifiers" in out + assert "rank genus: 3 distinct identifiers" in out + assert "rank species: 3 distinct identifiers" in out + From 8501a184c42fa9eb1405bde134b595a2e30d42c7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Nov 2022 19:31:06 -0800 Subject: [PATCH 07/15] add documentation section for tax summarize --- doc/command-line.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/command-line.md b/doc/command-line.md index b8c6c7d378..42657819a5 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -92,6 +92,7 @@ information; these are grouped under the `sourmash tax` and * `tax genome` - summarize single-genome gather results and report most likely classification. * `tax annotate` - annotate gather results with lineage information (no summarization or classification). * `tax grep` - subset taxonomies and create picklists based on taxonomy string matches. +* `tax summarize` - print summary information (counts of lineages) for a taxonomy lineages file or database. `sourmash lca` commands: @@ -912,6 +913,12 @@ _not_ match the pattern. Currently only CSV output (optionally gzipped) is supported; use `sourmash tax prepare` to convert CSV output from `tax grep` into a sqlite3 taxonomy database. +### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases + +(`sourmash tax summarize` is a new command as of sourmash v4.5.0.) @CTB + +@CTB + ## `sourmash lca` subcommands for in-memory taxonomy integration These commands use LCA databases (created with `lca index`, below, or From aa579ee598375f77bb83ec3ced0d3bfd0902dbb5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 12 Nov 2022 20:02:10 -0800 Subject: [PATCH 08/15] cover more of the code w/tests --- tests/test_tax_utils.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e2f4838476..60302657e5 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -3,6 +3,7 @@ """ import pytest +import os from os.path import basename import gzip @@ -129,6 +130,43 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): assert ids_missing == {"gA"} +def test_check_and_load_gather_lineage_csvs_empty(runtmp): + g_res = runtmp.output('empty.gather-tax.csv') + with open(g_res, 'w') as fp: + fp.write("") + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "cannot read taxonomy assignments" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): + g_res = runtmp.output('empty.gather-tax.csv') + with open(g_res, 'w', newline="") as fp: + fp.write("x,y,z") + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_dne(runtmp): + g_res = runtmp.output('empty.gather-tax.csv') + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "does not exist" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_isdir(runtmp): + g_res = runtmp.output('empty.gather-tax.csv') + os.mkdir(g_res) + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "is a directory" in str(exc.value) + + def test_check_and_load_gather_csvs_fail_on_missing(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') # make gather results with taxonomy name not in tax_assign From ae7b441c6f421945d21745a472e1bb5acab02e67 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 07:44:34 -0800 Subject: [PATCH 09/15] add test; change column name --- src/sourmash/cli/tax/summarize.py | 4 ++-- src/sourmash/tax/__main__.py | 3 ++- tests/test_tax.py | 14 +++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 771164172c..10bb3ee35d 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -9,8 +9,8 @@ and produces a human readable summary. Please see the 'tax summarize' documentation for more details: - https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-summarize -@CTB check URL :) + https://sourmash.readthedocs.io/en/latest/command-line.html#command-line.html#sourmash-tax-summarize-print-summary-information-for-lineage-spreadsheets-or-taxonomy-databases + """ import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 754f1a58e4..6fdb8d0900 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -465,6 +465,7 @@ def summarize(args): name_seen.add(name) if 0: + # @CTB # check duplicates? sofar.append(name) tup = tuple(sofar) @@ -490,7 +491,7 @@ def summarize(args): with FileOutputCSV(args.output_lineage_information) as fp: w = csv.writer(fp) - w.writerow(['rank', 'count', 'lineage']) + w.writerow(['rank', 'lineage_count', 'lineage']) # output in order of most common for lineage, count in lineage_counts.most_common(): diff --git a/tests/test_tax.py b/tests/test_tax.py index d2a30d60e4..1f1123a22f 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2959,6 +2959,18 @@ def test_tax_summarize(runtmp): assert "rank species: 4 distinct identifiers" in out +def test_tax_summarize_empty(runtmp): + # test failure on empty file + taxfile = runtmp.output('no-exist') + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + assert "ERROR while loading taxonomies" in err + + def test_tax_summarize_csv(runtmp): # test basic operation w/csv output taxfile = utils.get_test_data('tax/test.taxonomy.csv') @@ -2977,7 +2989,7 @@ def test_tax_summarize_csv(runtmp): # count number across ranks as a cheap consistency check c = Counter() for row in r: - val = row['count'] + val = row['lineage_count'] c[val] += 1 assert c['3'] == 7 From 4ce8ff7db892095aa8e514a7843a7b3d6b1502d3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 07:46:07 -0800 Subject: [PATCH 10/15] refactor to use get_ident --- src/sourmash/tax/tax_utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b7ba259ba6..b7fe1caf7d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -765,12 +765,9 @@ def load(cls, filename, *, delimiter=',', force=False, ident = row[identifier] # fold, spindle, and mutilate ident? - # @CTB: replace with get_ident? - if not keep_full_identifiers: - ident = ident.split(' ')[0] - - if not keep_identifier_versions: - ident = ident.split('.')[0] + ident = get_ident(ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) # clean lineage of null names, replace with 'unassigned' lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] From d1f5477399661a7b35e0a5236c9f3818caac8d3f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 07:59:29 -0800 Subject: [PATCH 11/15] test duplicate/different tax assignments in lineages file --- src/sourmash/tax/tax_utils.py | 47 +++++++++++++++++++---------------- tests/test_tax.py | 33 ++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b7fe1caf7d..699e80177f 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -826,28 +826,33 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): # now parse and load lineages for n, row in enumerate(r): - if row: - num_rows += 1 + # skip empty rows + if not row: + continue - name = row['name'] - ident = get_ident(name) - lineage = row['lineage'] - lineage = lca_utils.make_lineage(lineage) - - # check duplicates - if ident in assignments: - if assignments[ident] != tuple(lineage): - # @CTB what to do here... - if not force: - raise ValueError(f"multiple lineages for identifier {ident}") - else: - assignments[ident] = tuple(lineage) - - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 + num_rows += 1 + + name = row['name'] + ident = get_ident(name) + lineage = row['lineage'] + lineage = lca_utils.make_lineage(lineage) + + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + # this should not happen with valid + # sourmash tax annotate output, but check anyway. + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + assert 0 # @CTB + n_species += 1 + n_strains += 1 return LineageDB(assignments, ranks) diff --git a/tests/test_tax.py b/tests/test_tax.py index 1f1123a22f..e6de945227 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -10,6 +10,7 @@ import sourmash import sourmash_tst_utils as utils from sourmash.tax import tax_utils +from sourmash.lca import lca_utils from sourmash_tst_utils import SourmashCommandFailed from sourmash import sqlite_utils @@ -1381,6 +1382,38 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp): assert "multiple lineages for identifier GCF_001881345" in str(exc.value) +def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): + c = runtmp + + # write temp taxonomy with duplicates => lineages-style file + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxdb = tax_utils.LineageDB.load(taxonomy_csv) + + for k, v in taxdb.items(): + print(k, v) + + lineage_csv = runtmp.output('lin.csv') + with open(lineage_csv, 'w', newline="") as fp: + w = csv.writer(fp) + w.writerow(['name', 'lineage']) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + # duplicate each row, changing something (truncate species, here) + v = v[:-1] + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'summarize', lineage_csv) + print(c.last_result.out) + print(c.last_result.err) + + assert "cannot read taxonomy assignments" in str(exc.value) + assert "multiple lineages for identifier GCF_001881345" in str(exc.value) + + def test_genome_rank_duplicated_taxonomy_force(runtmp): c = runtmp # write temp taxonomy with duplicates From 4351649255a6ca9f4fdc0818ff35392ae6e83e5d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 09:21:31 -0800 Subject: [PATCH 12/15] test strains and lineage stuff --- src/sourmash/tax/tax_utils.py | 1 - tests/test_tax.py | 68 +++++++++++++++++++++++++++++++++++ tests/test_tax_utils.py | 3 ++ 3 files changed, 71 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 699e80177f..4c281c2f70 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -850,7 +850,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): if lineage[-1].rank == 'species': n_species += 1 elif lineage[-1].rank == 'strain': - assert 0 # @CTB n_species += 1 n_strains += 1 diff --git a/tests/test_tax.py b/tests/test_tax.py index e6de945227..78774c06c0 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -3064,3 +3064,71 @@ def test_summarize_on_annotate(runtmp): assert "rank genus: 3 distinct identifiers" in out assert "rank species: 3 distinct identifiers" in out + +def test_tax_summarize_strain_csv(runtmp): + # test basic operation w/csv output on taxonomy with strains + taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "num idents: 6" in out + assert "saved 24 lineage counts to 'ranks.csv'" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row['lineage_count'] + c[val] += 1 + + print(list(c.most_common())) + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['6'] == 1 + assert c['1'] == 11 + + +def test_tax_summarize_strain_csv_with_lineages(runtmp): + # test basic operation w/csv output + taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + lineage_csv = runtmp.output('lin-with-strains.csv') + + taxdb = tax_utils.LineageDB.load(taxfile) + with open(lineage_csv, 'w', newline="") as fp: + w = csv.writer(fp) + w.writerow(['name', 'lineage']) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "num idents: 6" in out + assert "saved 24 lineage counts to" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row['lineage_count'] + c[val] += 1 + + print(list(c.most_common())) + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['6'] == 1 + assert c['1'] == 11 diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 60302657e5..2354b8da2c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -141,6 +141,7 @@ def test_check_and_load_gather_lineage_csvs_empty(runtmp): def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): + # test on file with wrong headers g_res = runtmp.output('empty.gather-tax.csv') with open(g_res, 'w', newline="") as fp: fp.write("x,y,z") @@ -151,6 +152,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): def test_check_and_load_gather_lineage_csvs_dne(runtmp): + # test loading with-lineage file that does not exist g_res = runtmp.output('empty.gather-tax.csv') with pytest.raises(ValueError) as exc: @@ -159,6 +161,7 @@ def test_check_and_load_gather_lineage_csvs_dne(runtmp): def test_check_and_load_gather_lineage_csvs_isdir(runtmp): + # test loading a with-lineage file that is actually a directory g_res = runtmp.output('empty.gather-tax.csv') os.mkdir(g_res) From d3ab0447863618e0859da1f13ba5cec1cad516be Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 11:53:27 -0800 Subject: [PATCH 13/15] test on empty line --- src/sourmash/cli/tax/summarize.py | 1 - src/sourmash/tax/__main__.py | 1 - src/sourmash/tax/tax_utils.py | 77 +++++++++---------- .../tax/test-empty-line.taxonomy.csv | 8 ++ tests/test_tax.py | 29 +++++-- tests/test_tax_utils.py | 1 + 6 files changed, 69 insertions(+), 48 deletions(-) create mode 100644 tests/test-data/tax/test-empty-line.taxonomy.csv diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 10bb3ee35d..7fca17e837 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -33,7 +33,6 @@ def subparser(subparsers): subparser.add_argument( '-o', '--output-lineage-information', help='output a CSV file containing individual lineage counts', - # @CTB implement ) subparser.add_argument( '--keep-full-identifiers', action='store_true', diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 6fdb8d0900..d003962117 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -478,7 +478,6 @@ def summarize(args): rank_name_str = f"{rank}:" print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers") - if args.output_lineage_information: notify("now calculating detailed lineage counts...") lineage_counts = Counter() diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 4c281c2f70..d504d4ab9c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -755,43 +755,42 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): - if row: - num_rows += 1 - lineage = [] - # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): - lin = row[rank] - lineage.append(LineagePair(rank, lin)) - ident = row[identifier] - - # fold, spindle, and mutilate ident? - ident = get_ident(ident, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) - - # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - - # store lineage tuple - if lineage: - # check duplicates - if ident in assignments: - if assignments[ident] != tuple(lineage): - if not force: - raise ValueError(f"multiple lineages for identifier {ident}") - else: - assignments[ident] = tuple(lineage) - - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 + num_rows += 1 + lineage = [] + # read row into a lineage pair + for rank in lca_utils.taxlist(include_strain=include_strain): + lin = row[rank] + lineage.append(LineagePair(rank, lin)) + ident = row[identifier] + + # fold, spindle, and mutilate ident? + ident = get_ident(ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) + + # clean lineage of null names, replace with 'unassigned' + lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] + + # remove end nulls + while lineage and lineage[-1].name == 'unassigned': + lineage = lineage[:-1] + + # store lineage tuple + if lineage: + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 return LineageDB(assignments, ranks) @@ -826,10 +825,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False): # now parse and load lineages for n, row in enumerate(r): - # skip empty rows - if not row: - continue - num_rows += 1 name = row['name'] diff --git a/tests/test-data/tax/test-empty-line.taxonomy.csv b/tests/test-data/tax/test-empty-line.taxonomy.csv new file mode 100644 index 0000000000..a197efb7fa --- /dev/null +++ b/tests/test-data/tax/test-empty-line.taxonomy.csv @@ -0,0 +1,8 @@ +ident,superkingdom,phylum,class,order,family,genus,species + +GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli +GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus +GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica +GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica diff --git a/tests/test_tax.py b/tests/test_tax.py index 78774c06c0..be5dcf14e0 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1383,9 +1383,9 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp): def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): + # write temp taxonomy with duplicates => lineages-style file c = runtmp - # write temp taxonomy with duplicates => lineages-style file taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') taxdb = tax_utils.LineageDB.load(taxonomy_csv) @@ -1415,8 +1415,8 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): def test_genome_rank_duplicated_taxonomy_force(runtmp): - c = runtmp # write temp taxonomy with duplicates + c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: @@ -2974,7 +2974,7 @@ def test_tax_grep_duplicate_csv(runtmp): def test_tax_summarize(runtmp): - # test basic operation + # test basic operation with summarize taxfile = utils.get_test_data('tax/test.taxonomy.csv') runtmp.sourmash('tax', 'summarize', taxfile) @@ -2992,6 +2992,25 @@ def test_tax_summarize(runtmp): assert "rank species: 4 distinct identifiers" in out +def test_tax_summarize_empty_line(runtmp): + # test basic operation with summarize on a file w/empty line + taxfile = utils.get_test_data('tax/test-empty-line.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "num idents: 6" in out + assert "rank superkingdom: 1 distinct identifiers" in out + assert "rank phylum: 2 distinct identifiers" in out + assert "rank class: 2 distinct identifiers" in out + assert "rank order: 2 distinct identifiers" in out + assert "rank family: 3 distinct identifiers" in out + assert "rank genus: 4 distinct identifiers" in out + assert "rank species: 4 distinct identifiers" in out + + def test_tax_summarize_empty(runtmp): # test failure on empty file taxfile = runtmp.output('no-exist') @@ -3030,7 +3049,7 @@ def test_tax_summarize_csv(runtmp): assert c['1'] == 5 -def test_summarize_on_annotate(runtmp): +def test_tax_summarize_on_annotate(runtmp): # test summarize on output of annotate basics g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -3096,7 +3115,7 @@ def test_tax_summarize_strain_csv(runtmp): def test_tax_summarize_strain_csv_with_lineages(runtmp): - # test basic operation w/csv output + # test basic operation w/csv output on lineages-style file w/strain csv taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') lineage_csv = runtmp.output('lin-with-strains.csv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 2354b8da2c..7652970f60 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -131,6 +131,7 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): def test_check_and_load_gather_lineage_csvs_empty(runtmp): + # try loading an empty annotated gather file g_res = runtmp.output('empty.gather-tax.csv') with open(g_res, 'w') as fp: fp.write("") From 5e7bdefaeaf61bb4e838667125200be8312e21f4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 12:28:33 -0800 Subject: [PATCH 14/15] add docs --- doc/command-line.md | 50 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 42657819a5..939f62f7df 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -516,7 +516,6 @@ As with all reference-based analysis, results can be affected by the For more details on how `gather` works and can be used to classify signatures, see [classifying-signatures](classifying-signatures.md). - ### `sourmash tax metagenome` - summarize metagenome content from `gather` results `sourmash tax metagenome` summarizes gather results for each query metagenome by @@ -837,6 +836,10 @@ sourmash tax annotate --taxonomy gtdb-rs202.taxonomy.v2.csv ``` +The `with-lineages` output file format can be summarized with +`sourmash tax summarize` and can also be used as an input taxonomy +spreadsheet for any of the tax subcommands (new as of v4.6.0). + ### `sourmash tax prepare` - prepare and/or combine taxonomy files `sourmash tax prepare` prepares taxonomy files for other `sourmash tax` @@ -866,6 +869,9 @@ can be set to CSV like so: sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv ``` +**Note:** As of sourmash v4.6.0, the output of `sourmash tax annotate` can + be used as a taxonomy input spreadsheet as well. + ### `sourmash tax grep` - subset taxonomies and create picklists based on taxonomy string matches (`sourmash tax grep` is a new command as of sourmash v4.5.0.) @@ -896,9 +902,8 @@ sourmash search query.sig gtdb-rs207.genomic.k31.zip \ --picklist shew-picklist.csv:ident:ident ``` - `tax grep` can also restrict string matching to a specific taxonomic rank -with `-r/--rank`; for examplem +with `-r/--rank`; for example, ``` sourmash tax grep Shew -t gtdb-rs207.taxonomy.sqldb \ -o shew-picklist.csv -r genus @@ -915,9 +920,44 @@ convert CSV output from `tax grep` into a sqlite3 taxonomy database. ### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases -(`sourmash tax summarize` is a new command as of sourmash v4.5.0.) @CTB +(`sourmash tax summarize` is a new command as of sourmash v4.6.0.) + +`sourmash tax summarize` loads in one or more lineage spreadsheets, +counts the distinct taxonomic lineages, and outputs a summary. It +optionally will output a CSV file with a detailed count of how many +identifiers belong to each taxonomic lineage. + +For example, +``` +sourmash tax summarize gtdb-rs202.taxonomy.v2.db -o ranks.csv +``` +outputs +``` +number of distinct taxonomic lineages: 258406 +rank superkingdom: 2 distinct taxonomic lineages +rank phylum: 169 distinct taxonomic lineages +rank class: 419 distinct taxonomic lineages +rank order: 1312 distinct taxonomic lineages +rank family: 3264 distinct taxonomic lineages +rank genus: 12888 distinct taxonomic lineages +rank species: 47894 distinct taxonomic lineages +``` + +and creates a file `ranks.csv` with the number of distinct identifier +counts for each lineage at each rank: +``` +rank,lineage_count,lineage +superkingdom,254090,d__Bacteria +phylum,120757,d__Bacteria;p__Proteobacteria +class,104665,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria +order,64157,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales +family,55347,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae +... +``` +That is, there are 254,090 identifiers in GTDB rs202 under `d__Bacteria`, +and 120,757 within the `p__Proteobacteria`. -@CTB +`tax summarize` can also be used to summarize the output of `tax annotate`. ## `sourmash lca` subcommands for in-memory taxonomy integration From 43e094d59550e2095e83b0c7f2db777c7c8015f1 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 13 Nov 2022 12:28:44 -0800 Subject: [PATCH 15/15] cleanup; add test for multiple --- src/sourmash/tax/__main__.py | 13 ++----- tests/test_tax.py | 74 +++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 37 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index d003962117..6544e5d610 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -453,8 +453,9 @@ def summarize(args): notify(f"...loaded {len(tax_assign)} entries.") - print_results(f"num idents: {len(tax_assign)}") + print_results(f"number of distinct taxonomic lineages: {len(tax_assign)}") + # count the number of distinct lineage names seen rank_counts = defaultdict(int) name_seen = set() for v in tax_assign.values(): @@ -464,19 +465,11 @@ def summarize(args): rank_counts[rank] += 1 name_seen.add(name) - if 0: - # @CTB - # check duplicates? - sofar.append(name) - tup = tuple(sofar) - seen.add(tuple(sofar)) - #break - rank_count_items = list(rank_counts.items()) rank_count_items.sort(key=lambda x: x[1]) for rank, count in rank_count_items: rank_name_str = f"{rank}:" - print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers") + print_results(f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages") if args.output_lineage_information: notify("now calculating detailed lineage counts...") diff --git a/tests/test_tax.py b/tests/test_tax.py index be5dcf14e0..b1b3c1b8f8 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -2982,14 +2982,34 @@ def test_tax_summarize(runtmp): out = runtmp.last_result.out err = runtmp.last_result.err - assert "num idents: 6" in out - assert "rank superkingdom: 1 distinct identifiers" in out - assert "rank phylum: 2 distinct identifiers" in out - assert "rank class: 2 distinct identifiers" in out - assert "rank order: 2 distinct identifiers" in out - assert "rank family: 3 distinct identifiers" in out - assert "rank genus: 4 distinct identifiers" in out - assert "rank species: 4 distinct identifiers" in out + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 3 distinct taxonomic lineages" in out + assert "rank genus: 4 distinct taxonomic lineages" in out + assert "rank species: 4 distinct taxonomic lineages" in out + + +def test_tax_summarize_multiple(runtmp): + # test basic operation with summarize on multiple files + tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + + runtmp.sourmash('tax', 'summarize', tax1, tax2) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 2 distinct taxonomic lineages" in out + assert "rank phylum: 3 distinct taxonomic lineages" in out + assert "rank class: 4 distinct taxonomic lineages" in out + assert "rank order: 4 distinct taxonomic lineages" in out + assert "rank family: 5 distinct taxonomic lineages" in out + assert "rank genus: 5 distinct taxonomic lineages" in out + assert "rank species: 5 distinct taxonomic lineages" in out def test_tax_summarize_empty_line(runtmp): @@ -3001,14 +3021,14 @@ def test_tax_summarize_empty_line(runtmp): out = runtmp.last_result.out err = runtmp.last_result.err - assert "num idents: 6" in out - assert "rank superkingdom: 1 distinct identifiers" in out - assert "rank phylum: 2 distinct identifiers" in out - assert "rank class: 2 distinct identifiers" in out - assert "rank order: 2 distinct identifiers" in out - assert "rank family: 3 distinct identifiers" in out - assert "rank genus: 4 distinct identifiers" in out - assert "rank species: 4 distinct identifiers" in out + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 3 distinct taxonomic lineages" in out + assert "rank genus: 4 distinct taxonomic lineages" in out + assert "rank species: 4 distinct taxonomic lineages" in out def test_tax_summarize_empty(runtmp): @@ -3032,7 +3052,7 @@ def test_tax_summarize_csv(runtmp): out = runtmp.last_result.out err = runtmp.last_result.err - assert "num idents: 6" in out + assert "number of distinct taxonomic lineages: 6" in out assert "saved 18 lineage counts to 'ranks.csv'" in err csv_out = runtmp.output('ranks.csv') @@ -3074,14 +3094,14 @@ def test_tax_summarize_on_annotate(runtmp): print(out) print(err) - assert "num idents: 4" in out - assert "rank superkingdom: 1 distinct identifiers" in out - assert "rank phylum: 2 distinct identifiers" in out - assert "rank class: 2 distinct identifiers" in out - assert "rank order: 2 distinct identifiers" in out - assert "rank family: 2 distinct identifiers" in out - assert "rank genus: 3 distinct identifiers" in out - assert "rank species: 3 distinct identifiers" in out + assert "number of distinct taxonomic lineages: 4" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 2 distinct taxonomic lineages" in out + assert "rank genus: 3 distinct taxonomic lineages" in out + assert "rank species: 3 distinct taxonomic lineages" in out def test_tax_summarize_strain_csv(runtmp): @@ -3093,7 +3113,7 @@ def test_tax_summarize_strain_csv(runtmp): out = runtmp.last_result.out err = runtmp.last_result.err - assert "num idents: 6" in out + assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to 'ranks.csv'" in err csv_out = runtmp.output('ranks.csv') @@ -3132,7 +3152,7 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp): out = runtmp.last_result.out err = runtmp.last_result.err - assert "num idents: 6" in out + assert "number of distinct taxonomic lineages: 6" in out assert "saved 24 lineage counts to" in err csv_out = runtmp.output('ranks.csv')