diff --git a/doc/command-line.md b/doc/command-line.md index b8c6c7d378..939f62f7df 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -92,6 +92,7 @@ information; these are grouped under the `sourmash tax` and * `tax genome` - summarize single-genome gather results and report most likely classification. * `tax annotate` - annotate gather results with lineage information (no summarization or classification). * `tax grep` - subset taxonomies and create picklists based on taxonomy string matches. +* `tax summarize` - print summary information (counts of lineages) for a taxonomy lineages file or database. `sourmash lca` commands: @@ -515,7 +516,6 @@ As with all reference-based analysis, results can be affected by the For more details on how `gather` works and can be used to classify signatures, see [classifying-signatures](classifying-signatures.md). - ### `sourmash tax metagenome` - summarize metagenome content from `gather` results `sourmash tax metagenome` summarizes gather results for each query metagenome by @@ -836,6 +836,10 @@ sourmash tax annotate --taxonomy gtdb-rs202.taxonomy.v2.csv ``` +The `with-lineages` output file format can be summarized with +`sourmash tax summarize` and can also be used as an input taxonomy +spreadsheet for any of the tax subcommands (new as of v4.6.0). + ### `sourmash tax prepare` - prepare and/or combine taxonomy files `sourmash tax prepare` prepares taxonomy files for other `sourmash tax` @@ -865,6 +869,9 @@ can be set to CSV like so: sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv ``` +**Note:** As of sourmash v4.6.0, the output of `sourmash tax annotate` can + be used as a taxonomy input spreadsheet as well. + ### `sourmash tax grep` - subset taxonomies and create picklists based on taxonomy string matches (`sourmash tax grep` is a new command as of sourmash v4.5.0.) @@ -895,9 +902,8 @@ sourmash search query.sig gtdb-rs207.genomic.k31.zip \ --picklist shew-picklist.csv:ident:ident ``` - `tax grep` can also restrict string matching to a specific taxonomic rank -with `-r/--rank`; for examplem +with `-r/--rank`; for example, ``` sourmash tax grep Shew -t gtdb-rs207.taxonomy.sqldb \ -o shew-picklist.csv -r genus @@ -912,6 +918,47 @@ _not_ match the pattern. Currently only CSV output (optionally gzipped) is supported; use `sourmash tax prepare` to convert CSV output from `tax grep` into a sqlite3 taxonomy database. +### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases + +(`sourmash tax summarize` is a new command as of sourmash v4.6.0.) + +`sourmash tax summarize` loads in one or more lineage spreadsheets, +counts the distinct taxonomic lineages, and outputs a summary. It +optionally will output a CSV file with a detailed count of how many +identifiers belong to each taxonomic lineage. + +For example, +``` +sourmash tax summarize gtdb-rs202.taxonomy.v2.db -o ranks.csv +``` +outputs +``` +number of distinct taxonomic lineages: 258406 +rank superkingdom: 2 distinct taxonomic lineages +rank phylum: 169 distinct taxonomic lineages +rank class: 419 distinct taxonomic lineages +rank order: 1312 distinct taxonomic lineages +rank family: 3264 distinct taxonomic lineages +rank genus: 12888 distinct taxonomic lineages +rank species: 47894 distinct taxonomic lineages +``` + +and creates a file `ranks.csv` with the number of distinct identifier +counts for each lineage at each rank: +``` +rank,lineage_count,lineage +superkingdom,254090,d__Bacteria +phylum,120757,d__Bacteria;p__Proteobacteria +class,104665,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria +order,64157,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales +family,55347,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae +... +``` +That is, there are 254,090 identifiers in GTDB rs202 under `d__Bacteria`, +and 120,757 within the `p__Proteobacteria`. + +`tax summarize` can also be used to summarize the output of `tax annotate`. + ## `sourmash lca` subcommands for in-memory taxonomy integration These commands use LCA databases (created with `lca index`, below, or diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 3cdd006215..b8bf95f8d8 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -9,6 +9,7 @@ from . import annotate from . import prepare from . import grep +from . import summarize from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py new file mode 100644 index 0000000000..7fca17e837 --- /dev/null +++ b/src/sourmash/cli/tax/summarize.py @@ -0,0 +1,52 @@ +"""summarize taxonomy/lineage information""" + +usage=""" + + sourmash tax summarize [ ... ] + +The 'tax summarize' command reads in one or more taxonomy databases +or lineage files (produced by 'tax annotate'), combines them, +and produces a human readable summary. + +Please see the 'tax summarize' documentation for more details: + https://sourmash.readthedocs.io/en/latest/command-line.html#command-line.html#sourmash-tax-summarize-print-summary-information-for-lineage-spreadsheets-or-taxonomy-databases + +""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('summarize', + usage=usage) + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + 'taxonomy_files', + metavar='FILE', + nargs="+", action="extend", + help='database lineages' + ) + subparser.add_argument( + '-o', '--output-lineage-information', + help='output a CSV file containing individual lineage counts', + ) + subparser.add_argument( + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='after splitting identifiers, do not remove accession versions' + ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in file and taxonomy loading', + ) + +def main(args): + import sourmash + return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index f9a274d5c6..6544e5d610 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -4,13 +4,13 @@ import sys import csv import os -from collections import defaultdict +from collections import defaultdict, Counter import re import sourmash from ..sourmash_args import FileOutputCSV, FileOutput -from sourmash.logging import set_quiet, error, notify -from sourmash.lca.lca_utils import display_lineage +from sourmash.logging import set_quiet, error, notify, print_results +from sourmash.lca.lca_utils import display_lineage, zip_lineage from . import tax_utils from .tax_utils import ClassificationResult, MultiLineageDB @@ -438,6 +438,63 @@ def search_pattern(l, r): notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'") +def summarize(args): + "Summarize multiple taxonomy databases." + notify("loading taxonomies...") + try: + tax_assign = MultiLineageDB.load(args.taxonomy_files, + force=args.force, + keep_full_identifiers=args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions) + except ValueError as exc: + error("ERROR while loading taxonomies!") + error(str(exc)) + sys.exit(-1) + + notify(f"...loaded {len(tax_assign)} entries.") + + print_results(f"number of distinct taxonomic lineages: {len(tax_assign)}") + + # count the number of distinct lineage names seen + rank_counts = defaultdict(int) + name_seen = set() + for v in tax_assign.values(): + sofar = [] + for rank, name in v: + if name not in name_seen: + rank_counts[rank] += 1 + name_seen.add(name) + + rank_count_items = list(rank_counts.items()) + rank_count_items.sort(key=lambda x: x[1]) + for rank, count in rank_count_items: + rank_name_str = f"{rank}:" + print_results(f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages") + + if args.output_lineage_information: + notify("now calculating detailed lineage counts...") + lineage_counts = Counter() + for v in tax_assign.values(): + tup = v + while tup: + lineage_counts[tup] += 1 + tup = tup[:-1] + notify("...done!") + + with FileOutputCSV(args.output_lineage_information) as fp: + w = csv.writer(fp) + w.writerow(['rank', 'lineage_count', 'lineage']) + + # output in order of most common + for lineage, count in lineage_counts.most_common(): + rank = lineage[-1].rank + lin = ";".join(zip_lineage(lineage, truncate_empty=True)) + w.writerow([rank, str(count), lin]) + + n = len(lineage_counts) + notify(f"saved {n} lineage counts to '{args.output_lineage_information}'") + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6828b61ecb..d504d4ab9c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -730,13 +730,17 @@ def load(cls, filename, *, delimiter=',', force=False, elif 'accession' in header: identifier = 'accession' header = ["ident" if "accession" == x else x for x in header] + elif 'name' in header and 'lineage' in header: + return cls.load_from_gather_with_lineages(filename, + force=force) else: - raise ValueError('No taxonomic identifiers found.') + header_str = ",".join([repr(x) for x in header]) + raise ValueError(f'No taxonomic identifiers found; headers are {header_str}') # is "strain" an available rank? if "strain" in header: include_strain=True - # check that all ranks are in header + # check that all ranks are in header ranks = list(lca_utils.taxlist(include_strain=include_strain)) if not set(ranks).issubset(header): # for now, just raise err if not all ranks are present. @@ -751,45 +755,98 @@ def load(cls, filename, *, delimiter=',', force=False, # now parse and load lineages for n, row in enumerate(r): - if row: - num_rows += 1 - lineage = [] - # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): - lin = row[rank] - lineage.append(LineagePair(rank, lin)) - ident = row[identifier] - - # fold, spindle, and mutilate ident? - if not keep_full_identifiers: - ident = ident.split(' ')[0] - - if not keep_identifier_versions: - ident = ident.split('.')[0] - - # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - - # store lineage tuple - if lineage: - # check duplicates - if ident in assignments: - if assignments[ident] != tuple(lineage): - if not force: - raise ValueError(f"multiple lineages for identifier {ident}") - else: - assignments[ident] = tuple(lineage) - - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 + num_rows += 1 + lineage = [] + # read row into a lineage pair + for rank in lca_utils.taxlist(include_strain=include_strain): + lin = row[rank] + lineage.append(LineagePair(rank, lin)) + ident = row[identifier] + + # fold, spindle, and mutilate ident? + ident = get_ident(ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) + + # clean lineage of null names, replace with 'unassigned' + lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] + + # remove end nulls + while lineage and lineage[-1].name == 'unassigned': + lineage = lineage[:-1] + + # store lineage tuple + if lineage: + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 + + return LineageDB(assignments, ranks) + + + @classmethod + def load_from_gather_with_lineages(cls, filename, *, force=False): + """ + Load an annotated gather-with-lineages CSV file produced by + 'tax annotate' into a LineageDB. + """ + include_strain = False + + if not os.path.exists(filename): + raise ValueError(f"'{filename}' does not exist") + + if os.path.isdir(filename): + raise ValueError(f"'{filename}' is a directory") + + with sourmash_args.FileInputCSV(filename) as r: + header = r.fieldnames + if not header: + raise ValueError(f'cannot read taxonomy assignments from {filename}') + + if "name" not in header or "lineage" not in header: + raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?") + + ranks = list(lca_utils.taxlist(include_strain=include_strain)) + assignments = {} + num_rows = 0 + n_species = 0 + n_strains = 0 + + # now parse and load lineages + for n, row in enumerate(r): + num_rows += 1 + + name = row['name'] + ident = get_ident(name) + lineage = row['lineage'] + lineage = lca_utils.make_lineage(lineage) + + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + # this should not happen with valid + # sourmash tax annotate output, but check anyway. + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 return LineageDB(assignments, ranks) diff --git a/tests/test-data/tax/test-empty-line.taxonomy.csv b/tests/test-data/tax/test-empty-line.taxonomy.csv new file mode 100644 index 0000000000..a197efb7fa --- /dev/null +++ b/tests/test-data/tax/test-empty-line.taxonomy.csv @@ -0,0 +1,8 @@ +ident,superkingdom,phylum,class,order,family,genus,species + +GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli +GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus +GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica +GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica diff --git a/tests/test_tax.py b/tests/test_tax.py index 6ff6ffdf4b..b1b3c1b8f8 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -5,14 +5,17 @@ import csv import pytest import gzip +from collections import Counter import sourmash import sourmash_tst_utils as utils from sourmash.tax import tax_utils +from sourmash.lca import lca_utils from sourmash_tst_utils import SourmashCommandFailed from sourmash import sqlite_utils from sourmash.exceptions import IndexNotSupported +from sourmash import sourmash_args ## command line tests def test_run_sourmash_tax(): @@ -1379,9 +1382,41 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp): assert "multiple lineages for identifier GCF_001881345" in str(exc.value) -def test_genome_rank_duplicated_taxonomy_force(runtmp): +def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp): + # write temp taxonomy with duplicates => lineages-style file c = runtmp + + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxdb = tax_utils.LineageDB.load(taxonomy_csv) + + for k, v in taxdb.items(): + print(k, v) + + lineage_csv = runtmp.output('lin.csv') + with open(lineage_csv, 'w', newline="") as fp: + w = csv.writer(fp) + w.writerow(['name', 'lineage']) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + # duplicate each row, changing something (truncate species, here) + v = v[:-1] + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'summarize', lineage_csv) + print(c.last_result.out) + print(c.last_result.err) + + assert "cannot read taxonomy assignments" in str(exc.value) + assert "multiple lineages for identifier GCF_001881345" in str(exc.value) + + +def test_genome_rank_duplicated_taxonomy_force(runtmp): # write temp taxonomy with duplicates + c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: @@ -2936,3 +2971,203 @@ def test_tax_grep_duplicate_csv(runtmp): assert 'GCF_000017325.1' in names assert 'GCF_000021665.1' in names assert 'GCF_001881345.1' in names + + +def test_tax_summarize(runtmp): + # test basic operation with summarize + taxfile = utils.get_test_data('tax/test.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 3 distinct taxonomic lineages" in out + assert "rank genus: 4 distinct taxonomic lineages" in out + assert "rank species: 4 distinct taxonomic lineages" in out + + +def test_tax_summarize_multiple(runtmp): + # test basic operation with summarize on multiple files + tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + + runtmp.sourmash('tax', 'summarize', tax1, tax2) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 2 distinct taxonomic lineages" in out + assert "rank phylum: 3 distinct taxonomic lineages" in out + assert "rank class: 4 distinct taxonomic lineages" in out + assert "rank order: 4 distinct taxonomic lineages" in out + assert "rank family: 5 distinct taxonomic lineages" in out + assert "rank genus: 5 distinct taxonomic lineages" in out + assert "rank species: 5 distinct taxonomic lineages" in out + + +def test_tax_summarize_empty_line(runtmp): + # test basic operation with summarize on a file w/empty line + taxfile = utils.get_test_data('tax/test-empty-line.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 3 distinct taxonomic lineages" in out + assert "rank genus: 4 distinct taxonomic lineages" in out + assert "rank species: 4 distinct taxonomic lineages" in out + + +def test_tax_summarize_empty(runtmp): + # test failure on empty file + taxfile = runtmp.output('no-exist') + + with pytest.raises(SourmashCommandFailed): + runtmp.sourmash('tax', 'summarize', taxfile) + + out = runtmp.last_result.out + err = runtmp.last_result.err + assert "ERROR while loading taxonomies" in err + + +def test_tax_summarize_csv(runtmp): + # test basic operation w/csv output + taxfile = utils.get_test_data('tax/test.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "saved 18 lineage counts to 'ranks.csv'" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + val = row['lineage_count'] + c[val] += 1 + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['1'] == 5 + + +def test_tax_summarize_on_annotate(runtmp): + # test summarize on output of annotate basics + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + runtmp.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + # so far so good - now see if we can run summarize! + + runtmp.run_sourmash('tax', 'summarize', csvout) + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + assert "number of distinct taxonomic lineages: 4" in out + assert "rank superkingdom: 1 distinct taxonomic lineages" in out + assert "rank phylum: 2 distinct taxonomic lineages" in out + assert "rank class: 2 distinct taxonomic lineages" in out + assert "rank order: 2 distinct taxonomic lineages" in out + assert "rank family: 2 distinct taxonomic lineages" in out + assert "rank genus: 3 distinct taxonomic lineages" in out + assert "rank species: 3 distinct taxonomic lineages" in out + + +def test_tax_summarize_strain_csv(runtmp): + # test basic operation w/csv output on taxonomy with strains + taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + + runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "saved 24 lineage counts to 'ranks.csv'" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row['lineage_count'] + c[val] += 1 + + print(list(c.most_common())) + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['6'] == 1 + assert c['1'] == 11 + + +def test_tax_summarize_strain_csv_with_lineages(runtmp): + # test basic operation w/csv output on lineages-style file w/strain csv + taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv') + lineage_csv = runtmp.output('lin-with-strains.csv') + + taxdb = tax_utils.LineageDB.load(taxfile) + with open(lineage_csv, 'w', newline="") as fp: + w = csv.writer(fp) + w.writerow(['name', 'lineage']) + for k, v in taxdb.items(): + linstr = lca_utils.display_lineage(v) + w.writerow([k, linstr]) + + runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv') + + out = runtmp.last_result.out + err = runtmp.last_result.err + + assert "number of distinct taxonomic lineages: 6" in out + assert "saved 24 lineage counts to" in err + + csv_out = runtmp.output('ranks.csv') + + with sourmash_args.FileInputCSV(csv_out) as r: + # count number across ranks as a cheap consistency check + c = Counter() + for row in r: + print(row) + val = row['lineage_count'] + c[val] += 1 + + print(list(c.most_common())) + + assert c['3'] == 7 + assert c['2'] == 5 + assert c['6'] == 1 + assert c['1'] == 11 diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e2f4838476..7652970f60 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -3,6 +3,7 @@ """ import pytest +import os from os.path import basename import gzip @@ -129,6 +130,47 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): assert ids_missing == {"gA"} +def test_check_and_load_gather_lineage_csvs_empty(runtmp): + # try loading an empty annotated gather file + g_res = runtmp.output('empty.gather-tax.csv') + with open(g_res, 'w') as fp: + fp.write("") + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "cannot read taxonomy assignments" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_bad_header(runtmp): + # test on file with wrong headers + g_res = runtmp.output('empty.gather-tax.csv') + with open(g_res, 'w', newline="") as fp: + fp.write("x,y,z") + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_dne(runtmp): + # test loading with-lineage file that does not exist + g_res = runtmp.output('empty.gather-tax.csv') + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "does not exist" in str(exc.value) + + +def test_check_and_load_gather_lineage_csvs_isdir(runtmp): + # test loading a with-lineage file that is actually a directory + g_res = runtmp.output('empty.gather-tax.csv') + os.mkdir(g_res) + + with pytest.raises(ValueError) as exc: + tax_assign = LineageDB.load_from_gather_with_lineages(g_res) + assert "is a directory" in str(exc.value) + + def test_check_and_load_gather_csvs_fail_on_missing(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') # make gather results with taxonomy name not in tax_assign