From 55a26159b89a267acd6b2bb3e32f301e2aa27100 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 6 Feb 2021 11:11:31 -0800 Subject: [PATCH] Remove lca gather (#1307) * remove lca gather * remove lca gather from docs and comments --- doc/classifying-signatures.md | 7 +- doc/command-line.md | 38 +--- src/sourmash/cli/lca/__init__.py | 1 - src/sourmash/cli/lca/gather.py | 32 ---- src/sourmash/lca/__init__.py | 1 - src/sourmash/lca/__main__.py | 3 +- src/sourmash/lca/command_gather.py | 280 ----------------------------- tests/test_lca.py | 196 +------------------- 8 files changed, 6 insertions(+), 552 deletions(-) delete mode 100644 src/sourmash/cli/lca/gather.py delete mode 100644 src/sourmash/lca/command_gather.py diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md index de0301439e..ade98e31d2 100644 --- a/doc/classifying-signatures.md +++ b/doc/classifying-signatures.md @@ -81,9 +81,6 @@ differences between the `sourmash lca` subcommands and the basic output structured taxonomic information, and these are what you should look to if you are interested in doing classification. -The command `lca gather` applies the `gather` algorithm to search an -LCA database; it reports taxonomy. - It's important to note that taxonomy based on k-mers is very, very specific and if you get a match, it's pretty reliable. On the converse, however, k-mer identification is very brittle with respect @@ -120,8 +117,8 @@ containment queries against genome databases. This will give you numbers that (approximately) match what you get from counting mapped reads. -If you compute your input signatures with `--track-abundance`, both -`sourmash gather` and `sourmash lca gather` will use that information +If you compute your input signatures with `--track-abundance`, +`sourmash gather` will use that information to calculate an abundance-weighted result. This will weight each match to a hash value by the multiplicity of the hash value in the query signature. You can turn off this behavior with diff --git a/doc/command-line.md b/doc/command-line.md index 1e0a8dcc70..4e1ce714a6 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -77,7 +77,6 @@ walkthrough of these commands. * `lca classify` classifies many signatures against an LCA database. * `lca summarize` summarizes the content of metagenomes using an LCA database. -* `lca gather` finds non-overlapping matches to a metagenome in an LCA database. * `lca index` creates a database for use with LCA subcommands. * `lca rankinfo` summarizes the content of a database. * `lca compare_csv` compares lineage spreadsheets, e.g. those output by `lca classify`. @@ -261,8 +260,8 @@ Note: Use `sourmash gather` to classify a metagenome against a collection of genomes with no (or incomplete) taxonomic information. Use `sourmash -lca summarize` and `sourmash lca gather` to classify a metagenome -using a collection of genomes with taxonomic information. +lca summarize` to classify a metagenome using a collection of genomes +with taxonomic information. ## `sourmash lca` subcommands for taxonomic classification @@ -431,39 +430,6 @@ text file passed to `sourmash lca summarize` with the `--query-from-file` flag; these files will be appended to the `--query` input. -### `sourmash lca gather` - find metagenome taxonomy (DEPRECATED for 4.0) - -The `sourmash lca gather` command finds all non-overlapping -matches to the query, similar to the `sourmash gather` command. This -is specifically meant for metagenome and genome bin analysis. (See -[Classifying Signatures](classifying-signatures.md) for more -information on the different approaches that can be used here.) - -If the input signature was computed with `--track-abundance`, output -will be abundance weighted (unless `--ignore-abundances` is -specified). `-o/--output` will create a CSV file containing the -matches. - -Usage: - -``` -sourmash lca gather query.sig [ ...] -``` - -Example output: - -``` -overlap p_query p_match ---------- ------- -------- -1.8 Mbp 14.6% 9.1% Fusobacterium nucleatum -1.0 Mbp 7.8% 16.3% Proteiniclasticum ruminis -1.0 Mbp 7.7% 25.9% Haloferax volcanii -0.9 Mbp 7.4% 11.8% Nostoc sp. PCC 7120 -0.9 Mbp 7.0% 5.8% Shewanella baltica -0.8 Mbp 6.0% 8.6% Desulfovibrio vulgaris -0.6 Mbp 4.9% 12.6% Thermus thermophilus -``` - ### `sourmash lca index` - build an LCA database The `sourmash lca index` command creates an LCA database from diff --git a/src/sourmash/cli/lca/__init__.py b/src/sourmash/cli/lca/__init__.py index a02e467bb7..a403876d02 100644 --- a/src/sourmash/cli/lca/__init__.py +++ b/src/sourmash/cli/lca/__init__.py @@ -6,7 +6,6 @@ from . import classify from . import compare_csv -from . import gather from . import index from . import rankinfo from . import summarize diff --git a/src/sourmash/cli/lca/gather.py b/src/sourmash/cli/lca/gather.py deleted file mode 100644 index 0f9c27057a..0000000000 --- a/src/sourmash/cli/lca/gather.py +++ /dev/null @@ -1,32 +0,0 @@ -"""classify metagenomes""" - - -def subparser(subparsers): - subparser = subparsers.add_parser('gather') - subparser.add_argument('query') - subparser.add_argument('db', nargs='+') - subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output') - subparser.add_argument( - '-d', '--debug', action='store_true', - help='output debugging output' - ) - subparser.add_argument( - '-o', '--output', metavar='FILE', - help='output CSV containing matches to this file' - ) - subparser.add_argument( - '--output-unassigned', metavar='FILE', - help='output unassigned portions of the query as a signature to this ' - 'file' - ) - subparser.add_argument( - '--ignore-abundance', action='store_true', - help='do NOT use k-mer abundances if present' - ) - - -def main(args): - import sourmash - return sourmash.lca.gather_main(args) diff --git a/src/sourmash/lca/__init__.py b/src/sourmash/lca/__init__.py index e595ef8b9b..c2f3324b0a 100644 --- a/src/sourmash/lca/__init__.py +++ b/src/sourmash/lca/__init__.py @@ -9,6 +9,5 @@ from .command_classify import classify from .command_summarize import summarize_main from .command_rankinfo import rankinfo_main -from .command_gather import gather_main from .__main__ import main diff --git a/src/sourmash/lca/__main__.py b/src/sourmash/lca/__main__.py index 40277bd38e..b02b891771 100644 --- a/src/sourmash/lca/__main__.py +++ b/src/sourmash/lca/__main__.py @@ -5,7 +5,7 @@ import sys import argparse -from . import classify, index, summarize_main, rankinfo_main, gather_main +from . import classify, index, summarize_main, rankinfo_main from .command_compare_csv import compare_csv from ..logging import set_quiet, error @@ -16,7 +16,6 @@ index - create LCA database classify --db --query - classify genomes -gather - classify metagenomes summarize --db --query - summarize mixture rankinfo - database rank info compare_csv - compare spreadsheets diff --git a/src/sourmash/lca/command_gather.py b/src/sourmash/lca/command_gather.py deleted file mode 100644 index 57454138e6..0000000000 --- a/src/sourmash/lca/command_gather.py +++ /dev/null @@ -1,280 +0,0 @@ -#! /usr/bin/env python -""" -Execute a greedy search on lineages attached to hashvals in the query. - -Mimics `sourmash gather` but provides taxonomic information. -""" -import sys -import csv -from collections import Counter, defaultdict, namedtuple - -from .. import sourmash_args, save_signatures, SourmashSignature -from ..logging import notify, print_results, set_quiet, debug -from . import lca_utils -from .lca_utils import check_files_exist -from ..search import format_bp - - -LCAGatherResult = namedtuple('LCAGatherResult', - 'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage, f_match, name, n_equal_matches') - - -def format_lineage(lineage_tup): - """ - Pretty print lineage. - """ - # list of ranks present - present = [ l.rank for l in lineage_tup if l.name ] - d = dict(lineage_tup) # rank: value - - if 'genus' in present: - genus = d['genus'] - if 'strain' in present: - name = d['strain'] - elif 'species' in present: - species = d['species'] - if species.startswith(genus + ' ') or \ - species.startswith(genus + '_'): - name = species - else: - name = '{} {}'.format(genus, species) - else: - name = '{} sp.'.format(genus) - elif len(present) < 3: - lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True) - lineage_str = "; ".join(lineage_str) - name = lineage_str + ' - (no further assignment)' - elif len(present) > 1 and 'superkingdom' in present: - lowest_rank = present[-1] - name = '{}; {} {}'.format(d['superkingdom'], lowest_rank, - d[lowest_rank]) - else: - lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True) - lineage_str = "; ".join(lineage_str) - name = lineage_str - - return name - - -def gather_signature(query_sig, dblist, ignore_abundance): - """ - Decompose 'query_sig' using the given list of databases. - """ - notify('loaded query: {}... (k={})', str(query_sig)[:30], - query_sig.minhash.ksize) - - # extract the basic set of mins - query_mins = set(query_sig.minhash.hashes) - n_mins = len(query_mins) - - if query_sig.minhash.track_abundance and not ignore_abundance: - orig_abunds = query_sig.minhash.hashes - else: - if query_sig.minhash.track_abundance and ignore_abundance: - notify('** ignoring abundance') - orig_abunds = { k: 1 for k in query_mins } - sum_abunds = sum(orig_abunds.values()) - - # now! do the gather: - while 1: - # find all of the assignments for the current set of hashes - assignments = defaultdict(set) - for hashval in query_mins: - for lca_db in dblist: - idx_list = lca_db.hashval_to_idx.get(hashval, []) - - for idx in idx_list: - assignments[hashval].add((lca_db, idx)) - - # none? quit. - if not assignments: - break - - # count the distinct signatures. - counts = Counter() - for hashval, match_set in assignments.items(): - for (lca_db, idx) in match_set: - counts[(lca_db, idx)] += 1 - - # collect the most abundant assignments - best_list = [] - top_count = 0 - - for (lca_db, idx), count in counts.most_common(): - if not best_list: - top_count = count - best_list.append((lca_db, idx)) - continue - - if count != top_count: - break - - best_list.append((lca_db, idx)) - - # sort on idx and pick the lowest (for consistency). - best_list.sort(key=lambda x: x[1]) - best_lca_db, best_idx = best_list[0] - - equiv_counts = len(best_list) - 1 - - # now, remove hashes from query mins. - intersect_mins = set() - for hashval, match_set in assignments.items(): - if (best_lca_db, best_idx) in match_set: - query_mins.remove(hashval) - intersect_mins.add(hashval) - - # should match! - assert top_count == len(intersect_mins) - - # calculate size of match (# of hashvals belonging to that sig) - match_size = 0 - for hashval, idx_list in best_lca_db.hashval_to_idx.items(): - if best_idx in idx_list: - match_size += 1 - - # construct 'result' object - intersect_bp = top_count * query_sig.minhash.scaled - f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \ - / sum_abunds - average_abund = sum((orig_abunds[k] for k in intersect_mins)) \ - / len(intersect_mins) - f_match = len(intersect_mins) / match_size - - # XXX name and lineage - for ident, idx in best_lca_db.ident_to_idx.items(): - if idx == best_idx: - name = best_lca_db.ident_to_name[ident] - - lid = best_lca_db.idx_to_lid.get(best_idx) - lineage = () - if lid is not None: - lineage = best_lca_db.lid_to_lineage[lid] - - result = LCAGatherResult(intersect_bp = intersect_bp, - f_unique_to_query= top_count / n_mins, - f_unique_weighted=f_unique_weighted, - average_abund=average_abund, - f_match=f_match, - lineage=lineage, - name=name, - n_equal_matches=equiv_counts) - - f_unassigned = len(query_mins) / n_mins - est_bp = len(query_mins) * query_sig.minhash.scaled - - yield result, f_unassigned, est_bp, query_mins - - ## done. - - -def gather_main(args): - """ - Do a greedy search for the hash components of a query against an LCA db. - - Here we don't actually do a least-common-ancestor search of any kind; we - do essentially the same kind of search as we do in `sourmash gather`, with - the main difference that we are implicitly combining different genomes of - identical lineages. - - This takes advantage of the structure of the LCA db, where we store the - full lineage information for each known hash, as opposed to storing only - the least-common-ancestor information for it. - """ - set_quiet(args.quiet, args.debug) - - notify("** WARNING: lca gather is deprecated as of sourmash 3.4, and will") - notify("** be removed in sourmash 4.0; use 'gather' instead.") - notify('') - - if not check_files_exist(args.query, *args.db): - sys.exit(-1) - - # load all the databases - dblist, ksize, scaled = lca_utils.load_databases(args.db, None) - - # for each query, gather all the matches across databases - moltype = dblist[0].moltype - query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype) - debug('classifying', query_sig) - - # make sure we're looking at the same scaled value as database - query_sig.minhash = query_sig.minhash.downsample(scaled=scaled) - - # do the classification, output results - found = [] - for result, f_unassigned, est_bp, remaining_mins in gather_signature(query_sig, dblist, args.ignore_abundance): - # is this our first time through the loop? print headers, if so. - if not len(found): - print_results("") - print_results("overlap p_query p_match ") - print_results("--------- ------- --------") - - # output! - pct_query = '{:.1f}%'.format(result.f_unique_to_query*100) - pct_match = '{:.1f}%'.format(result.f_match*100) - str_bp = format_bp(result.intersect_bp) - name = format_lineage(result.lineage) - - equal_match_str = "" - if result.n_equal_matches: - equal_match_str = " (** {} equal matches)".format(result.n_equal_matches) - - print_results('{:9} {:>6} {:>6} {}{}', str_bp, pct_query, - pct_match, name, equal_match_str) - - found.append(result) - - if found: - print_results('') - if f_unassigned: - print_results('{:.1f}% ({}) of hashes have no assignment.', f_unassigned*100, - format_bp(est_bp)) - else: - print_results('Query is completely assigned.') - print_results('') - # nothing found. - else: - est_bp = len(query_sig.minhash) * query_sig.minhash.scaled - print_results('') - print_results('No assignment for est {} of sequence.', - format_bp(est_bp)) - print_results('') - - if not found: - sys.exit(0) - - if args.output: - fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted', - 'average_abund', 'name', 'n_equal_matches'] + list(lca_utils.taxlist()) - - with sourmash_args.FileOutput(args.output, 'wt') as csv_fp: - w = csv.DictWriter(csv_fp, fieldnames=fieldnames) - w.writeheader() - for result in found: - lineage = result.lineage - d = dict(result._asdict()) - del d['lineage'] - - for (rank, value) in lineage: - d[rank] = value - - w.writerow(d) - - if args.output_unassigned: - if not found: - notify('nothing found - entire query signature unassigned.') - elif not remaining_mins: - notify('no unassigned hashes! not saving.') - else: - notify('saving unassigned hashes to "{}"', args.output_unassigned) - - e = query_sig.minhash.copy_and_clear() - e.add_many(remaining_mins) - - with sourmash_args.FileOutput(args.output_unassigned, 'wt') as fp: - save_signatures([ SourmashSignature(e) ], fp) - - -if __name__ == '__main__': - sys.exit(gather_main(sys.argv[1:])) diff --git a/tests/test_lca.py b/tests/test_lca.py index a222b92157..e39ad1a394 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -1807,203 +1807,9 @@ def test_compare_csv_real(): assert '0 incompatible at rank species' in err -def test_single_gather(): - with utils.TempDirectory() as location: - db1 = utils.get_test_data('lca/delmont-1.lca.json') - input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - in_dir = os.path.join(location, 'sigs') - os.mkdir(in_dir) - shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig')) - - cmd = ['lca', 'gather', input_sig, db1] - status, out, err = utils.runscript('sourmash', cmd) - - print(cmd) - print(out) - print(err) - - assert '2.0 Mbp 100.0% 100.0% Alteromonas_macleodii' in out - assert 'Query is completely assigned.' - - -def test_gather_unknown_hashes(): - with utils.TempDirectory() as location: - taxcsv = utils.get_test_data('lca-root/tax.csv') - input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig') - input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig') - lca_db = os.path.join(location, 'lca-root.lca.json') - - cmd = ['lca', 'index', taxcsv, lca_db, input_sig2] - status, out, err = utils.runscript('sourmash', cmd) - - print(cmd) - print(out) - print(err) - - assert os.path.exists(lca_db) - - assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in err - - cmd = ['lca', 'gather', input_sig1, lca_db] - status, out, err = utils.runscript('sourmash', cmd) - - print(cmd) - print(out) - print(err) - - assert '270.0 kbp 11.5% 21.4% Archaea; family novelFamily_I' in out - assert '88.5% (2.1 Mbp) of hashes have no assignment.' in out - - -def test_gather_combined_results(): - with utils.TempDirectory() as location: - query_sig = utils.get_test_data('47+63.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') - - cmd = ['lca', 'gather', query_sig, lca_db, '-o', 'matches.csv'] - status, out, err = utils.runscript('sourmash', cmd, - in_directory=location) - - print(cmd) - print(out) - print(err) - - assert '5.5 Mbp 69.4% 100.0% Shewanella baltica OS223' in out - assert '2.4 Mbp 30.6% 47.1% Shewanella baltica OS185' in out - - -def test_gather_equiv_results(): - with utils.TempDirectory() as location: - query_sig = utils.get_test_data('47+63-intersect.fa.sig') - lca_db = utils.get_test_data('lca/47+63.lca.json') - - cmd = ['lca', 'gather', query_sig, lca_db, '-o', 'matches.csv'] - status, out, err = utils.runscript('sourmash', cmd, - in_directory=location) - - print(cmd) - print(out) - print(err) - - assert '2.7 Mbp 100.0%' in out - assert 'Shewanella baltica' in out - assert '(** 1 equal matches)' in out - assert ('OS223' in out) or ('OS185' in out) - - assert os.path.exists(lca_db) - - r = csv.DictReader(open(os.path.join(location, 'matches.csv'))) - row = next(r) - assert row['n_equal_matches'] == '1' - - -def test_gather_old_lca_db(): - with utils.TempDirectory() as location: - query_sig = utils.get_test_data('47+63.fa.sig') - lca_db = utils.get_test_data('lca/old-db-format-1.json') - - cmd = ['lca', 'gather', query_sig, lca_db] - status, out, err = utils.runscript('sourmash', cmd, - in_directory=location, - fail_ok=True) - - print(cmd) - print(out) - print(err) - assert 'Error! This is an old-style LCA DB.' in err - assert status != 0 - - -@utils.in_tempdir -def test_incompat_lca_db_scaled(c): - # create a database with scaled of 10000 - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('compute', '-k', '25', '--scaled', '10000', testdata1, - '-o', 'test_db.sig') - print(c) - - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - 'test.lca.json', 'test_db.sig', - '-k', '25', '--scaled', '10000') - print(c) - - # next, create a query sig with scaled of 100000 - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('compute', '-k', '25', '--scaled', '100000', testdata1, - '-o', 'test_query.sig') - print(c) - - with pytest.raises(ValueError) as e: - c.run_sourmash('lca', 'gather', 'test_query.sig', 'test.lca.json') - print(c) - - assert 'new scaled 10000 is lower than current sample scaled 10000' in str(e.value) - - -@utils.in_thisdir -def test_lca_gather_protein(c): - # test lca gather on protein foo - testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db1 = utils.get_test_data('prot/protein.lca.json.gz') - - c.run_sourmash('lca', 'gather', testquery, db1) - - assert c.last_result.status == 0 - assert 'loaded 1 LCA databases. ksize=19, scaled=100 moltype=protein' in c.last_result.err - assert '340.9 kbp 100.0% 100.0% s__B26-1 sp001593925 sp.' in c.last_result.out - - -@utils.in_thisdir -def test_lca_gather_deprecated_message(c): - # lca gather is deprecated for 4.0; check message - testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db1 = utils.get_test_data('prot/protein.lca.json.gz') - - c.run_sourmash('lca', 'gather', testquery, db1) - - assert c.last_result.status == 0 - assert 'WARNING: lca gather is deprecated as of sourmash 3.4' in c.last_result.err - - -@utils.in_thisdir -def test_incompat_lca_db_moltype(c): - # test load of incompatible LCA DBs - testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') - db1 = utils.get_test_data('prot/protein.lca.json.gz') - db2 = utils.get_test_data('prot/dayhoff.lca.json.gz') - - with pytest.raises(ValueError) as e: - c.run_sourmash('lca', 'gather', testquery, db1, db2) - - assert 'Exception: multiple moltypes, quitting' in str(e.value) - - -@utils.in_tempdir -def test_incompat_lca_db_ksize(c): - # create a database with ksize of 25 - testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') - c.run_sourmash('compute', '-k', '25', '--scaled', '1000', testdata1, - '-o', 'test_db.sig') - print(c) - - c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',), - 'test.lca.json', 'test_db.sig', - '-k', '25', '--scaled', '10000') - print(c) - - # this should fail: the LCA database has ksize 25, and the query sig has - # no compatible ksizes. - with pytest.raises(ValueError) as e: - c.run_sourmash('lca', 'gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), 'test.lca.json') - print(c.last_result) - - assert '0 signatures matching ksize and molecule type;' in str(e.value) - - @utils.in_tempdir def test_incompat_lca_db_ksize_2(c): - # test on gather, not just lca gather - # create a database with ksize of 25 + # test on gather - create a database with ksize of 25 testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz') c.run_sourmash('compute', '-k', '25', '--scaled', '1000', testdata1, '-o', 'test_db.sig')