From 55a26159b89a267acd6b2bb3e32f301e2aa27100 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 6 Feb 2021 11:11:31 -0800
Subject: [PATCH] Remove lca gather (#1307)

* remove lca gather
* remove lca gather from docs and comments
---
 doc/classifying-signatures.md      |   7 +-
 doc/command-line.md                |  38 +---
 src/sourmash/cli/lca/__init__.py   |   1 -
 src/sourmash/cli/lca/gather.py     |  32 ----
 src/sourmash/lca/__init__.py       |   1 -
 src/sourmash/lca/__main__.py       |   3 +-
 src/sourmash/lca/command_gather.py | 280 -----------------------------
 tests/test_lca.py                  | 196 +-------------------
 8 files changed, 6 insertions(+), 552 deletions(-)
 delete mode 100644 src/sourmash/cli/lca/gather.py
 delete mode 100644 src/sourmash/lca/command_gather.py
diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index de0301439e..ade98e31d2 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -81,9 +81,6 @@ differences between the `sourmash lca` subcommands and the basic
 output structured taxonomic information, and these are what you should look
 to if you are interested in doing classification.
 
-The command `lca gather` applies the `gather` algorithm to search an
-LCA database; it reports taxonomy.
-
 It's important to note that taxonomy based on k-mers is very, very
 specific and if you get a match, it's pretty reliable. On the
 converse, however, k-mer identification is very brittle with respect
@@ -120,8 +117,8 @@ containment queries against genome databases.  This will give you
 numbers that (approximately) match what you get from counting mapped
 reads.
 
-If you compute your input signatures with `--track-abundance`, both
-`sourmash gather` and `sourmash lca gather` will use that information
+If you compute your input signatures with `--track-abundance`,
+`sourmash gather` will use that information
 to calculate an abundance-weighted result.  This will weight
 each match to a hash value by the multiplicity of the hash value in
 the query signature.  You can turn off this behavior with
diff --git a/doc/command-line.md b/doc/command-line.md
index 1e0a8dcc70..4e1ce714a6 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -77,7 +77,6 @@ walkthrough of these commands.
 
 * `lca classify` classifies many signatures against an LCA database.
 * `lca summarize` summarizes the content of metagenomes using an LCA database.
-* `lca gather` finds non-overlapping matches to a metagenome in an LCA database.
 * `lca index` creates a database for use with LCA subcommands.
 * `lca rankinfo` summarizes the content of a database.
 * `lca compare_csv` compares lineage spreadsheets, e.g. those output by `lca classify`.
@@ -261,8 +260,8 @@ Note:
 
 Use `sourmash gather` to classify a metagenome against a collection of
 genomes with no (or incomplete) taxonomic information.  Use `sourmash
-lca summarize` and `sourmash lca gather` to classify a metagenome
-using a collection of genomes with taxonomic information.
+lca summarize` to classify a metagenome using a collection of genomes
+with taxonomic information.
 
 ## `sourmash lca` subcommands for taxonomic classification
 
@@ -431,39 +430,6 @@ text file passed to `sourmash lca summarize` with the
 `--query-from-file` flag; these files will be appended to the `--query`
 input.
 
-### `sourmash lca gather` - find metagenome taxonomy (DEPRECATED for 4.0)
-
-The `sourmash lca gather` command finds all non-overlapping
-matches to the query, similar to the `sourmash gather` command.  This
-is specifically meant for metagenome and genome bin analysis.  (See
-[Classifying Signatures](classifying-signatures.md) for more
-information on the different approaches that can be used here.)
-
-If the input signature was computed with `--track-abundance`, output
-will be abundance weighted (unless `--ignore-abundances` is
-specified).  `-o/--output` will create a CSV file containing the
-matches.
-
-Usage:
-
-```
-sourmash lca gather query.sig [<lca database> ...]
-```
-
-Example output:
-
-```
-overlap     p_query p_match
----------   ------- --------
-1.8 Mbp      14.6%    9.1%      Fusobacterium nucleatum
-1.0 Mbp       7.8%   16.3%      Proteiniclasticum ruminis
-1.0 Mbp       7.7%   25.9%      Haloferax volcanii
-0.9 Mbp       7.4%   11.8%      Nostoc sp. PCC 7120
-0.9 Mbp       7.0%    5.8%      Shewanella baltica
-0.8 Mbp       6.0%    8.6%      Desulfovibrio vulgaris
-0.6 Mbp       4.9%   12.6%      Thermus thermophilus
-```
-
 ### `sourmash lca index` - build an LCA database
 
 The `sourmash lca index` command creates an LCA database from
diff --git a/src/sourmash/cli/lca/__init__.py b/src/sourmash/cli/lca/__init__.py
index a02e467bb7..a403876d02 100644
--- a/src/sourmash/cli/lca/__init__.py
+++ b/src/sourmash/cli/lca/__init__.py
@@ -6,7 +6,6 @@
 
 from . import classify
 from . import compare_csv
-from . import gather
 from . import index
 from . import rankinfo
 from . import summarize
diff --git a/src/sourmash/cli/lca/gather.py b/src/sourmash/cli/lca/gather.py
deleted file mode 100644
index 0f9c27057a..0000000000
--- a/src/sourmash/cli/lca/gather.py
+++ /dev/null
@@ -1,32 +0,0 @@
-"""classify metagenomes"""
-
-
-def subparser(subparsers):
-    subparser = subparsers.add_parser('gather')
-    subparser.add_argument('query')
-    subparser.add_argument('db', nargs='+')
-    subparser.add_argument(
-        '-q', '--quiet', action='store_true',
-        help='suppress non-error output')
-    subparser.add_argument(
-        '-d', '--debug', action='store_true',
-        help='output debugging output'
-    )
-    subparser.add_argument(
-        '-o', '--output', metavar='FILE',
-        help='output CSV containing matches to this file'
-    )
-    subparser.add_argument(
-        '--output-unassigned', metavar='FILE',
-        help='output unassigned portions of the query as a signature to this '
-        'file'
-    )
-    subparser.add_argument(
-        '--ignore-abundance', action='store_true',
-        help='do NOT use k-mer abundances if present'
-    )
-
-
-def main(args):
-    import sourmash
-    return sourmash.lca.gather_main(args)
diff --git a/src/sourmash/lca/__init__.py b/src/sourmash/lca/__init__.py
index e595ef8b9b..c2f3324b0a 100644
--- a/src/sourmash/lca/__init__.py
+++ b/src/sourmash/lca/__init__.py
@@ -9,6 +9,5 @@
 from .command_classify import classify
 from .command_summarize import summarize_main
 from .command_rankinfo import rankinfo_main
-from .command_gather import gather_main
 from .__main__ import main
 
diff --git a/src/sourmash/lca/__main__.py b/src/sourmash/lca/__main__.py
index 40277bd38e..b02b891771 100644
--- a/src/sourmash/lca/__main__.py
+++ b/src/sourmash/lca/__main__.py
@@ -5,7 +5,7 @@
 import sys
 import argparse
 
-from . import classify, index, summarize_main, rankinfo_main, gather_main
+from . import classify, index, summarize_main, rankinfo_main
 from .command_compare_csv import compare_csv
 from ..logging import set_quiet, error
 
@@ -16,7 +16,6 @@
 
 index <taxonomy.csv> <output_db name> <signature [...]>  - create LCA database
 classify --db <db_name [...]> --query <signature [...]>  - classify genomes
-gather <signature> <db_name [...]>                       - classify metagenomes
 summarize --db <db_name [...]> --query <signature [...]> - summarize mixture
 rankinfo <db_name [...]>                                 - database rank info
 compare_csv <csv1> <csv2>                                - compare spreadsheets
diff --git a/src/sourmash/lca/command_gather.py b/src/sourmash/lca/command_gather.py
deleted file mode 100644
index 57454138e6..0000000000
--- a/src/sourmash/lca/command_gather.py
+++ /dev/null
@@ -1,280 +0,0 @@
-#! /usr/bin/env python
-"""
-Execute a greedy search on lineages attached to hashvals in the query.
-
-Mimics `sourmash gather` but provides taxonomic information.
-"""
-import sys
-import csv
-from collections import Counter, defaultdict, namedtuple
-
-from .. import sourmash_args, save_signatures, SourmashSignature
-from ..logging import notify, print_results, set_quiet, debug
-from . import lca_utils
-from .lca_utils import check_files_exist
-from ..search import format_bp
-
-
-LCAGatherResult = namedtuple('LCAGatherResult',
-                             'intersect_bp, f_unique_to_query, f_unique_weighted, average_abund, lineage, f_match, name, n_equal_matches')
-
-
-def format_lineage(lineage_tup):
-    """
-    Pretty print lineage.
-    """
-    # list of ranks present
-    present = [ l.rank for l in lineage_tup if l.name ]
-    d = dict(lineage_tup) # rank: value
-
-    if 'genus' in present:
-        genus = d['genus']
-        if 'strain' in present:
-            name = d['strain']
-        elif 'species' in present:
-            species = d['species']
-            if species.startswith(genus + ' ') or \
-              species.startswith(genus + '_'):
-                name = species
-            else:
-                name = '{} {}'.format(genus, species)
-        else:
-            name = '{} sp.'.format(genus)
-    elif len(present) < 3:
-        lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True)
-        lineage_str = "; ".join(lineage_str)
-        name = lineage_str + ' - (no further assignment)'
-    elif len(present) > 1 and 'superkingdom' in present:
-        lowest_rank = present[-1]
-        name = '{}; {} {}'.format(d['superkingdom'], lowest_rank,
-                                   d[lowest_rank])
-    else:
-        lineage_str = lca_utils.zip_lineage(lineage_tup, truncate_empty=True)
-        lineage_str = "; ".join(lineage_str)
-        name = lineage_str
-
-    return name
-
-
-def gather_signature(query_sig, dblist, ignore_abundance):
-    """
-    Decompose 'query_sig' using the given list of databases.
-    """
-    notify('loaded query: {}... (k={})', str(query_sig)[:30],
-                                         query_sig.minhash.ksize)
-
-    # extract the basic set of mins
-    query_mins = set(query_sig.minhash.hashes)
-    n_mins = len(query_mins)
-
-    if query_sig.minhash.track_abundance and not ignore_abundance:
-        orig_abunds = query_sig.minhash.hashes
-    else:
-        if query_sig.minhash.track_abundance and ignore_abundance:
-            notify('** ignoring abundance')
-        orig_abunds = { k: 1 for k in query_mins }
-    sum_abunds = sum(orig_abunds.values())
-
-    # now! do the gather:
-    while 1:
-        # find all of the assignments for the current set of hashes
-        assignments = defaultdict(set)
-        for hashval in query_mins:
-            for lca_db in dblist:
-                idx_list = lca_db.hashval_to_idx.get(hashval, [])
-
-                for idx in idx_list:
-                    assignments[hashval].add((lca_db, idx))
-
-        # none? quit.
-        if not assignments:
-            break
-
-        # count the distinct signatures.
-        counts = Counter()
-        for hashval, match_set in assignments.items():
-            for (lca_db, idx) in match_set:
-                counts[(lca_db, idx)] += 1
-
-        # collect the most abundant assignments
-        best_list = []
-        top_count = 0
-
-        for (lca_db, idx), count in counts.most_common():
-            if not best_list:
-                top_count = count
-                best_list.append((lca_db, idx))
-                continue
-
-            if count != top_count:
-                break
-
-            best_list.append((lca_db, idx))
-
-        # sort on idx and pick the lowest (for consistency).
-        best_list.sort(key=lambda x: x[1])
-        best_lca_db, best_idx = best_list[0]
-
-        equiv_counts = len(best_list) - 1
-
-        # now, remove hashes from query mins.
-        intersect_mins = set()
-        for hashval, match_set in assignments.items():
-            if (best_lca_db, best_idx) in match_set:
-                query_mins.remove(hashval)
-                intersect_mins.add(hashval)
-
-        # should match!
-        assert top_count == len(intersect_mins)
-
-        # calculate size of match (# of hashvals belonging to that sig)
-        match_size = 0
-        for hashval, idx_list in best_lca_db.hashval_to_idx.items():
-            if best_idx in idx_list:
-                match_size += 1
-
-        # construct 'result' object
-        intersect_bp = top_count * query_sig.minhash.scaled
-        f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \
-               / sum_abunds
-        average_abund = sum((orig_abunds[k] for k in intersect_mins)) \
-               / len(intersect_mins)
-        f_match = len(intersect_mins) / match_size
-
-        # XXX name and lineage
-        for ident, idx in best_lca_db.ident_to_idx.items():
-            if idx == best_idx:
-                name = best_lca_db.ident_to_name[ident]
-
-        lid = best_lca_db.idx_to_lid.get(best_idx)
-        lineage = ()
-        if lid is not None:
-            lineage = best_lca_db.lid_to_lineage[lid]
-
-        result = LCAGatherResult(intersect_bp = intersect_bp,
-                                 f_unique_to_query= top_count / n_mins,
-                                 f_unique_weighted=f_unique_weighted,
-                                 average_abund=average_abund,
-                                 f_match=f_match,
-                                 lineage=lineage,
-                                 name=name,
-                                 n_equal_matches=equiv_counts)
-
-        f_unassigned = len(query_mins) / n_mins
-        est_bp = len(query_mins) * query_sig.minhash.scaled
-
-        yield result, f_unassigned, est_bp, query_mins
-
-    ## done.
-
-
-def gather_main(args):
-    """
-    Do a greedy search for the hash components of a query against an LCA db.
-
-    Here we don't actually do a least-common-ancestor search of any kind; we
-    do essentially the same kind of search as we do in `sourmash gather`, with
-    the main difference that we are implicitly combining different genomes of
-    identical lineages.
-
-    This takes advantage of the structure of the LCA db, where we store the
-    full lineage information for each known hash, as opposed to storing only
-    the least-common-ancestor information for it.
-    """
-    set_quiet(args.quiet, args.debug)
-
-    notify("** WARNING: lca gather is deprecated as of sourmash 3.4, and will")
-    notify("**    be removed in sourmash 4.0; use 'gather' instead.")
-    notify('')
-
-    if not check_files_exist(args.query, *args.db):
-        sys.exit(-1)
-
-    # load all the databases
-    dblist, ksize, scaled = lca_utils.load_databases(args.db, None)
-
-    # for each query, gather all the matches across databases
-    moltype = dblist[0].moltype
-    query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype)
-    debug('classifying', query_sig)
-
-    # make sure we're looking at the same scaled value as database
-    query_sig.minhash = query_sig.minhash.downsample(scaled=scaled)
-
-    # do the classification, output results
-    found = []
-    for result, f_unassigned, est_bp, remaining_mins in gather_signature(query_sig, dblist, args.ignore_abundance):
-        # is this our first time through the loop? print headers, if so.
-        if not len(found):
-            print_results("")
-            print_results("overlap     p_query p_match ")
-            print_results("---------   ------- --------")
-
-        # output!
-        pct_query = '{:.1f}%'.format(result.f_unique_to_query*100)
-        pct_match = '{:.1f}%'.format(result.f_match*100)
-        str_bp = format_bp(result.intersect_bp)
-        name = format_lineage(result.lineage)
-
-        equal_match_str = ""
-        if result.n_equal_matches:
-            equal_match_str = " (** {} equal matches)".format(result.n_equal_matches)
-
-        print_results('{:9}   {:>6}  {:>6}      {}{}', str_bp, pct_query,
-                      pct_match, name, equal_match_str)
-
-        found.append(result)
-
-    if found:
-        print_results('')
-        if f_unassigned:
-            print_results('{:.1f}% ({}) of hashes have no assignment.', f_unassigned*100,
-                          format_bp(est_bp))
-        else:
-            print_results('Query is completely assigned.')
-            print_results('')
-    # nothing found.
-    else:
-        est_bp = len(query_sig.minhash) * query_sig.minhash.scaled
-        print_results('')
-        print_results('No assignment for est {} of sequence.',
-                      format_bp(est_bp))
-        print_results('')
-
-    if not found:
-        sys.exit(0)
-
-    if args.output:
-        fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'name', 'n_equal_matches'] + list(lca_utils.taxlist())
-
-        with sourmash_args.FileOutput(args.output, 'wt') as csv_fp:
-            w = csv.DictWriter(csv_fp, fieldnames=fieldnames)
-            w.writeheader()
-            for result in found:
-                lineage = result.lineage
-                d = dict(result._asdict())
-                del d['lineage']
-
-                for (rank, value) in lineage:
-                    d[rank] = value
-
-                w.writerow(d)
-
-    if args.output_unassigned:
-        if not found:
-            notify('nothing found - entire query signature unassigned.')
-        elif not remaining_mins:
-            notify('no unassigned hashes! not saving.')
-        else:
-            notify('saving unassigned hashes to "{}"', args.output_unassigned)
-
-            e = query_sig.minhash.copy_and_clear()
-            e.add_many(remaining_mins)
-
-            with sourmash_args.FileOutput(args.output_unassigned, 'wt') as fp:
-                save_signatures([ SourmashSignature(e) ], fp)
-
-
-if __name__ == '__main__':
-    sys.exit(gather_main(sys.argv[1:]))
diff --git a/tests/test_lca.py b/tests/test_lca.py
index a222b92157..e39ad1a394 100644
--- a/tests/test_lca.py
+++ b/tests/test_lca.py
@@ -1807,203 +1807,9 @@ def test_compare_csv_real():
         assert '0 incompatible at rank species' in err
 
 
-def test_single_gather():
-    with utils.TempDirectory() as location:
-        db1 = utils.get_test_data('lca/delmont-1.lca.json')
-        input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
-        in_dir = os.path.join(location, 'sigs')
-        os.mkdir(in_dir)
-        shutil.copyfile(input_sig, os.path.join(in_dir, 'q.sig'))
-
-        cmd = ['lca', 'gather', input_sig, db1]
-        status, out, err = utils.runscript('sourmash', cmd)
-
-        print(cmd)
-        print(out)
-        print(err)
-
-        assert '2.0 Mbp     100.0%  100.0%      Alteromonas_macleodii' in out
-        assert 'Query is completely assigned.'
-
-
-def test_gather_unknown_hashes():
-    with utils.TempDirectory() as location:
-        taxcsv = utils.get_test_data('lca-root/tax.csv')
-        input_sig1 = utils.get_test_data('lca-root/TARA_MED_MAG_00029.fa.sig')
-        input_sig2 = utils.get_test_data('lca-root/TOBG_MED-875.fna.gz.sig')
-        lca_db = os.path.join(location, 'lca-root.lca.json')
-
-        cmd = ['lca', 'index', taxcsv, lca_db, input_sig2]
-        status, out, err = utils.runscript('sourmash', cmd)
-
-        print(cmd)
-        print(out)
-        print(err)
-
-        assert os.path.exists(lca_db)
-
-        assert '1 identifiers used out of 2 distinct identifiers in spreadsheet.' in err
-
-        cmd = ['lca', 'gather', input_sig1, lca_db]
-        status, out, err = utils.runscript('sourmash', cmd)
-
-        print(cmd)
-        print(out)
-        print(err)
-
-        assert '270.0 kbp    11.5%   21.4%      Archaea; family novelFamily_I' in out
-        assert '88.5% (2.1 Mbp) of hashes have no assignment.' in out
-
-
-def test_gather_combined_results():
-    with utils.TempDirectory() as location:
-        query_sig = utils.get_test_data('47+63.fa.sig')
-        lca_db = utils.get_test_data('lca/47+63.lca.json')
-
-        cmd = ['lca', 'gather', query_sig, lca_db, '-o', 'matches.csv']
-        status, out, err = utils.runscript('sourmash', cmd,
-                                           in_directory=location)
-
-        print(cmd)
-        print(out)
-        print(err)
-
-        assert '5.5 Mbp      69.4%  100.0%      Shewanella baltica OS223' in out
-        assert '2.4 Mbp      30.6%   47.1%      Shewanella baltica OS185' in out
-
-
-def test_gather_equiv_results():
-    with utils.TempDirectory() as location:
-        query_sig = utils.get_test_data('47+63-intersect.fa.sig')
-        lca_db = utils.get_test_data('lca/47+63.lca.json')
-
-        cmd = ['lca', 'gather', query_sig, lca_db, '-o', 'matches.csv']
-        status, out, err = utils.runscript('sourmash', cmd,
-                                           in_directory=location)
-
-        print(cmd)
-        print(out)
-        print(err)
-
-        assert '2.7 Mbp     100.0%' in out
-        assert 'Shewanella baltica' in out
-        assert '(** 1 equal matches)' in out
-        assert ('OS223' in out) or ('OS185' in out)
-
-        assert os.path.exists(lca_db)
-
-        r = csv.DictReader(open(os.path.join(location, 'matches.csv')))
-        row = next(r)
-        assert row['n_equal_matches'] == '1'
-
-
-def test_gather_old_lca_db():
-    with utils.TempDirectory() as location:
-        query_sig = utils.get_test_data('47+63.fa.sig')
-        lca_db = utils.get_test_data('lca/old-db-format-1.json')
-
-        cmd = ['lca', 'gather', query_sig, lca_db]
-        status, out, err = utils.runscript('sourmash', cmd,
-                                           in_directory=location,
-                                           fail_ok=True)
-
-        print(cmd)
-        print(out)
-        print(err)
-        assert 'Error! This is an old-style LCA DB.' in err
-        assert status != 0
-
-
-@utils.in_tempdir
-def test_incompat_lca_db_scaled(c):
-    # create a database with scaled of 10000
-    testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
-    c.run_sourmash('compute', '-k', '25', '--scaled', '10000', testdata1,
-                   '-o', 'test_db.sig')
-    print(c)
-
-    c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',),
-                   'test.lca.json', 'test_db.sig',
-                    '-k', '25', '--scaled', '10000')
-    print(c)
-
-    # next, create a query sig with scaled of 100000
-    testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
-    c.run_sourmash('compute', '-k', '25', '--scaled', '100000', testdata1,
-                   '-o', 'test_query.sig')
-    print(c)
-
-    with pytest.raises(ValueError) as e:
-        c.run_sourmash('lca', 'gather', 'test_query.sig', 'test.lca.json')
-        print(c)
-
-    assert 'new scaled 10000 is lower than current sample scaled 10000' in str(e.value)
-
-
-@utils.in_thisdir
-def test_lca_gather_protein(c):
-    # test lca gather on protein foo
-    testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db1 = utils.get_test_data('prot/protein.lca.json.gz')
-
-    c.run_sourmash('lca', 'gather', testquery, db1)
-
-    assert c.last_result.status == 0
-    assert 'loaded 1 LCA databases. ksize=19, scaled=100 moltype=protein' in c.last_result.err
-    assert '340.9 kbp   100.0%  100.0%      s__B26-1 sp001593925 sp.' in c.last_result.out
-
-
-@utils.in_thisdir
-def test_lca_gather_deprecated_message(c):
-    # lca gather is deprecated for 4.0; check message
-    testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db1 = utils.get_test_data('prot/protein.lca.json.gz')
-
-    c.run_sourmash('lca', 'gather', testquery, db1)
-
-    assert c.last_result.status == 0
-    assert 'WARNING: lca gather is deprecated as of sourmash 3.4' in c.last_result.err
-
-
-@utils.in_thisdir
-def test_incompat_lca_db_moltype(c):
-    # test load of incompatible LCA DBs
-    testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
-    db1 = utils.get_test_data('prot/protein.lca.json.gz')
-    db2 = utils.get_test_data('prot/dayhoff.lca.json.gz')
-
-    with pytest.raises(ValueError) as e:
-        c.run_sourmash('lca', 'gather', testquery, db1, db2)
-
-    assert 'Exception: multiple moltypes, quitting' in str(e.value)
-
-
-@utils.in_tempdir
-def test_incompat_lca_db_ksize(c):
-    # create a database with ksize of 25
-    testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
-    c.run_sourmash('compute', '-k', '25', '--scaled', '1000', testdata1,
-                   '-o', 'test_db.sig')
-    print(c)
-
-    c.run_sourmash('lca', 'index', utils.get_test_data('lca/delmont-1.csv',),
-                   'test.lca.json', 'test_db.sig',
-                    '-k', '25', '--scaled', '10000')
-    print(c)
-
-    # this should fail: the LCA database has ksize 25, and the query sig has
-    # no compatible ksizes.
-    with pytest.raises(ValueError) as e:
-        c.run_sourmash('lca', 'gather', utils.get_test_data('lca/TARA_ASE_MAG_00031.sig'), 'test.lca.json')
-    print(c.last_result)
-
-    assert '0 signatures matching ksize and molecule type;' in str(e.value)
-
-
 @utils.in_tempdir
 def test_incompat_lca_db_ksize_2(c):
-    # test on gather, not just lca gather
-    # create a database with ksize of 25
+    # test on gather - create a database with ksize of 25
     testdata1 = utils.get_test_data('lca/TARA_ASE_MAG_00031.fa.gz')
     c.run_sourmash('compute', '-k', '25', '--scaled', '1000', testdata1,
                    '-o', 'test_db.sig')