From 6164f880c2fd460244ac446e81634372bc486e35 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 09:46:25 -0700
Subject: [PATCH 01/15] initial addition of tax summarize

---
 src/sourmash/cli/tax/__init__.py  |  1 +
 src/sourmash/cli/tax/summarize.py | 53 +++++++++++++++++++++++++++++++
 src/sourmash/tax/__main__.py      | 42 +++++++++++++++++++++++-
 3 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 src/sourmash/cli/tax/summarize.py
diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py
index 3cdd006215..b8bf95f8d8 100644
--- a/src/sourmash/cli/tax/__init__.py
+++ b/src/sourmash/cli/tax/__init__.py
@@ -9,6 +9,7 @@
 from . import annotate
 from . import prepare
 from . import grep
+from . import summarize
 
 from ..utils import command_list
 from argparse import SUPPRESS, RawDescriptionHelpFormatter
diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py
new file mode 100644
index 0000000000..6e3a0a3e8c
--- /dev/null
+++ b/src/sourmash/cli/tax/summarize.py
@@ -0,0 +1,53 @@
+"""summarize taxonomy/lineage information"""
+
+usage="""
+
+    sourmash tax summarize <taxonomy_file> [ <more files> ... ]
+
+The 'tax summarize' command reads in one or more taxonomy databases
+or lineage files (produced by 'tax annotate'), combines them,
+and produces a human readable summary.
+
+Please see the 'tax summarize' documentation for more details:
+  https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-summarize
+@CTB check URL :)
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+    subparser = subparsers.add_parser('summarize',
+                                      usage=usage)
+    subparser.add_argument(
+        '-q', '--quiet', action='store_true',
+        help='suppress non-error output'
+    )
+    subparser.add_argument(
+        'taxonomy_files',
+        metavar='FILE',
+        nargs="+", action="extend",
+        help='database lineages'
+    )
+    subparser.add_argument(
+        '-o', '--output',
+        help='output file',
+        # @CTB implement
+    )
+    subparser.add_argument(
+        '--keep-full-identifiers', action='store_true',
+        help='do not split identifiers on whitespace'
+    )
+    subparser.add_argument(
+        '--keep-identifier-versions', action='store_true',
+        help='after splitting identifiers, do not remove accession versions'
+    )
+    subparser.add_argument(
+        '-f', '--force', action = 'store_true',
+        help='continue past errors in file and taxonomy loading',
+    )
+
+def main(args):
+    import sourmash
+    return sourmash.tax.__main__.summarize(args)
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index f9a274d5c6..b8c3d8e8d5 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -9,7 +9,7 @@
 
 import sourmash
 from ..sourmash_args import FileOutputCSV, FileOutput
-from sourmash.logging import set_quiet, error, notify
+from sourmash.logging import set_quiet, error, notify, print_results
 from sourmash.lca.lca_utils import display_lineage
 
 from . import tax_utils
@@ -438,6 +438,46 @@ def search_pattern(l, r):
         notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'")
 
 
+def summarize(args):
+    "Summarize multiple taxonomy databases."
+    notify("loading taxonomies...")
+    try:
+        tax_assign = MultiLineageDB.load(args.taxonomy_files,
+                                         force=args.force,
+                       keep_full_identifiers=args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions)
+    except ValueError as exc:
+        error("ERROR while loading taxonomies!")
+        error(str(exc))
+        sys.exit(-1)
+
+    notify(f"...loaded {len(tax_assign)} entries.")
+
+    print_results(f"num idents: {len(tax_assign)}")
+
+    rank_counts = defaultdict(int)
+    name_seen = set()
+    for v in tax_assign.values():
+        sofar = []
+        for rank, name in v:
+            if name not in name_seen:
+                rank_counts[rank] += 1
+                name_seen.add(name)
+
+            if 0:
+                # check duplicates?
+                sofar.append(name)
+                tup = tuple(sofar)
+                seen.add(tuple(sofar))
+        #break
+
+    rank_count_items = list(rank_counts.items())
+    rank_count_items.sort(key=lambda x: x[1])
+    for rank, count in rank_count_items:
+        rank_name_str = f"{rank}:"
+        print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers")
+
+
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)

From 06159c1456bf2def8043efd720e4a35bfa42092e Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 09:46:35 -0700
Subject: [PATCH 02/15] report oddball CSV headers

---
 src/sourmash/tax/tax_utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 5dc85ceb63..0bf9038345 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -732,7 +732,8 @@ def load(cls, filename, *, delimiter=',', force=False,
                     identifier = 'accession'
                     header = ["ident" if "accession" == x else x for x in header]
                 else:
-                    raise ValueError('No taxonomic identifiers found.')
+                    header_str = ",".join([repr(x) for x in header])
+                    raise ValueError(f'No taxonomic identifiers found; headers are {header_str}')
             # is "strain" an available rank?
             if "strain" in header:
                 include_strain=True

From 2dad65ecfad735522c02127eabef4c82411d1979 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 10:17:27 -0700
Subject: [PATCH 03/15] add provisional support for loading
 gather-with-lineages files as native taxonomies

---
 src/sourmash/tax/tax_utils.py | 64 ++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 0bf9038345..a832678987 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -731,6 +731,9 @@ def load(cls, filename, *, delimiter=',', force=False,
                 elif 'accession' in header:
                     identifier = 'accession'
                     header = ["ident" if "accession" == x else x for x in header]
+                elif 'name' in header and 'lineage' in header:
+                    return cls.load_from_gather_with_lineages(filename,
+                                                              force=force)
                 else:
                     header_str = ",".join([repr(x) for x in header])
                     raise ValueError(f'No taxonomic identifiers found; headers are {header_str}')
@@ -738,7 +741,7 @@ def load(cls, filename, *, delimiter=',', force=False,
             if "strain" in header:
                 include_strain=True
 
-           # check that all ranks are in header
+            # check that all ranks are in header
             ranks = list(lca_utils.taxlist(include_strain=include_strain))
             if not set(ranks).issubset(header):
                 # for now, just raise err if not all ranks are present.
@@ -763,6 +766,7 @@ def load(cls, filename, *, delimiter=',', force=False,
                     ident = row[identifier]
 
                     # fold, spindle, and mutilate ident?
+                    # @CTB: replace with get_ident?
                     if not keep_full_identifiers:
                         ident = ident.split(' ')[0]
 
@@ -796,6 +800,64 @@ def load(cls, filename, *, delimiter=',', force=False,
         return LineageDB(assignments, ranks)
 
 
+    @classmethod
+    def load_from_gather_with_lineages(cls, filename, *, force=False):
+        """
+        Load an annotated gather-with-lineages CSV file produced by
+        'tax annotate' into a LineageDB.
+        """
+        include_strain = False
+
+        if not os.path.exists(filename):
+            raise ValueError(f"'{filename}' does not exist")
+
+        if os.path.isdir(filename):
+            raise ValueError(f"'{filename}' is a directory")
+
+        with sourmash_args.FileInputCSV(filename) as r:
+            header = r.fieldnames
+            if not header:
+                raise ValueError(f'cannot read taxonomy assignments from {filename}')
+
+            if "name" not in header or "lineage" not in header:
+                raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?")
+
+            ranks = list(lca_utils.taxlist(include_strain=include_strain))
+            assignments = {}
+            num_rows = 0
+            n_species = 0
+            n_strains = 0
+
+            # now parse and load lineages
+            for n, row in enumerate(r):
+                if row:
+                    num_rows += 1
+
+                    name = row['name']
+                    ident = get_ident(name)
+                    lineage = row['lineage']
+                    lineage = lca_utils.make_lineage(lineage)
+
+                    print(ident, row['lineage'])
+
+                    # check duplicates
+                    if ident in assignments:
+                        if assignments[ident] != tuple(lineage):
+                            # @CTB what to do here...
+                            if not force:
+                                raise ValueError(f"multiple lineages for identifier {ident}")
+                    else:
+                        assignments[ident] = tuple(lineage)
+
+                        if lineage[-1].rank == 'species':
+                            n_species += 1
+                        elif lineage[-1].rank == 'strain':
+                            n_species += 1
+                            n_strains += 1
+
+        return LineageDB(assignments, ranks)
+
+
 class LineageDB_Sqlite(abc.Mapping):
     """
     A LineageDB based on a sqlite3 database with a 'sourmash_taxonomy' table.

From 07e11b7ddb5698227e61489be4a694a5a3341c83 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 10:18:26 -0700
Subject: [PATCH 04/15] remove print

---
 src/sourmash/tax/tax_utils.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index a832678987..0605d3ce9d 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -838,8 +838,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False):
                     lineage = row['lineage']
                     lineage = lca_utils.make_lineage(lineage)
 
-                    print(ident, row['lineage'])
-
                     # check duplicates
                     if ident in assignments:
                         if assignments[ident] != tuple(lineage):

From 15ce0ebbb77982680414137d7f69a7c1beae638a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 16:56:25 -0700
Subject: [PATCH 05/15] add per-rank lineage info

---
 src/sourmash/cli/tax/summarize.py |  4 ++--
 src/sourmash/tax/__main__.py      | 28 ++++++++++++++++++++++++++--
 2 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py
index 6e3a0a3e8c..771164172c 100644
--- a/src/sourmash/cli/tax/summarize.py
+++ b/src/sourmash/cli/tax/summarize.py
@@ -31,8 +31,8 @@ def subparser(subparsers):
         help='database lineages'
     )
     subparser.add_argument(
-        '-o', '--output',
-        help='output file',
+        '-o', '--output-lineage-information',
+        help='output a CSV file containing individual lineage counts',
         # @CTB implement
     )
     subparser.add_argument(
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index b8c3d8e8d5..754f1a58e4 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -4,13 +4,13 @@
 import sys
 import csv
 import os
-from collections import defaultdict
+from collections import defaultdict, Counter
 import re
 
 import sourmash
 from ..sourmash_args import FileOutputCSV, FileOutput
 from sourmash.logging import set_quiet, error, notify, print_results
-from sourmash.lca.lca_utils import display_lineage
+from sourmash.lca.lca_utils import display_lineage, zip_lineage
 
 from . import tax_utils
 from .tax_utils import ClassificationResult, MultiLineageDB
@@ -478,6 +478,30 @@ def summarize(args):
         print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers")
 
 
+    if args.output_lineage_information:
+        notify("now calculating detailed lineage counts...")
+        lineage_counts = Counter()
+        for v in tax_assign.values():
+            tup = v
+            while tup:
+                lineage_counts[tup] += 1
+                tup = tup[:-1]
+        notify("...done!")
+
+        with FileOutputCSV(args.output_lineage_information) as fp:
+            w = csv.writer(fp)
+            w.writerow(['rank', 'count', 'lineage'])
+
+            # output in order of most common
+            for lineage, count in lineage_counts.most_common():
+                rank = lineage[-1].rank
+                lin = ";".join(zip_lineage(lineage, truncate_empty=True))
+                w.writerow([rank, str(count), lin])
+
+        n = len(lineage_counts)
+        notify(f"saved {n} lineage counts to '{args.output_lineage_information}'")
+
+
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)

From b5ac05cddfacdcb51aeba0f21bf5d93ffab859a6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 15 Oct 2022 21:16:42 -0700
Subject: [PATCH 06/15] add some basic tests for summarize

---
 tests/test_tax.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index b9ac15c171..83fb4e299e 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -5,6 +5,7 @@
 import csv
 import pytest
 import gzip
+from collections import Counter
 
 import sourmash
 import sourmash_tst_utils as utils
@@ -13,6 +14,7 @@
 
 from sourmash import sqlite_utils
 from sourmash.exceptions import IndexNotSupported
+from sourmash import sourmash_args
 
 ## command line tests
 def test_run_sourmash_tax():
@@ -2902,3 +2904,84 @@ def test_tax_grep_duplicate_csv(runtmp):
     assert 'GCF_000017325.1' in names
     assert 'GCF_000021665.1' in names
     assert 'GCF_001881345.1' in names
+
+
+def test_tax_summarize(runtmp):
+    # test basic operation
+    taxfile = utils.get_test_data('tax/test.taxonomy.csv')
+
+    runtmp.sourmash('tax', 'summarize', taxfile)
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "num idents: 6" in out
+    assert "rank superkingdom:        1 distinct identifiers" in out
+    assert "rank phylum:              2 distinct identifiers" in out
+    assert "rank class:               2 distinct identifiers" in out
+    assert "rank order:               2 distinct identifiers" in out
+    assert "rank family:              3 distinct identifiers" in out
+    assert "rank genus:               4 distinct identifiers" in out
+    assert "rank species:             4 distinct identifiers" in out
+
+
+def test_tax_summarize_csv(runtmp):
+    # test basic operation w/csv output
+    taxfile = utils.get_test_data('tax/test.taxonomy.csv')
+
+    runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv')
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "num idents: 6" in out
+    assert "saved 18 lineage counts to 'ranks.csv'" in err
+
+    csv_out = runtmp.output('ranks.csv')
+
+    with sourmash_args.FileInputCSV(csv_out) as r:
+        # count number across ranks as a cheap consistency check
+        c = Counter()
+        for row in r:
+            val = row['count']
+            c[val] += 1
+
+        assert c['3'] == 7
+        assert c['2'] == 5
+        assert c['1'] == 5
+
+
+def test_summarize_on_annotate(runtmp):
+    # test summarize on output of annotate basics
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    csvout = runtmp.output("test1.gather.with-lineages.csv")
+    out_dir = os.path.dirname(csvout)
+
+    runtmp.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+
+    print(runtmp.last_result.status)
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert runtmp.last_result.status == 0
+    assert os.path.exists(csvout)
+
+    # so far so good - now see if we can run summarize!
+
+    runtmp.run_sourmash('tax', 'summarize', csvout)
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    print(out)
+    print(err)
+
+    assert "num idents: 4" in out
+    assert "rank superkingdom:        1 distinct identifiers" in out
+    assert "rank phylum:              2 distinct identifiers" in out
+    assert "rank class:               2 distinct identifiers" in out
+    assert "rank order:               2 distinct identifiers" in out
+    assert "rank family:              2 distinct identifiers" in out
+    assert "rank genus:               3 distinct identifiers" in out
+    assert "rank species:             3 distinct identifiers" in out
+

From 8501a184c42fa9eb1405bde134b595a2e30d42c7 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 12 Nov 2022 19:31:06 -0800
Subject: [PATCH 07/15] add documentation section for tax summarize

---
 doc/command-line.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/doc/command-line.md b/doc/command-line.md
index b8c6c7d378..42657819a5 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -92,6 +92,7 @@ information; these are grouped under the `sourmash tax` and
 * `tax genome`     - summarize single-genome gather results and report most likely classification.
 * `tax annotate`   - annotate gather results with lineage information (no summarization or classification).
 * `tax grep` - subset taxonomies and create picklists based on taxonomy string matches.
+* `tax summarize` - print summary information (counts of lineages) for a taxonomy lineages file or database.
 
 `sourmash lca` commands:
 
@@ -912,6 +913,12 @@ _not_ match the pattern.
 Currently only CSV output (optionally gzipped) is supported; use `sourmash tax prepare` to
 convert CSV output from `tax grep` into a sqlite3 taxonomy database.
 
+### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases
+
+(`sourmash tax summarize` is a new command as of sourmash v4.5.0.) @CTB
+
+@CTB
+
 ## `sourmash lca` subcommands for in-memory taxonomy integration
 
 These commands use LCA databases (created with `lca index`, below, or

From aa579ee598375f77bb83ec3ced0d3bfd0902dbb5 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 12 Nov 2022 20:02:10 -0800
Subject: [PATCH 08/15] cover more of the code w/tests

---
 tests/test_tax_utils.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index e2f4838476..60302657e5 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -3,6 +3,7 @@
 """
 
 import pytest
+import os
 from os.path import basename
 import gzip
 
@@ -129,6 +130,43 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp):
     assert ids_missing == {"gA"}
 
 
+def test_check_and_load_gather_lineage_csvs_empty(runtmp):
+    g_res = runtmp.output('empty.gather-tax.csv')
+    with open(g_res, 'w') as fp:
+        fp.write("")
+
+    with pytest.raises(ValueError) as exc:
+        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+    assert "cannot read taxonomy assignments" in str(exc.value)
+
+
+def test_check_and_load_gather_lineage_csvs_bad_header(runtmp):
+    g_res = runtmp.output('empty.gather-tax.csv')
+    with open(g_res, 'w', newline="") as fp:
+        fp.write("x,y,z")
+
+    with pytest.raises(ValueError) as exc:
+        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+    assert "Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?" in str(exc.value)
+
+
+def test_check_and_load_gather_lineage_csvs_dne(runtmp):
+    g_res = runtmp.output('empty.gather-tax.csv')
+
+    with pytest.raises(ValueError) as exc:
+        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+    assert "does not exist" in str(exc.value)
+
+
+def test_check_and_load_gather_lineage_csvs_isdir(runtmp):
+    g_res = runtmp.output('empty.gather-tax.csv')
+    os.mkdir(g_res)
+
+    with pytest.raises(ValueError) as exc:
+        tax_assign = LineageDB.load_from_gather_with_lineages(g_res)
+    assert "is a directory" in str(exc.value)
+
+
 def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
     g_csv = utils.get_test_data('tax/test1.gather.csv')
     # make gather results with taxonomy name not in tax_assign

From ae7b441c6f421945d21745a472e1bb5acab02e67 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 07:44:34 -0800
Subject: [PATCH 09/15] add test; change column name

---
 src/sourmash/cli/tax/summarize.py |  4 ++--
 src/sourmash/tax/__main__.py      |  3 ++-
 tests/test_tax.py                 | 14 +++++++++++++-
 3 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py
index 771164172c..10bb3ee35d 100644
--- a/src/sourmash/cli/tax/summarize.py
+++ b/src/sourmash/cli/tax/summarize.py
@@ -9,8 +9,8 @@
 and produces a human readable summary.
 
 Please see the 'tax summarize' documentation for more details:
-  https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-summarize
-@CTB check URL :)
+  https://sourmash.readthedocs.io/en/latest/command-line.html#command-line.html#sourmash-tax-summarize-print-summary-information-for-lineage-spreadsheets-or-taxonomy-databases
+
 """
 
 import sourmash
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 754f1a58e4..6fdb8d0900 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -465,6 +465,7 @@ def summarize(args):
                 name_seen.add(name)
 
             if 0:
+                # @CTB
                 # check duplicates?
                 sofar.append(name)
                 tup = tuple(sofar)
@@ -490,7 +491,7 @@ def summarize(args):
 
         with FileOutputCSV(args.output_lineage_information) as fp:
             w = csv.writer(fp)
-            w.writerow(['rank', 'count', 'lineage'])
+            w.writerow(['rank', 'lineage_count', 'lineage'])
 
             # output in order of most common
             for lineage, count in lineage_counts.most_common():
diff --git a/tests/test_tax.py b/tests/test_tax.py
index d2a30d60e4..1f1123a22f 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -2959,6 +2959,18 @@ def test_tax_summarize(runtmp):
     assert "rank species:             4 distinct identifiers" in out
 
 
+def test_tax_summarize_empty(runtmp):
+    # test failure on empty file
+    taxfile = runtmp.output('no-exist')
+
+    with pytest.raises(SourmashCommandFailed):
+        runtmp.sourmash('tax', 'summarize', taxfile)
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+    assert "ERROR while loading taxonomies" in err
+
+
 def test_tax_summarize_csv(runtmp):
     # test basic operation w/csv output
     taxfile = utils.get_test_data('tax/test.taxonomy.csv')
@@ -2977,7 +2989,7 @@ def test_tax_summarize_csv(runtmp):
         # count number across ranks as a cheap consistency check
         c = Counter()
         for row in r:
-            val = row['count']
+            val = row['lineage_count']
             c[val] += 1
 
         assert c['3'] == 7

From 4ce8ff7db892095aa8e514a7843a7b3d6b1502d3 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 07:46:07 -0800
Subject: [PATCH 10/15] refactor to use get_ident

---
 src/sourmash/tax/tax_utils.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index b7ba259ba6..b7fe1caf7d 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -765,12 +765,9 @@ def load(cls, filename, *, delimiter=',', force=False,
                     ident = row[identifier]
 
                     # fold, spindle, and mutilate ident?
-                    # @CTB: replace with get_ident?
-                    if not keep_full_identifiers:
-                        ident = ident.split(' ')[0]
-
-                        if not keep_identifier_versions:
-                            ident = ident.split('.')[0]
+                    ident = get_ident(ident,
+                                      keep_full_identifiers=keep_full_identifiers,
+                                      keep_identifier_versions=keep_identifier_versions)
 
                     # clean lineage of null names, replace with 'unassigned'
                     lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]

From d1f5477399661a7b35e0a5236c9f3818caac8d3f Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 07:59:29 -0800
Subject: [PATCH 11/15] test duplicate/different tax assignments in lineages
 file

---
 src/sourmash/tax/tax_utils.py | 47 +++++++++++++++++++----------------
 tests/test_tax.py             | 33 ++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index b7fe1caf7d..699e80177f 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -826,28 +826,33 @@ def load_from_gather_with_lineages(cls, filename, *, force=False):
 
             # now parse and load lineages
             for n, row in enumerate(r):
-                if row:
-                    num_rows += 1
+                # skip empty rows
+                if not row:
+                    continue
 
-                    name = row['name']
-                    ident = get_ident(name)
-                    lineage = row['lineage']
-                    lineage = lca_utils.make_lineage(lineage)
-
-                    # check duplicates
-                    if ident in assignments:
-                        if assignments[ident] != tuple(lineage):
-                            # @CTB what to do here...
-                            if not force:
-                                raise ValueError(f"multiple lineages for identifier {ident}")
-                    else:
-                        assignments[ident] = tuple(lineage)
-
-                        if lineage[-1].rank == 'species':
-                            n_species += 1
-                        elif lineage[-1].rank == 'strain':
-                            n_species += 1
-                            n_strains += 1
+                num_rows += 1
+
+                name = row['name']
+                ident = get_ident(name)
+                lineage = row['lineage']
+                lineage = lca_utils.make_lineage(lineage)
+
+                # check duplicates
+                if ident in assignments:
+                    if assignments[ident] != tuple(lineage):
+                        # this should not happen with valid
+                        # sourmash tax annotate output, but check anyway.
+                        if not force:
+                            raise ValueError(f"multiple lineages for identifier {ident}")
+                else:
+                    assignments[ident] = tuple(lineage)
+
+                    if lineage[-1].rank == 'species':
+                        n_species += 1
+                    elif lineage[-1].rank == 'strain':
+                        assert 0 # @CTB
+                        n_species += 1
+                        n_strains += 1
 
         return LineageDB(assignments, ranks)
 
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 1f1123a22f..e6de945227 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -10,6 +10,7 @@
 import sourmash
 import sourmash_tst_utils as utils
 from sourmash.tax import tax_utils
+from sourmash.lca import lca_utils
 from sourmash_tst_utils import SourmashCommandFailed
 
 from sourmash import sqlite_utils
@@ -1381,6 +1382,38 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp):
     assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
 
 
+def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):
+    c = runtmp
+
+    # write temp taxonomy with duplicates => lineages-style file
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxdb = tax_utils.LineageDB.load(taxonomy_csv)
+
+    for k, v in taxdb.items():
+        print(k, v)
+
+    lineage_csv = runtmp.output('lin.csv')
+    with open(lineage_csv, 'w', newline="") as fp:
+        w = csv.writer(fp)
+        w.writerow(['name', 'lineage'])
+        for k, v in taxdb.items():
+            linstr = lca_utils.display_lineage(v)
+            w.writerow([k, linstr])
+
+            # duplicate each row, changing something (truncate species, here)
+            v = v[:-1]
+            linstr = lca_utils.display_lineage(v)
+            w.writerow([k, linstr])
+
+    with pytest.raises(SourmashCommandFailed) as exc:
+        c.run_sourmash('tax', 'summarize', lineage_csv)
+        print(c.last_result.out)
+        print(c.last_result.err)
+
+    assert "cannot read taxonomy assignments" in str(exc.value)
+    assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
+
+
 def test_genome_rank_duplicated_taxonomy_force(runtmp):
     c = runtmp
     # write temp taxonomy with duplicates

From 4351649255a6ca9f4fdc0818ff35392ae6e83e5d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 09:21:31 -0800
Subject: [PATCH 12/15] test strains and lineage stuff

---
 src/sourmash/tax/tax_utils.py |  1 -
 tests/test_tax.py             | 68 +++++++++++++++++++++++++++++++++++
 tests/test_tax_utils.py       |  3 ++
 3 files changed, 71 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 699e80177f..4c281c2f70 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -850,7 +850,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False):
                     if lineage[-1].rank == 'species':
                         n_species += 1
                     elif lineage[-1].rank == 'strain':
-                        assert 0 # @CTB
                         n_species += 1
                         n_strains += 1
 
diff --git a/tests/test_tax.py b/tests/test_tax.py
index e6de945227..78774c06c0 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -3064,3 +3064,71 @@ def test_summarize_on_annotate(runtmp):
     assert "rank genus:               3 distinct identifiers" in out
     assert "rank species:             3 distinct identifiers" in out
 
+
+def test_tax_summarize_strain_csv(runtmp):
+    # test basic operation w/csv output on taxonomy with strains
+    taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv')
+
+    runtmp.sourmash('tax', 'summarize', taxfile, '-o', 'ranks.csv')
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "num idents: 6" in out
+    assert "saved 24 lineage counts to 'ranks.csv'" in err
+
+    csv_out = runtmp.output('ranks.csv')
+
+    with sourmash_args.FileInputCSV(csv_out) as r:
+        # count number across ranks as a cheap consistency check
+        c = Counter()
+        for row in r:
+            print(row)
+            val = row['lineage_count']
+            c[val] += 1
+
+        print(list(c.most_common()))
+
+        assert c['3'] == 7
+        assert c['2'] == 5
+        assert c['6'] == 1
+        assert c['1'] == 11
+
+
+def test_tax_summarize_strain_csv_with_lineages(runtmp):
+    # test basic operation w/csv output
+    taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv')
+    lineage_csv = runtmp.output('lin-with-strains.csv')
+
+    taxdb = tax_utils.LineageDB.load(taxfile)
+    with open(lineage_csv, 'w', newline="") as fp:
+        w = csv.writer(fp)
+        w.writerow(['name', 'lineage'])
+        for k, v in taxdb.items():
+            linstr = lca_utils.display_lineage(v)
+            w.writerow([k, linstr])
+
+    runtmp.sourmash('tax', 'summarize', lineage_csv, '-o', 'ranks.csv')
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "num idents: 6" in out
+    assert "saved 24 lineage counts to" in err
+
+    csv_out = runtmp.output('ranks.csv')
+
+    with sourmash_args.FileInputCSV(csv_out) as r:
+        # count number across ranks as a cheap consistency check
+        c = Counter()
+        for row in r:
+            print(row)
+            val = row['lineage_count']
+            c[val] += 1
+
+        print(list(c.most_common()))
+
+        assert c['3'] == 7
+        assert c['2'] == 5
+        assert c['6'] == 1
+        assert c['1'] == 11
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 60302657e5..2354b8da2c 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -141,6 +141,7 @@ def test_check_and_load_gather_lineage_csvs_empty(runtmp):
 
 
 def test_check_and_load_gather_lineage_csvs_bad_header(runtmp):
+    # test on file with wrong headers
     g_res = runtmp.output('empty.gather-tax.csv')
     with open(g_res, 'w', newline="") as fp:
         fp.write("x,y,z")
@@ -151,6 +152,7 @@ def test_check_and_load_gather_lineage_csvs_bad_header(runtmp):
 
 
 def test_check_and_load_gather_lineage_csvs_dne(runtmp):
+    # test loading with-lineage file that does not exist
     g_res = runtmp.output('empty.gather-tax.csv')
 
     with pytest.raises(ValueError) as exc:
@@ -159,6 +161,7 @@ def test_check_and_load_gather_lineage_csvs_dne(runtmp):
 
 
 def test_check_and_load_gather_lineage_csvs_isdir(runtmp):
+    # test loading a with-lineage file that is actually a directory
     g_res = runtmp.output('empty.gather-tax.csv')
     os.mkdir(g_res)
 

From d3ab0447863618e0859da1f13ba5cec1cad516be Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 11:53:27 -0800
Subject: [PATCH 13/15] test on empty line

---
 src/sourmash/cli/tax/summarize.py             |  1 -
 src/sourmash/tax/__main__.py                  |  1 -
 src/sourmash/tax/tax_utils.py                 | 77 +++++++++----------
 .../tax/test-empty-line.taxonomy.csv          |  8 ++
 tests/test_tax.py                             | 29 +++++--
 tests/test_tax_utils.py                       |  1 +
 6 files changed, 69 insertions(+), 48 deletions(-)
 create mode 100644 tests/test-data/tax/test-empty-line.taxonomy.csv

diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py
index 10bb3ee35d..7fca17e837 100644
--- a/src/sourmash/cli/tax/summarize.py
+++ b/src/sourmash/cli/tax/summarize.py
@@ -33,7 +33,6 @@ def subparser(subparsers):
     subparser.add_argument(
         '-o', '--output-lineage-information',
         help='output a CSV file containing individual lineage counts',
-        # @CTB implement
     )
     subparser.add_argument(
         '--keep-full-identifiers', action='store_true',
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 6fdb8d0900..d003962117 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -478,7 +478,6 @@ def summarize(args):
         rank_name_str = f"{rank}:"
         print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers")
 
-
     if args.output_lineage_information:
         notify("now calculating detailed lineage counts...")
         lineage_counts = Counter()
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 4c281c2f70..d504d4ab9c 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -755,43 +755,42 @@ def load(cls, filename, *, delimiter=',', force=False,
 
             # now parse and load lineages
             for n, row in enumerate(r):
-                if row:
-                    num_rows += 1
-                    lineage = []
-                    # read row into a lineage pair
-                    for rank in lca_utils.taxlist(include_strain=include_strain):
-                        lin = row[rank]
-                        lineage.append(LineagePair(rank, lin))
-                    ident = row[identifier]
-
-                    # fold, spindle, and mutilate ident?
-                    ident = get_ident(ident,
-                                      keep_full_identifiers=keep_full_identifiers,
-                                      keep_identifier_versions=keep_identifier_versions)
-
-                    # clean lineage of null names, replace with 'unassigned'
-                    lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                    lineage = [ LineagePair(a, b) for (a, b) in lineage ]
-
-                    # remove end nulls
-                    while lineage and lineage[-1].name == 'unassigned':
-                        lineage = lineage[:-1]
-
-                    # store lineage tuple
-                    if lineage:
-                        # check duplicates
-                        if ident in assignments:
-                            if assignments[ident] != tuple(lineage):
-                                if not force:
-                                    raise ValueError(f"multiple lineages for identifier {ident}")
-                        else:
-                            assignments[ident] = tuple(lineage)
-
-                            if lineage[-1].rank == 'species':
-                                n_species += 1
-                            elif lineage[-1].rank == 'strain':
-                                n_species += 1
-                                n_strains += 1
+                num_rows += 1
+                lineage = []
+                # read row into a lineage pair
+                for rank in lca_utils.taxlist(include_strain=include_strain):
+                    lin = row[rank]
+                    lineage.append(LineagePair(rank, lin))
+                ident = row[identifier]
+
+                # fold, spindle, and mutilate ident?
+                ident = get_ident(ident,
+                                  keep_full_identifiers=keep_full_identifiers,
+                                  keep_identifier_versions=keep_identifier_versions)
+
+                # clean lineage of null names, replace with 'unassigned'
+                lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
+                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+
+                # remove end nulls
+                while lineage and lineage[-1].name == 'unassigned':
+                    lineage = lineage[:-1]
+
+                # store lineage tuple
+                if lineage:
+                    # check duplicates
+                    if ident in assignments:
+                        if assignments[ident] != tuple(lineage):
+                            if not force:
+                                raise ValueError(f"multiple lineages for identifier {ident}")
+                    else:
+                        assignments[ident] = tuple(lineage)
+
+                        if lineage[-1].rank == 'species':
+                            n_species += 1
+                        elif lineage[-1].rank == 'strain':
+                            n_species += 1
+                            n_strains += 1
 
         return LineageDB(assignments, ranks)
 
@@ -826,10 +825,6 @@ def load_from_gather_with_lineages(cls, filename, *, force=False):
 
             # now parse and load lineages
             for n, row in enumerate(r):
-                # skip empty rows
-                if not row:
-                    continue
-
                 num_rows += 1
 
                 name = row['name']
diff --git a/tests/test-data/tax/test-empty-line.taxonomy.csv b/tests/test-data/tax/test-empty-line.taxonomy.csv
new file mode 100644
index 0000000000..a197efb7fa
--- /dev/null
+++ b/tests/test-data/tax/test-empty-line.taxonomy.csv
@@ -0,0 +1,8 @@
+ident,superkingdom,phylum,class,order,family,genus,species
+
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 78774c06c0..be5dcf14e0 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1383,9 +1383,9 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp):
 
 
 def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):
+    # write temp taxonomy with duplicates => lineages-style file
     c = runtmp
 
-    # write temp taxonomy with duplicates => lineages-style file
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
     taxdb = tax_utils.LineageDB.load(taxonomy_csv)
 
@@ -1415,8 +1415,8 @@ def test_genome_rank_duplicated_taxonomy_fail_lineages(runtmp):
 
 
 def test_genome_rank_duplicated_taxonomy_force(runtmp):
-    c = runtmp
     # write temp taxonomy with duplicates
+    c = runtmp
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
     with open(duplicated_csv, 'w') as dup:
@@ -2974,7 +2974,7 @@ def test_tax_grep_duplicate_csv(runtmp):
 
 
 def test_tax_summarize(runtmp):
-    # test basic operation
+    # test basic operation with summarize
     taxfile = utils.get_test_data('tax/test.taxonomy.csv')
 
     runtmp.sourmash('tax', 'summarize', taxfile)
@@ -2992,6 +2992,25 @@ def test_tax_summarize(runtmp):
     assert "rank species:             4 distinct identifiers" in out
 
 
+def test_tax_summarize_empty_line(runtmp):
+    # test basic operation with summarize on a file w/empty line
+    taxfile = utils.get_test_data('tax/test-empty-line.taxonomy.csv')
+
+    runtmp.sourmash('tax', 'summarize', taxfile)
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "num idents: 6" in out
+    assert "rank superkingdom:        1 distinct identifiers" in out
+    assert "rank phylum:              2 distinct identifiers" in out
+    assert "rank class:               2 distinct identifiers" in out
+    assert "rank order:               2 distinct identifiers" in out
+    assert "rank family:              3 distinct identifiers" in out
+    assert "rank genus:               4 distinct identifiers" in out
+    assert "rank species:             4 distinct identifiers" in out
+
+
 def test_tax_summarize_empty(runtmp):
     # test failure on empty file
     taxfile = runtmp.output('no-exist')
@@ -3030,7 +3049,7 @@ def test_tax_summarize_csv(runtmp):
         assert c['1'] == 5
 
 
-def test_summarize_on_annotate(runtmp):
+def test_tax_summarize_on_annotate(runtmp):
     # test summarize on output of annotate basics
     g_csv = utils.get_test_data('tax/test1.gather.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
@@ -3096,7 +3115,7 @@ def test_tax_summarize_strain_csv(runtmp):
 
 
 def test_tax_summarize_strain_csv_with_lineages(runtmp):
-    # test basic operation w/csv output
+    # test basic operation w/csv output on lineages-style file w/strain csv
     taxfile = utils.get_test_data('tax/test-strain.taxonomy.csv')
     lineage_csv = runtmp.output('lin-with-strains.csv')
 
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 2354b8da2c..7652970f60 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -131,6 +131,7 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp):
 
 
 def test_check_and_load_gather_lineage_csvs_empty(runtmp):
+    # try loading an empty annotated gather file
     g_res = runtmp.output('empty.gather-tax.csv')
     with open(g_res, 'w') as fp:
         fp.write("")

From 5e7bdefaeaf61bb4e838667125200be8312e21f4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 12:28:33 -0800
Subject: [PATCH 14/15] add docs

---
 doc/command-line.md | 50 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 5 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index 42657819a5..939f62f7df 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -516,7 +516,6 @@ As with all reference-based analysis, results can be affected by the
 For more details on how `gather` works and can be used to classify
  signatures, see [classifying-signatures](classifying-signatures.md).
 
-
 ### `sourmash tax metagenome` - summarize metagenome content from `gather` results
 
 `sourmash tax metagenome` summarizes gather results for each query metagenome by
@@ -837,6 +836,10 @@ sourmash tax annotate
     --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 
+The `with-lineages` output file format can be summarized with
+`sourmash tax summarize` and can also be used as an input taxonomy
+spreadsheet for any of the tax subcommands (new as of v4.6.0).
+
 ### `sourmash tax prepare` - prepare and/or combine taxonomy files
 
 `sourmash tax prepare` prepares taxonomy files for other `sourmash tax`
@@ -866,6 +869,9 @@ can be set to CSV like so:
 sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv
 ```
 
+**Note:** As of sourmash v4.6.0, the output of `sourmash tax annotate` can
+ be used as a taxonomy input spreadsheet as well.
+
 ### `sourmash tax grep` - subset taxonomies and create picklists based on taxonomy string matches
 
 (`sourmash tax grep` is a new command as of sourmash v4.5.0.)
@@ -896,9 +902,8 @@ sourmash search query.sig gtdb-rs207.genomic.k31.zip \
     --picklist shew-picklist.csv:ident:ident
 ```
 
-
 `tax grep` can also restrict string matching to a specific taxonomic rank
-with `-r/--rank`; for examplem
+with `-r/--rank`; for example,
 ```
 sourmash tax grep Shew -t gtdb-rs207.taxonomy.sqldb \
     -o shew-picklist.csv -r genus
@@ -915,9 +920,44 @@ convert CSV output from `tax grep` into a sqlite3 taxonomy database.
 
 ### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases
 
-(`sourmash tax summarize` is a new command as of sourmash v4.5.0.) @CTB
+(`sourmash tax summarize` is a new command as of sourmash v4.6.0.)
+
+`sourmash tax summarize` loads in one or more lineage spreadsheets,
+counts the distinct taxonomic lineages, and outputs a summary. It
+optionally will output a CSV file with a detailed count of how many
+identifiers belong to each taxonomic lineage.
+
+For example,
+```
+sourmash tax summarize gtdb-rs202.taxonomy.v2.db -o ranks.csv
+```
+outputs
+```
+number of distinct taxonomic lineages: 258406
+rank superkingdom:        2 distinct taxonomic lineages
+rank phylum:              169 distinct taxonomic lineages
+rank class:               419 distinct taxonomic lineages
+rank order:               1312 distinct taxonomic lineages
+rank family:              3264 distinct taxonomic lineages
+rank genus:               12888 distinct taxonomic lineages
+rank species:             47894 distinct taxonomic lineages
+```
+
+and creates a file `ranks.csv` with the number of distinct identifier
+counts for each lineage at each rank:
+```
+rank,lineage_count,lineage
+superkingdom,254090,d__Bacteria
+phylum,120757,d__Bacteria;p__Proteobacteria
+class,104665,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria
+order,64157,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales
+family,55347,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae
+...
+```
+That is, there are 254,090 identifiers in GTDB rs202 under `d__Bacteria`,
+and 120,757 within the `p__Proteobacteria`.
 
-@CTB
+`tax summarize` can also be used to summarize the output of `tax annotate`.
 
 ## `sourmash lca` subcommands for in-memory taxonomy integration
 

From 43e094d59550e2095e83b0c7f2db777c7c8015f1 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 13 Nov 2022 12:28:44 -0800
Subject: [PATCH 15/15] cleanup; add test for multiple

---
 src/sourmash/tax/__main__.py | 13 ++-----
 tests/test_tax.py            | 74 +++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 37 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index d003962117..6544e5d610 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -453,8 +453,9 @@ def summarize(args):
 
     notify(f"...loaded {len(tax_assign)} entries.")
 
-    print_results(f"num idents: {len(tax_assign)}")
+    print_results(f"number of distinct taxonomic lineages: {len(tax_assign)}")
 
+    # count the number of distinct lineage names seen
     rank_counts = defaultdict(int)
     name_seen = set()
     for v in tax_assign.values():
@@ -464,19 +465,11 @@ def summarize(args):
                 rank_counts[rank] += 1
                 name_seen.add(name)
 
-            if 0:
-                # @CTB
-                # check duplicates?
-                sofar.append(name)
-                tup = tuple(sofar)
-                seen.add(tuple(sofar))
-        #break
-
     rank_count_items = list(rank_counts.items())
     rank_count_items.sort(key=lambda x: x[1])
     for rank, count in rank_count_items:
         rank_name_str = f"{rank}:"
-        print_results(f"rank {rank_name_str:<20s} {count} distinct identifiers")
+        print_results(f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages")
 
     if args.output_lineage_information:
         notify("now calculating detailed lineage counts...")
diff --git a/tests/test_tax.py b/tests/test_tax.py
index be5dcf14e0..b1b3c1b8f8 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -2982,14 +2982,34 @@ def test_tax_summarize(runtmp):
     out = runtmp.last_result.out
     err = runtmp.last_result.err
 
-    assert "num idents: 6" in out
-    assert "rank superkingdom:        1 distinct identifiers" in out
-    assert "rank phylum:              2 distinct identifiers" in out
-    assert "rank class:               2 distinct identifiers" in out
-    assert "rank order:               2 distinct identifiers" in out
-    assert "rank family:              3 distinct identifiers" in out
-    assert "rank genus:               4 distinct identifiers" in out
-    assert "rank species:             4 distinct identifiers" in out
+    assert "number of distinct taxonomic lineages: 6" in out
+    assert "rank superkingdom:        1 distinct taxonomic lineages" in out
+    assert "rank phylum:              2 distinct taxonomic lineages" in out
+    assert "rank class:               2 distinct taxonomic lineages" in out
+    assert "rank order:               2 distinct taxonomic lineages" in out
+    assert "rank family:              3 distinct taxonomic lineages" in out
+    assert "rank genus:               4 distinct taxonomic lineages" in out
+    assert "rank species:             4 distinct taxonomic lineages" in out
+
+
+def test_tax_summarize_multiple(runtmp):
+    # test basic operation with summarize on multiple files
+    tax1 = utils.get_test_data('tax/bacteria_refseq_lineage.csv')
+    tax2 = utils.get_test_data('tax/protozoa_genbank_lineage.csv')
+
+    runtmp.sourmash('tax', 'summarize', tax1, tax2)
+
+    out = runtmp.last_result.out
+    err = runtmp.last_result.err
+
+    assert "number of distinct taxonomic lineages: 6" in out
+    assert "rank superkingdom:        2 distinct taxonomic lineages" in out
+    assert "rank phylum:              3 distinct taxonomic lineages" in out
+    assert "rank class:               4 distinct taxonomic lineages" in out
+    assert "rank order:               4 distinct taxonomic lineages" in out
+    assert "rank family:              5 distinct taxonomic lineages" in out
+    assert "rank genus:               5 distinct taxonomic lineages" in out
+    assert "rank species:             5 distinct taxonomic lineages" in out
 
 
 def test_tax_summarize_empty_line(runtmp):
@@ -3001,14 +3021,14 @@ def test_tax_summarize_empty_line(runtmp):
     out = runtmp.last_result.out
     err = runtmp.last_result.err
 
-    assert "num idents: 6" in out
-    assert "rank superkingdom:        1 distinct identifiers" in out
-    assert "rank phylum:              2 distinct identifiers" in out
-    assert "rank class:               2 distinct identifiers" in out
-    assert "rank order:               2 distinct identifiers" in out
-    assert "rank family:              3 distinct identifiers" in out
-    assert "rank genus:               4 distinct identifiers" in out
-    assert "rank species:             4 distinct identifiers" in out
+    assert "number of distinct taxonomic lineages: 6" in out
+    assert "rank superkingdom:        1 distinct taxonomic lineages" in out
+    assert "rank phylum:              2 distinct taxonomic lineages" in out
+    assert "rank class:               2 distinct taxonomic lineages" in out
+    assert "rank order:               2 distinct taxonomic lineages" in out
+    assert "rank family:              3 distinct taxonomic lineages" in out
+    assert "rank genus:               4 distinct taxonomic lineages" in out
+    assert "rank species:             4 distinct taxonomic lineages" in out
 
 
 def test_tax_summarize_empty(runtmp):
@@ -3032,7 +3052,7 @@ def test_tax_summarize_csv(runtmp):
     out = runtmp.last_result.out
     err = runtmp.last_result.err
 
-    assert "num idents: 6" in out
+    assert "number of distinct taxonomic lineages: 6" in out
     assert "saved 18 lineage counts to 'ranks.csv'" in err
 
     csv_out = runtmp.output('ranks.csv')
@@ -3074,14 +3094,14 @@ def test_tax_summarize_on_annotate(runtmp):
     print(out)
     print(err)
 
-    assert "num idents: 4" in out
-    assert "rank superkingdom:        1 distinct identifiers" in out
-    assert "rank phylum:              2 distinct identifiers" in out
-    assert "rank class:               2 distinct identifiers" in out
-    assert "rank order:               2 distinct identifiers" in out
-    assert "rank family:              2 distinct identifiers" in out
-    assert "rank genus:               3 distinct identifiers" in out
-    assert "rank species:             3 distinct identifiers" in out
+    assert "number of distinct taxonomic lineages: 4" in out
+    assert "rank superkingdom:        1 distinct taxonomic lineages" in out
+    assert "rank phylum:              2 distinct taxonomic lineages" in out
+    assert "rank class:               2 distinct taxonomic lineages" in out
+    assert "rank order:               2 distinct taxonomic lineages" in out
+    assert "rank family:              2 distinct taxonomic lineages" in out
+    assert "rank genus:               3 distinct taxonomic lineages" in out
+    assert "rank species:             3 distinct taxonomic lineages" in out
 
 
 def test_tax_summarize_strain_csv(runtmp):
@@ -3093,7 +3113,7 @@ def test_tax_summarize_strain_csv(runtmp):
     out = runtmp.last_result.out
     err = runtmp.last_result.err
 
-    assert "num idents: 6" in out
+    assert "number of distinct taxonomic lineages: 6" in out
     assert "saved 24 lineage counts to 'ranks.csv'" in err
 
     csv_out = runtmp.output('ranks.csv')
@@ -3132,7 +3152,7 @@ def test_tax_summarize_strain_csv_with_lineages(runtmp):
     out = runtmp.last_result.out
     err = runtmp.last_result.err
 
-    assert "num idents: 6" in out
+    assert "number of distinct taxonomic lineages: 6" in out
     assert "saved 24 lineage counts to" in err
 
     csv_out = runtmp.output('ranks.csv')