sourmash-bio · ctb · Nov 14, 2022 · Oct 15, 2022 · Oct 15, 2022 · Oct 15, 2022
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -92,6 +92,7 @@ information; these are grouped under the `sourmash tax` and
 * `tax genome`     - summarize single-genome gather results and report most likely classification.
 * `tax annotate`   - annotate gather results with lineage information (no summarization or classification).
 * `tax grep` - subset taxonomies and create picklists based on taxonomy string matches.
+* `tax summarize` - print summary information (counts of lineages) for a taxonomy lineages file or database.
 
 `sourmash lca` commands:
 
@@ -515,7 +516,6 @@ As with all reference-based analysis, results can be affected by the
 For more details on how `gather` works and can be used to classify
  signatures, see [classifying-signatures](classifying-signatures.md).
 
-
 ### `sourmash tax metagenome` - summarize metagenome content from `gather` results
 
 `sourmash tax metagenome` summarizes gather results for each query metagenome by
@@ -836,6 +836,10 @@ sourmash tax annotate
     --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 
+The `with-lineages` output file format can be summarized with
+`sourmash tax summarize` and can also be used as an input taxonomy
+spreadsheet for any of the tax subcommands (new as of v4.6.0).
+
 ### `sourmash tax prepare` - prepare and/or combine taxonomy files
 
 `sourmash tax prepare` prepares taxonomy files for other `sourmash tax`
@@ -865,6 +869,9 @@ can be set to CSV like so:
 sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv
 ```
 
+**Note:** As of sourmash v4.6.0, the output of `sourmash tax annotate` can
+ be used as a taxonomy input spreadsheet as well.
+
 ### `sourmash tax grep` - subset taxonomies and create picklists based on taxonomy string matches
 
 (`sourmash tax grep` is a new command as of sourmash v4.5.0.)
@@ -895,9 +902,8 @@ sourmash search query.sig gtdb-rs207.genomic.k31.zip \
     --picklist shew-picklist.csv:ident:ident
 ```
 
-
 `tax grep` can also restrict string matching to a specific taxonomic rank
-with `-r/--rank`; for examplem
+with `-r/--rank`; for example,
 ```
 sourmash tax grep Shew -t gtdb-rs207.taxonomy.sqldb \
     -o shew-picklist.csv -r genus
@@ -912,6 +918,47 @@ _not_ match the pattern.
 Currently only CSV output (optionally gzipped) is supported; use `sourmash tax prepare` to
 convert CSV output from `tax grep` into a sqlite3 taxonomy database.
 
+### `sourmash tax summarize` - print summary information for lineage spreadsheets or taxonomy databases
+
+(`sourmash tax summarize` is a new command as of sourmash v4.6.0.)
+
+`sourmash tax summarize` loads in one or more lineage spreadsheets,
+counts the distinct taxonomic lineages, and outputs a summary. It
+optionally will output a CSV file with a detailed count of how many
+identifiers belong to each taxonomic lineage.
+
+For example,
+```
+sourmash tax summarize gtdb-rs202.taxonomy.v2.db -o ranks.csv
+```
+outputs
+```
+number of distinct taxonomic lineages: 258406
+rank superkingdom:        2 distinct taxonomic lineages
+rank phylum:              169 distinct taxonomic lineages
+rank class:               419 distinct taxonomic lineages
+rank order:               1312 distinct taxonomic lineages
+rank family:              3264 distinct taxonomic lineages
+rank genus:               12888 distinct taxonomic lineages
+rank species:             47894 distinct taxonomic lineages
+```
+
+and creates a file `ranks.csv` with the number of distinct identifier
+counts for each lineage at each rank:
+```
+rank,lineage_count,lineage
+superkingdom,254090,d__Bacteria
+phylum,120757,d__Bacteria;p__Proteobacteria
+class,104665,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria
+order,64157,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales
+family,55347,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae
+...
+```
+That is, there are 254,090 identifiers in GTDB rs202 under `d__Bacteria`,
+and 120,757 within the `p__Proteobacteria`.
+
+`tax summarize` can also be used to summarize the output of `tax annotate`.
+
 ## `sourmash lca` subcommands for in-memory taxonomy integration
 
 These commands use LCA databases (created with `lca index`, below, or

diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py
@@ -9,6 +9,7 @@
 from . import annotate
 from . import prepare
 from . import grep
+from . import summarize
 
 from ..utils import command_list
 from argparse import SUPPRESS, RawDescriptionHelpFormatter

diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py
@@ -0,0 +1,52 @@
+"""summarize taxonomy/lineage information"""
+
+usage="""
+
+    sourmash tax summarize <taxonomy_file> [ <more files> ... ]
+
+The 'tax summarize' command reads in one or more taxonomy databases
+or lineage files (produced by 'tax annotate'), combines them,
+and produces a human readable summary.
+
+Please see the 'tax summarize' documentation for more details:
+  https://sourmash.readthedocs.io/en/latest/command-line.html#command-line.html#sourmash-tax-summarize-print-summary-information-for-lineage-spreadsheets-or-taxonomy-databases
+
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+    subparser = subparsers.add_parser('summarize',
+                                      usage=usage)
+    subparser.add_argument(
+        '-q', '--quiet', action='store_true',
+        help='suppress non-error output'
+    )
+    subparser.add_argument(
+        'taxonomy_files',
+        metavar='FILE',
+        nargs="+", action="extend",
+        help='database lineages'
+    )
+    subparser.add_argument(
+        '-o', '--output-lineage-information',
+        help='output a CSV file containing individual lineage counts',
+    )
+    subparser.add_argument(
+        '--keep-full-identifiers', action='store_true',
+        help='do not split identifiers on whitespace'
+    )
+    subparser.add_argument(
+        '--keep-identifier-versions', action='store_true',
+        help='after splitting identifiers, do not remove accession versions'
+    )
+    subparser.add_argument(
+        '-f', '--force', action = 'store_true',
+        help='continue past errors in file and taxonomy loading',
+    )
+
+def main(args):
+    import sourmash
+    return sourmash.tax.__main__.summarize(args)
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -4,13 +4,13 @@
 import sys
 import csv
 import os
-from collections import defaultdict
+from collections import defaultdict, Counter
 import re
 
 import sourmash
 from ..sourmash_args import FileOutputCSV, FileOutput
-from sourmash.logging import set_quiet, error, notify
-from sourmash.lca.lca_utils import display_lineage
+from sourmash.logging import set_quiet, error, notify, print_results
+from sourmash.lca.lca_utils import display_lineage, zip_lineage
 
 from . import tax_utils
 from .tax_utils import ClassificationResult, MultiLineageDB
@@ -438,6 +438,63 @@ def search_pattern(l, r):
         notify(f"found {len(match_ident)} matches; saved identifiers to picklist file '{args.output}'")
 
 
+def summarize(args):
+    "Summarize multiple taxonomy databases."
+    notify("loading taxonomies...")
+    try:
+        tax_assign = MultiLineageDB.load(args.taxonomy_files,
+                                         force=args.force,
+                       keep_full_identifiers=args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions)
+    except ValueError as exc:
+        error("ERROR while loading taxonomies!")
+        error(str(exc))
+        sys.exit(-1)
+
+    notify(f"...loaded {len(tax_assign)} entries.")
+
+    print_results(f"number of distinct taxonomic lineages: {len(tax_assign)}")
+
+    # count the number of distinct lineage names seen
+    rank_counts = defaultdict(int)
+    name_seen = set()
+    for v in tax_assign.values():
+        sofar = []
+        for rank, name in v:
+            if name not in name_seen:
+                rank_counts[rank] += 1
+                name_seen.add(name)
+
+    rank_count_items = list(rank_counts.items())
+    rank_count_items.sort(key=lambda x: x[1])
+    for rank, count in rank_count_items:
+        rank_name_str = f"{rank}:"
+        print_results(f"rank {rank_name_str:<20s} {count} distinct taxonomic lineages")
+
+    if args.output_lineage_information:
+        notify("now calculating detailed lineage counts...")
+        lineage_counts = Counter()
+        for v in tax_assign.values():
+            tup = v
+            while tup:
+                lineage_counts[tup] += 1
+                tup = tup[:-1]
+        notify("...done!")
+
+        with FileOutputCSV(args.output_lineage_information) as fp:
+            w = csv.writer(fp)
+            w.writerow(['rank', 'lineage_count', 'lineage'])
+
+            # output in order of most common
+            for lineage, count in lineage_counts.most_common():
+                rank = lineage[-1].rank
+                lin = ";".join(zip_lineage(lineage, truncate_empty=True))
+                w.writerow([rank, str(count), lin])
+
+        n = len(lineage_counts)
+        notify(f"saved {n} lineage counts to '{args.output_lineage_information}'")
+
+
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -730,13 +730,17 @@ def load(cls, filename, *, delimiter=',', force=False,
                 elif 'accession' in header:
                     identifier = 'accession'
                     header = ["ident" if "accession" == x else x for x in header]
+                elif 'name' in header and 'lineage' in header:
+                    return cls.load_from_gather_with_lineages(filename,
+                                                              force=force)
                 else:
-                    raise ValueError('No taxonomic identifiers found.')
+                    header_str = ",".join([repr(x) for x in header])
+                    raise ValueError(f'No taxonomic identifiers found; headers are {header_str}')
             # is "strain" an available rank?
             if "strain" in header:
                 include_strain=True
 
-           # check that all ranks are in header
+            # check that all ranks are in header
             ranks = list(lca_utils.taxlist(include_strain=include_strain))
             if not set(ranks).issubset(header):
                 # for now, just raise err if not all ranks are present.
@@ -751,45 +755,98 @@ def load(cls, filename, *, delimiter=',', force=False,
 
             # now parse and load lineages
             for n, row in enumerate(r):
-                if row:
-                    num_rows += 1
-                    lineage = []
-                    # read row into a lineage pair
-                    for rank in lca_utils.taxlist(include_strain=include_strain):
-                        lin = row[rank]
-                        lineage.append(LineagePair(rank, lin))
-                    ident = row[identifier]
-
-                    # fold, spindle, and mutilate ident?
-                    if not keep_full_identifiers:
-                        ident = ident.split(' ')[0]
-
-                        if not keep_identifier_versions:
-                            ident = ident.split('.')[0]
-
-                    # clean lineage of null names, replace with 'unassigned'
-                    lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                    lineage = [ LineagePair(a, b) for (a, b) in lineage ]
-
-                    # remove end nulls
-                    while lineage and lineage[-1].name == 'unassigned':
-                        lineage = lineage[:-1]
-
-                    # store lineage tuple
-                    if lineage:
-                        # check duplicates
-                        if ident in assignments:
-                            if assignments[ident] != tuple(lineage):
-                                if not force:
-                                    raise ValueError(f"multiple lineages for identifier {ident}")
-                        else:
-                            assignments[ident] = tuple(lineage)
-
-                            if lineage[-1].rank == 'species':
-                                n_species += 1
-                            elif lineage[-1].rank == 'strain':
-                                n_species += 1
-                                n_strains += 1
+                num_rows += 1
+                lineage = []
+                # read row into a lineage pair
+                for rank in lca_utils.taxlist(include_strain=include_strain):
+                    lin = row[rank]
+                    lineage.append(LineagePair(rank, lin))
+                ident = row[identifier]
+
+                # fold, spindle, and mutilate ident?
+                ident = get_ident(ident,
+                                  keep_full_identifiers=keep_full_identifiers,
+                                  keep_identifier_versions=keep_identifier_versions)
+
+                # clean lineage of null names, replace with 'unassigned'
+                lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
+                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+
+                # remove end nulls
+                while lineage and lineage[-1].name == 'unassigned':
+                    lineage = lineage[:-1]
+
+                # store lineage tuple
+                if lineage:
+                    # check duplicates
+                    if ident in assignments:
+                        if assignments[ident] != tuple(lineage):
+                            if not force:
+                                raise ValueError(f"multiple lineages for identifier {ident}")
+                    else:
+                        assignments[ident] = tuple(lineage)
+
+                        if lineage[-1].rank == 'species':
+                            n_species += 1
+                        elif lineage[-1].rank == 'strain':
+                            n_species += 1
+                            n_strains += 1
+
+        return LineageDB(assignments, ranks)
+
+
+    @classmethod
+    def load_from_gather_with_lineages(cls, filename, *, force=False):
+        """
+        Load an annotated gather-with-lineages CSV file produced by
+        'tax annotate' into a LineageDB.
+        """
+        include_strain = False
+
+        if not os.path.exists(filename):
+            raise ValueError(f"'{filename}' does not exist")
+
+        if os.path.isdir(filename):
+            raise ValueError(f"'{filename}' is a directory")
+
+        with sourmash_args.FileInputCSV(filename) as r:
+            header = r.fieldnames
+            if not header:
+                raise ValueError(f'cannot read taxonomy assignments from {filename}')
+
+            if "name" not in header or "lineage" not in header:
+                raise ValueError(f"Expected headers 'name' and 'lineage' not found. Is this a with-lineages file?")
+
+            ranks = list(lca_utils.taxlist(include_strain=include_strain))
+            assignments = {}
+            num_rows = 0
+            n_species = 0
+            n_strains = 0
+
+            # now parse and load lineages
+            for n, row in enumerate(r):
+                num_rows += 1
+
+                name = row['name']
+                ident = get_ident(name)
+                lineage = row['lineage']
+                lineage = lca_utils.make_lineage(lineage)
+
+                # check duplicates
+                if ident in assignments:
+                    if assignments[ident] != tuple(lineage):
+                        # this should not happen with valid
+                        # sourmash tax annotate output, but check anyway.
+                        if not force:
+                            raise ValueError(f"multiple lineages for identifier {ident}")
+                else:
+                    assignments[ident] = tuple(lineage)
+
+                    if lineage[-1].rank == 'species':
+                        n_species += 1
+                    elif lineage[-1].rank == 'strain':
+                        n_species += 1
+                        n_strains += 1
 
         return LineageDB(assignments, ranks)