MRG: tax genome with --lins (#2519)

Use `--lins` and `--lingroup` with `tax genome`. Instead of building a separate `lingroup` output format, genome classification with `--lingroup` input will be restricted to lingroups only. All other options (`--rank`, `--ani-threshold`, etc) should continue to function. However, if you pass in a rank that is not present in your `lingroups`, you may eliminate all classification options. ## Testing ### Install on your own computer/cluster: Here is one way to test this code before it gets fully integrated into sourmash: - If you don't have conda, I'd recommend installing `mamba`, [instructions here](https://mamba.readthedocs.io/en/latest/installation.html) instead. - if you do have `mamba`, replace the word `conda` with `mamba` in the following commands. Download an environment file that points to this branch: ``` curl -JLO https://raw.githubusercontent.com/bluegenes/2023-demo-sourmash-LIN/main/genome-lin.yml ``` Create a virtual environment using this file: ``` conda env create -f genome-lin.yml ``` Activate that environment, which is named `tax-genome-lins`: ``` conda activate tax-genome-lins ``` make sure `--lins` is in the `--help` for `sourmash tax genome`: ``` sourmash tax genome --help ``` ## Command to run The command to run is this one: ``` sourmash tax genome -g $gather_csv -t $taxonomy_csv \ --lins --lingroup $lingroups_csv ``` The gather file should be the result of running sourmash gather using a genome query, since `sourmash tax genome` tries to annotate the file to the best lineage produced by sourmash gather. --------- Co-authored-by: C. Titus Brown <titus@idyll.org>
sourmash-bio · Apr 5, 2023 · bd2fdcc · bd2fdcc
1 parent fd53c50
commit bd2fdcc
Show file tree

Hide file tree

Showing 8 changed files with 148 additions and 41 deletions.
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -826,6 +826,13 @@ Related lingroup subpaths will be grouped in output, but exact ordering may chan
 Optionally, `genome` can instead report classifications at a desired `rank`,
  regardless of match threshold (`--rank` argument, e.g. `--rank species`).
 
+If using `--lins` taxonomy, you can also provide a `--lingroup` file containing two
+columns, `name`, and `lin`, which provide a series of lin prefixes of interest.
+If provided,  genome classification will be restricted to provided lingroups only.
+All other options (`--rank`, `--ani-threshold`, etc) should continue to function.
+If you specify a `--rank` that does not have an associated lingroup, sourmash will
+notify you that you eliminated all classification options.
+
 Note that these thresholds and strategies are under active testing.
 
 To illustrate the utility of `genome`, let's consider a signature consisting

diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py
@@ -34,7 +34,7 @@
 import argparse
 import sourmash
 from sourmash.logging import notify, print_results, error
-from sourmash.cli.utils import add_tax_threshold_arg
+from sourmash.cli.utils import add_tax_threshold_arg, check_rank, check_tax_outputs, add_rank_arg
 
 def subparser(subparsers):
     subparser = subparsers.add_parser('genome',
@@ -65,10 +65,6 @@ def subparser(subparsers):
         '--output-dir', default= "",
         help='directory for output files'
     )
-    subparser.add_argument(
-        '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'],
-        help='Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.'
-    )
     subparser.add_argument(
         '--keep-full-identifiers', action='store_true',
         help='do not split identifiers on whitespace'
@@ -90,25 +86,26 @@ def subparser(subparsers):
         '-f', '--force', action = 'store_true',
         help='continue past survivable errors in loading taxonomy database or gather results',
     )
+    subparser.add_argument(
+        '--lins', '--lin-taxonomy', action='store_true', default=False,
+        help="use LIN taxonomy in place of standard taxonomic ranks.  Note that the taxonomy CSV must contain 'lin' lineage information."
+    )
+    subparser.add_argument(
+        '--lingroup', '--lingroups', metavar='FILE', default=None,
+        help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups."
+    )
     add_tax_threshold_arg(subparser, 0.1)
+    add_rank_arg(subparser)
 
 
 def main(args):
     import sourmash
     try:
         if not args.gather_csv and not args.from_file:
             raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.")
-        # handle output formats
-        print(args.output_format)
-        if not args.rank:
-            if any(x in ["krona"] for x in args.output_format):
-                raise ValueError(f"Rank (--rank) is required for krona output format.")
-        if len(args.output_format) > 1:
-            if args.output_base == "-":
-                raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}")
-        elif not args.output_format:
-            # change to "human" for 5.0
-            args.output_format = ["csv_summary"]
+        if args.rank:
+            args.rank = check_rank(args)
+        args.output_format = check_tax_outputs(args, rank_required = ['krona'])
 
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")

diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py
@@ -93,7 +93,7 @@ def main(args):
             raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.")
         if args.rank:
             args.rank = check_rank(args)
-        args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'])
+        args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'], use_lingroup_format=True)
 
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")

diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py
@@ -139,7 +139,6 @@ def check_rank(args):
     standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
     if args.lins:
         if args.rank.isdigit(): 
-        #if isinstance(args.rank, int):
             return str(args.rank)
         raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position.")
     elif args.rank in standard_ranks:
@@ -152,12 +151,13 @@ def add_rank_arg(parser):
     parser.add_argument(
         '-r', '--rank',
         '--position', '--lin-position',
-        help="For non-default output formats: Summarize genome taxonomy at this rank (or LIN position) and above. \
-              Note that the taxonomy CSV must contain lineage information at this rank (or LIN position). \
+        help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \
+              Note that the taxonomy CSV must contain lineage information at this rank, and that LIN positions start at 0. \
               Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position"
     )
 
-def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None):
+
+def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None, use_lingroup_format=False):
     "Handle ouput format combinations"
     # check that rank is passed for formats requiring rank.
     if not args.rank:
@@ -171,7 +171,7 @@ def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins =
                 raise ValueError(f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}")
         # check that lingroup file exists if needed
         if args.lingroup:
-            if "lingroup" not in args.output_format:
+            if use_lingroup_format and "lingroup" not in args.output_format:
                 args.output_format.append("lingroup")
         elif "lingroup" in args.output_format:
             raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.")

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -202,8 +202,15 @@ def genome(args):
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
                        keep_full_identifiers=args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
-                       force=args.force)
+                       force=args.force, lins=args.lins)
         available_ranks = tax_assign.available_ranks
+
+        lg_ranks=None
+        all_lgs=None
+        if args.lingroup:
+            lingroups = tax_utils.read_lingroups(args.lingroup)
+            lg_ranks, all_lgs = tax_utils.parse_lingroups(lingroups)
+
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")
         sys.exit(-1)
@@ -224,7 +231,7 @@ def genome(args):
                                                                                        fail_on_missing_taxonomy=args.fail_on_missing_taxonomy,
                                                                                        keep_full_identifiers=args.keep_full_identifiers,
                                                                                        keep_identifier_versions = args.keep_identifier_versions,
-                                                                                       )
+                                                                                       lins=args.lins)
 
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")
@@ -239,7 +246,8 @@ def genome(args):
         try:
             queryResult.build_classification_result(rank=args.rank,
                                                     ani_threshold=args.ani_threshold,
-                                                    containment_threshold=args.containment_threshold)
+                                                    containment_threshold=args.containment_threshold,
+                                                    lingroup_ranks=lg_ranks, lingroups=all_lgs)
 
         except ValueError as exc:
             error(f"ERROR: {str(exc)}")

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -19,7 +19,7 @@
 __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
            'load_gather_results', 'check_and_load_gather_csvs'
            'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank'
-           'format_for_krona', 'write_output', 'write_bioboxes',
+           'format_for_krona', 'write_output', 'write_bioboxes', 'parse_lingroups',
            'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
            'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo']
 
@@ -632,6 +632,20 @@ def read_lingroups(lingroup_csv):
     return lingroupD
 
 
+def parse_lingroups(lingroupD):
+    # find the ranks we need to consider
+    all_lgs = set()
+    lg_ranks = set()
+    for lg_prefix in lingroupD.keys():
+        # store lineage info for LCA pathfinding
+        lg_info = LINLineageInfo(lineage_str=lg_prefix)
+        all_lgs.add(lg_info)
+        # store rank so we only go through summarized results at these ranks
+        lg_rank = str(lg_info.lowest_rank)
+        lg_ranks.add(lg_rank)
+    return lg_ranks, all_lgs
+
+
 def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False,
                         skip_idents = None, fail_on_missing_taxonomy=False,
                         keep_full_identifiers=False, keep_identifier_versions=False,
@@ -1980,7 +1994,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False):
                                               fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani)
                 self.summarized_lineage_results[rank].append(sres)
 
-    def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False):
+    def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False, lingroup_ranks=None, lingroups=None):
         if containment_threshold is not None and not 0 <= containment_threshold <= 1:
             raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).")
         if ani_threshold is not None and not 0 <= ani_threshold <= 1:
@@ -1996,7 +2010,13 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
                 raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}")
             else:
                 self.classified_ranks = [rank]
+        if lingroup_ranks:
+            notify("Restricting classification to lingroups.")
+            self.classified_ranks = [x for x in self.classified_ranks if x in lingroup_ranks]
+        if not self.classified_ranks:
+            raise ValueError(f"Error: no ranks remain for classification.")
         # CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold
+        classif = None
         for this_rank in self.classified_ranks: # ascending order or just single rank
             # reset for this rank
             f_weighted=0.0
@@ -2008,6 +2028,10 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
             sorted_sum_uniq_to_query.sort(key = lambda x: -x[1])
             # select best-at-rank only
             this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0]
+            # if in desired lineage groups, continue (or??)
+            if lingroups and this_lineage not in lingroups:
+                # ignore this lineage and continue up
+                continue
             bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
             f_weighted = self.sum_uniq_weighted[this_rank][this_lineage]
 
@@ -2170,23 +2194,14 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
         header = ["name", "lin", "percent_containment", "num_bp_contained"]
 
         if self.query_info.total_weighted_hashes == 0:
-            raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0")
+            raise ValueError("ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0")
 
         # find the ranks we need to consider
-        all_lgs = set()
-        lg_ranks = set()
-        for lg_prefix in LINgroupsD.keys():
-            # store lineage info for LCA pathfinding
-            lg_info = LINLineageInfo(lineage_str=lg_prefix)
-            all_lgs.add(lg_info)
-            # store rank so we only go through summarized results at these ranks
-            lg_rank = int(lg_info.lowest_rank)
-            lg_ranks.add(lg_rank)
+        lg_ranks, all_lgs = parse_lingroups(LINgroupsD)
 
         # grab summarized results matching LINgroup prefixes
         lg_results = {}
         for rank in lg_ranks:
-            rank = str(rank)
             rank_results = self.summarized_lineage_results[rank]
             for res in rank_results:
                 if res.lineage in all_lgs:# is this lineage in the list of LINgroups?

diff --git a/tests/test_tax.py b/tests/test_tax.py
@@ -505,7 +505,7 @@ def test_genome_no_rank_krona(runtmp):
 
     with pytest.raises(SourmashCommandFailed) as exc:
         runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
-    assert "Rank (--rank) is required for krona output format." in str(exc.value)
+    assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value)
 
 
 def test_metagenome_rank_not_available(runtmp):
@@ -1081,8 +1081,10 @@ def test_genome_empty_gather_results(runtmp):
     with pytest.raises(SourmashCommandFailed) as exc:
         runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)
 
-    assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value)
     assert runtmp.last_result.status == -1
+    print(runtmp.last_result.err)
+    print(runtmp.last_result.out)
+    assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value)
 
 
 def test_genome_bad_gather_header(runtmp):
@@ -1978,6 +1980,7 @@ def test_genome_empty_gather_results_with_csv_force(runtmp):
     assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
     assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out
 
+
 def test_genome_containment_threshold_bounds(runtmp):
     c = runtmp
     g_csv = utils.get_test_data('tax/test1.gather.csv')
@@ -2212,6 +2215,83 @@ def test_annotate_no_gather_csv(runtmp):
     print(runtmp.last_result.err)
 
 
+def test_genome_LIN(runtmp):
+    # test basic genome with LIN taxonomy
+    c = runtmp
+
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.93')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
+    assert "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925" in c.last_result.out
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.924')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
+    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--rank', '4')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
+    assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+
+
+def test_genome_LIN_lingroups(runtmp):
+    # test basic genome with LIN taxonomy
+    c = runtmp
+
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')
+
+    lg_file = runtmp.output("test.lg.csv")
+
+    with open(lg_file, 'w') as out:
+        out.write('lin,name\n')
+        out.write('0;0;0,lg1\n')
+        out.write('1;0;0,lg2\n')
+        out.write('2;0;0,lg3\n')
+        out.write('1;0;1,lg3\n')
+        # write a 19 so we can check the end
+        out.write('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
+    assert "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+
+    c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file, '--ani-threshold', '0.924')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
+    assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out
+
+
 def test_annotate_0(runtmp):
     # test annotate basics
     c = runtmp

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
@@ -2912,7 +2912,7 @@ def test_make_lingroup_results_fail_pre_v450():
     with pytest.raises(ValueError) as exc:
         q_res.make_lingroup_results(lingroupD)
     print(str(exc))
-    assert "cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0" in str(exc)
+    assert "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" in str(exc)
 
 
 def test_read_lingroups(runtmp):