Skip to content

Commit

Permalink
MRG: tax genome with --lins (#2519)
Browse files Browse the repository at this point in the history
Use `--lins` and `--lingroup` with `tax genome`.

Instead of building a separate `lingroup` output format, genome
classification with `--lingroup` input will be restricted to lingroups
only. All other options (`--rank`, `--ani-threshold`, etc) should
continue to function. However, if you pass in a rank that is not present
in your `lingroups`, you may eliminate all classification options.


## Testing

### Install on your own computer/cluster: 
Here is one way to test this code before it gets fully integrated into
sourmash:
- If you don't have conda, I'd recommend installing `mamba`,
[instructions
here](https://mamba.readthedocs.io/en/latest/installation.html) instead.
- if you do have `mamba`, replace the word `conda` with `mamba` in the
following commands.

Download an environment file that points to this branch:
```
curl -JLO https://raw.githubusercontent.com/bluegenes/2023-demo-sourmash-LIN/main/genome-lin.yml
```
Create a virtual environment using this file:
```
conda env create -f genome-lin.yml
```
Activate that environment, which is named `tax-genome-lins`:

```
conda activate tax-genome-lins
```
make sure `--lins` is in the `--help` for `sourmash tax genome`:
```
sourmash tax genome --help
```

## Command to run

The command to run is this one: 

```
sourmash tax genome -g $gather_csv  -t $taxonomy_csv \
                        --lins --lingroup $lingroups_csv
```

The gather file should be the result of running sourmash gather using a
genome query, since `sourmash tax genome` tries to annotate the file to
the best lineage produced by sourmash gather.

---------

Co-authored-by: C. Titus Brown <titus@idyll.org>
  • Loading branch information
bluegenes and ctb committed Apr 5, 2023
1 parent fd53c50 commit bd2fdcc
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 41 deletions.
7 changes: 7 additions & 0 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,13 @@ Related lingroup subpaths will be grouped in output, but exact ordering may chan
Optionally, `genome` can instead report classifications at a desired `rank`,
regardless of match threshold (`--rank` argument, e.g. `--rank species`).

If using `--lins` taxonomy, you can also provide a `--lingroup` file containing two
columns, `name`, and `lin`, which provide a series of lin prefixes of interest.
If provided, genome classification will be restricted to provided lingroups only.
All other options (`--rank`, `--ani-threshold`, etc) should continue to function.
If you specify a `--rank` that does not have an associated lingroup, sourmash will
notify you that you eliminated all classification options.

Note that these thresholds and strategies are under active testing.

To illustrate the utility of `genome`, let's consider a signature consisting
Expand Down
29 changes: 13 additions & 16 deletions src/sourmash/cli/tax/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
import argparse
import sourmash
from sourmash.logging import notify, print_results, error
from sourmash.cli.utils import add_tax_threshold_arg
from sourmash.cli.utils import add_tax_threshold_arg, check_rank, check_tax_outputs, add_rank_arg

def subparser(subparsers):
subparser = subparsers.add_parser('genome',
Expand Down Expand Up @@ -65,10 +65,6 @@ def subparser(subparsers):
'--output-dir', default= "",
help='directory for output files'
)
subparser.add_argument(
'-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'],
help='Summarize genome taxonomy at this rank and above. Note that the taxonomy CSV must contain lineage information at this rank.'
)
subparser.add_argument(
'--keep-full-identifiers', action='store_true',
help='do not split identifiers on whitespace'
Expand All @@ -90,25 +86,26 @@ def subparser(subparsers):
'-f', '--force', action = 'store_true',
help='continue past survivable errors in loading taxonomy database or gather results',
)
subparser.add_argument(
'--lins', '--lin-taxonomy', action='store_true', default=False,
help="use LIN taxonomy in place of standard taxonomic ranks. Note that the taxonomy CSV must contain 'lin' lineage information."
)
subparser.add_argument(
'--lingroup', '--lingroups', metavar='FILE', default=None,
help="CSV containing 'name', 'lin' columns, where 'lin' is the lingroup prefix. Will restrict classification to these groups."
)
add_tax_threshold_arg(subparser, 0.1)
add_rank_arg(subparser)


def main(args):
import sourmash
try:
if not args.gather_csv and not args.from_file:
raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.")
# handle output formats
print(args.output_format)
if not args.rank:
if any(x in ["krona"] for x in args.output_format):
raise ValueError(f"Rank (--rank) is required for krona output format.")
if len(args.output_format) > 1:
if args.output_base == "-":
raise ValueError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}")
elif not args.output_format:
# change to "human" for 5.0
args.output_format = ["csv_summary"]
if args.rank:
args.rank = check_rank(args)
args.output_format = check_tax_outputs(args, rank_required = ['krona'])

except ValueError as exc:
error(f"ERROR: {str(exc)}")
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/cli/tax/metagenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def main(args):
raise ValueError(f"No gather CSVs found! Please input via '-g' or '--from-file'.")
if args.rank:
args.rank = check_rank(args)
args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'])
args.output_format = check_tax_outputs(args, rank_required = ['krona', 'lineage_summary'], incompatible_with_lins = ['bioboxes', 'kreport'], use_lingroup_format=True)

except ValueError as exc:
error(f"ERROR: {str(exc)}")
Expand Down
10 changes: 5 additions & 5 deletions src/sourmash/cli/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,6 @@ def check_rank(args):
standard_ranks =['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom']
if args.lins:
if args.rank.isdigit():
#if isinstance(args.rank, int):
return str(args.rank)
raise argparse.ArgumentTypeError(f"Invalid '--rank'/'--position' input: '{args.rank}'. '--lins' is specified. Rank must be an integer corresponding to a LIN position.")
elif args.rank in standard_ranks:
Expand All @@ -152,12 +151,13 @@ def add_rank_arg(parser):
parser.add_argument(
'-r', '--rank',
'--position', '--lin-position',
help="For non-default output formats: Summarize genome taxonomy at this rank (or LIN position) and above. \
Note that the taxonomy CSV must contain lineage information at this rank (or LIN position). \
help="For non-default output formats. Classify to this rank (tax genome) or summarize taxonomy at this rank and above (tax metagenome). \
Note that the taxonomy CSV must contain lineage information at this rank, and that LIN positions start at 0. \
Choices: 'strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom' or an integer LIN position"
)

def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None):

def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins = None, use_lingroup_format=False):
"Handle ouput format combinations"
# check that rank is passed for formats requiring rank.
if not args.rank:
Expand All @@ -171,7 +171,7 @@ def check_tax_outputs(args, rank_required = ["krona"], incompatible_with_lins =
raise ValueError(f"The following outputs are incompatible with '--lins': : {', '.join(incompatible_with_lins)}")
# check that lingroup file exists if needed
if args.lingroup:
if "lingroup" not in args.output_format:
if use_lingroup_format and "lingroup" not in args.output_format:
args.output_format.append("lingroup")
elif "lingroup" in args.output_format:
raise ValueError(f"Must provide lingroup csv via '--lingroup' in order to output a lingroup report.")
Expand Down
14 changes: 11 additions & 3 deletions src/sourmash/tax/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,8 +202,15 @@ def genome(args):
tax_assign = MultiLineageDB.load(args.taxonomy_csv,
keep_full_identifiers=args.keep_full_identifiers,
keep_identifier_versions=args.keep_identifier_versions,
force=args.force)
force=args.force, lins=args.lins)
available_ranks = tax_assign.available_ranks

lg_ranks=None
all_lgs=None
if args.lingroup:
lingroups = tax_utils.read_lingroups(args.lingroup)
lg_ranks, all_lgs = tax_utils.parse_lingroups(lingroups)

except ValueError as exc:
error(f"ERROR: {str(exc)}")
sys.exit(-1)
Expand All @@ -224,7 +231,7 @@ def genome(args):
fail_on_missing_taxonomy=args.fail_on_missing_taxonomy,
keep_full_identifiers=args.keep_full_identifiers,
keep_identifier_versions = args.keep_identifier_versions,
)
lins=args.lins)

except ValueError as exc:
error(f"ERROR: {str(exc)}")
Expand All @@ -239,7 +246,8 @@ def genome(args):
try:
queryResult.build_classification_result(rank=args.rank,
ani_threshold=args.ani_threshold,
containment_threshold=args.containment_threshold)
containment_threshold=args.containment_threshold,
lingroup_ranks=lg_ranks, lingroups=all_lgs)

except ValueError as exc:
error(f"ERROR: {str(exc)}")
Expand Down
41 changes: 28 additions & 13 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
__all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
'load_gather_results', 'check_and_load_gather_csvs'
'report_missing_and_skipped_identities', 'aggregate_by_lineage_at_rank'
'format_for_krona', 'write_output', 'write_bioboxes',
'format_for_krona', 'write_output', 'write_bioboxes', 'parse_lingroups',
'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
'MultiLineageDB', 'RankLineageInfo', 'LINLineageInfo']

Expand Down Expand Up @@ -632,6 +632,20 @@ def read_lingroups(lingroup_csv):
return lingroupD


def parse_lingroups(lingroupD):
# find the ranks we need to consider
all_lgs = set()
lg_ranks = set()
for lg_prefix in lingroupD.keys():
# store lineage info for LCA pathfinding
lg_info = LINLineageInfo(lineage_str=lg_prefix)
all_lgs.add(lg_info)
# store rank so we only go through summarized results at these ranks
lg_rank = str(lg_info.lowest_rank)
lg_ranks.add(lg_rank)
return lg_ranks, all_lgs


def load_gather_results(gather_csv, tax_assignments, *, seen_queries=None, force=False,
skip_idents = None, fail_on_missing_taxonomy=False,
keep_full_identifiers=False, keep_identifier_versions=False,
Expand Down Expand Up @@ -1980,7 +1994,7 @@ def build_summarized_result(self, single_rank=None, force_resummarize=False):
fraction=f_unique, bp_match_at_rank=bp_intersect_at_rank, query_ani_at_rank=query_ani)
self.summarized_lineage_results[rank].append(sres)

def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False):
def build_classification_result(self, rank=None, ani_threshold=None, containment_threshold=0.1, force_resummarize=False, lingroup_ranks=None, lingroups=None):
if containment_threshold is not None and not 0 <= containment_threshold <= 1:
raise ValueError(f"Containment threshold must be between 0 and 1 (input value: {containment_threshold}).")
if ani_threshold is not None and not 0 <= ani_threshold <= 1:
Expand All @@ -1996,7 +2010,13 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
raise ValueError(f"Error: rank '{rank}' not in summarized rank(s), {','.join(self.summarized_ranks)}")
else:
self.classified_ranks = [rank]
if lingroup_ranks:
notify("Restricting classification to lingroups.")
self.classified_ranks = [x for x in self.classified_ranks if x in lingroup_ranks]
if not self.classified_ranks:
raise ValueError(f"Error: no ranks remain for classification.")
# CLASSIFY using summarization--> best only result. Best way = use ANI or containment threshold
classif = None
for this_rank in self.classified_ranks: # ascending order or just single rank
# reset for this rank
f_weighted=0.0
Expand All @@ -2008,6 +2028,10 @@ def build_classification_result(self, rank=None, ani_threshold=None, containment
sorted_sum_uniq_to_query.sort(key = lambda x: -x[1])
# select best-at-rank only
this_lineage, f_unique_at_rank = sorted_sum_uniq_to_query[0]
# if in desired lineage groups, continue (or??)
if lingroups and this_lineage not in lingroups:
# ignore this lineage and continue up
continue
bp_intersect_at_rank = self.sum_uniq_bp[this_rank][this_lineage]
f_weighted = self.sum_uniq_weighted[this_rank][this_lineage]

Expand Down Expand Up @@ -2170,23 +2194,14 @@ def make_lingroup_results(self, LINgroupsD): # LingroupsD is dictionary {lg_pref
header = ["name", "lin", "percent_containment", "num_bp_contained"]

if self.query_info.total_weighted_hashes == 0:
raise ValueError("ERROR: cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0")
raise ValueError("ERROR: cannot produce 'lingroup' format from gather results before sourmash v4.5.0")

# find the ranks we need to consider
all_lgs = set()
lg_ranks = set()
for lg_prefix in LINgroupsD.keys():
# store lineage info for LCA pathfinding
lg_info = LINLineageInfo(lineage_str=lg_prefix)
all_lgs.add(lg_info)
# store rank so we only go through summarized results at these ranks
lg_rank = int(lg_info.lowest_rank)
lg_ranks.add(lg_rank)
lg_ranks, all_lgs = parse_lingroups(LINgroupsD)

# grab summarized results matching LINgroup prefixes
lg_results = {}
for rank in lg_ranks:
rank = str(rank)
rank_results = self.summarized_lineage_results[rank]
for res in rank_results:
if res.lineage in all_lgs:# is this lineage in the list of LINgroups?
Expand Down
84 changes: 82 additions & 2 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def test_genome_no_rank_krona(runtmp):

with pytest.raises(SourmashCommandFailed) as exc:
runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona')
assert "Rank (--rank) is required for krona output format." in str(exc.value)
assert "ERROR: Rank (--rank) is required for krona output formats" in str(exc.value)


def test_metagenome_rank_not_available(runtmp):
Expand Down Expand Up @@ -1081,8 +1081,10 @@ def test_genome_empty_gather_results(runtmp):
with pytest.raises(SourmashCommandFailed) as exc:
runtmp.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax)

assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value)
assert runtmp.last_result.status == -1
print(runtmp.last_result.err)
print(runtmp.last_result.out)
assert f"Cannot read gather results from '{g_csv}'. Is file empty?" in str(exc.value)


def test_genome_bad_gather_header(runtmp):
Expand Down Expand Up @@ -1978,6 +1980,7 @@ def test_genome_empty_gather_results_with_csv_force(runtmp):
assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out
assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000' in c.last_result.out


def test_genome_containment_threshold_bounds(runtmp):
c = runtmp
g_csv = utils.get_test_data('tax/test1.gather.csv')
Expand Down Expand Up @@ -2212,6 +2215,83 @@ def test_annotate_no_gather_csv(runtmp):
print(runtmp.last_result.err)


def test_genome_LIN(runtmp):
# test basic genome with LIN taxonomy
c = runtmp

g_csv = utils.get_test_data('tax/test1.gather.csv')
tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')

c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.93')

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
assert "test1,below_threshold,0,0.089,1,md5,test1.sig,0.057,444000,0.925" in c.last_result.out

c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--ani-threshold', '0.924')

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out

c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--rank', '4')

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
assert "test1,below_threshold,4,0.088,0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out


def test_genome_LIN_lingroups(runtmp):
# test basic genome with LIN taxonomy
c = runtmp

g_csv = utils.get_test_data('tax/test1.gather.csv')
tax = utils.get_test_data('tax/test.LIN-taxonomy.csv')

lg_file = runtmp.output("test.lg.csv")

with open(lg_file, 'w') as out:
out.write('lin,name\n')
out.write('0;0;0,lg1\n')
out.write('1;0;0,lg2\n')
out.write('2;0;0,lg3\n')
out.write('1;0;1,lg3\n')
# write a 19 so we can check the end
out.write('0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,lg4\n')

c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file)

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
assert "test1,below_threshold,2,0.088,0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out

c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, '--lins', '--lingroup', lg_file, '--ani-threshold', '0.924')

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank,query_ani_at_rank" in c.last_result.out
assert "test1,match,19,0.088,0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0,md5,test1.sig,0.058,442000,0.925" in c.last_result.out


def test_annotate_0(runtmp):
# test annotate basics
c = runtmp
Expand Down
2 changes: 1 addition & 1 deletion tests/test_tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2912,7 +2912,7 @@ def test_make_lingroup_results_fail_pre_v450():
with pytest.raises(ValueError) as exc:
q_res.make_lingroup_results(lingroupD)
print(str(exc))
assert "cannot produce 'LINgroup_report' format from gather results before sourmash v4.5.0" in str(exc)
assert "cannot produce 'lingroup' format from gather results before sourmash v4.5.0" in str(exc)


def test_read_lingroups(runtmp):
Expand Down

0 comments on commit bd2fdcc

Please sign in to comment.