From ca57995fe7fc7f153ba34b1c0de58c0440014347 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 12 Jan 2023 20:44:46 -0800 Subject: [PATCH] [MRG] add `--scaled` to sourmash compare (#2414) Fixes https://github.com/sourmash-bio/sourmash/issues/2398 Co-authored-by: Tessa Pierce Ward --- src/sourmash/cli/compare.py | 4 +- src/sourmash/cli/utils.py | 2 +- src/sourmash/commands.py | 17 ++++++-- tests/test_sourmash.py | 79 ++++++++++++++++++++++++++++++++++++- 4 files changed, 96 insertions(+), 6 deletions(-) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index 7659c0697c..54864d6c93 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -28,7 +28,8 @@ """ from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, - add_picklist_args, add_pattern_args) + add_picklist_args, add_pattern_args, + add_scaled_arg) def subparser(subparsers): @@ -95,6 +96,7 @@ def subparser(subparsers): add_moltype_args(subparser) add_picklist_args(subparser) add_pattern_args(subparser) + add_scaled_arg(subparser) def main(args): diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 7a5ac0b770..d92c726b2d 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -123,7 +123,7 @@ def command_list(dirpath): def add_scaled_arg(parser, default=None): parser.add_argument( '--scaled', metavar='FLOAT', type=check_scaled_bounds, - help='scaled value should be between 100 and 1e6' + help='downsample to this scaled; value should be between 100 and 1e6' ) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 4bf21a1441..b0650db0f0 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -102,7 +102,7 @@ def compare(args): # complain if it's not all one or the other if is_scaled != is_scaled_2: - error('cannot mix scaled signatures with bounded signatures') + error('ERROR: cannot mix scaled signatures with num signatures') sys.exit(-1) is_containment = False @@ -134,17 +134,25 @@ def compare(args): if track_abundances: notify('NOTE: --containment, --max-containment, --avg-containment, and --estimate-ani ignore signature abundances.') - # if using --scaled, downsample appropriately + # if using scaled sketches or --scaled, downsample to common max scaled. printed_scaled_msg = False if is_scaled: max_scaled = max(s.minhash.scaled for s in siglist) + if args.scaled: + args.scaled = int(args.scaled) + + max_scaled = max(max_scaled, args.scaled) + if max_scaled > args.scaled: + notify(f"WARNING: --scaled specified {args.scaled}, but max scaled of sketches is {max_scaled}") + notify(f"WARNING: continuing with scaled value of {max_scaled}.") + new_siglist = [] for s in siglist: if not size_may_be_inaccurate and not s.minhash.size_is_accurate(): size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: if not printed_scaled_msg: - notify(f'downsampling to scaled value of {format(max_scaled)}') + notify(f'NOTE: downsampling to scaled value of {format(max_scaled)}') printed_scaled_msg = True with s.update() as s: s.minhash = s.minhash.downsample(scaled=max_scaled) @@ -152,6 +160,9 @@ def compare(args): else: new_siglist.append(s) siglist = new_siglist + elif args.scaled is not None: + error("ERROR: cannot specify --scaled with non-scaled signatures.") + sys.exit(-1) if len(siglist) == 0: error('no signatures!') diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 06e427eda8..72be1e03cf 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -414,7 +414,7 @@ def test_compare_output_csv_gz(runtmp): def test_compare_downsample(runtmp): - # test 'compare' with --downsample + # test 'compare' with implicit downsampling c = runtmp testdata1 = utils.get_test_data('short.fa') c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) @@ -433,6 +433,83 @@ def test_compare_downsample(runtmp): assert lines[2].startswith('0.6666') +def test_compare_downsample_scaled(runtmp): + # test 'compare' with explicit --scaled downsampling + c = runtmp + testdata1 = utils.get_test_data('short.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) + + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + + c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', + '--scaled', '300') + + print(c.last_result.status, c.last_result.out, c.last_result.err) + assert 'downsampling to scaled value of 300' in c.last_result.err + with open(c.output('xxx')) as fp: + lines = fp.readlines() + assert len(lines) == 3 + assert lines[1].startswith('1.0,0.0') + assert lines[2].startswith('0.0') + + +def test_compare_downsample_scaled_too_low(runtmp): + # test 'compare' with explicit --scaled downsampling, but lower than min + c = runtmp + testdata1 = utils.get_test_data('short.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1) + + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + + c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx', + '--scaled', '100') + + print(c.last_result.status, c.last_result.out, c.last_result.err) + assert 'downsampling to scaled value of 200' in c.last_result.err + assert "WARNING: --scaled specified 100, but max scaled of sketches is 200" in c.last_result.err + with open(c.output('xxx')) as fp: + lines = fp.readlines() + assert len(lines) == 3 + assert lines[1].startswith('1.0,0.6666') + assert lines[2].startswith('0.6666') + + +def test_compare_downsample_scaled_fail_num(runtmp): + # test 'compare' with explicit --scaled downsampling; fail on num sketch + c = runtmp + testdata1 = utils.get_test_data('short.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1) + + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2) + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', + '--csv', 'xxx', '--scaled', '300') + + print(c.last_result.status, c.last_result.out, c.last_result.err) + assert "cannot mix scaled signatures with num signatures" in c.last_result.err + + +def test_compare_downsample_scaled_fail_all_num(runtmp): + # test 'compare' with explicit --scaled downsampling; fail on all num sketches + c = runtmp + testdata1 = utils.get_test_data('short.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=20', testdata1) + + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=30', testdata2) + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', + '--csv', 'xxx', '--scaled', '300') + + print(c.last_result.status, c.last_result.out, c.last_result.err) + assert "ERROR: cannot specify --scaled with non-scaled signatures." in c.last_result.err + + def test_compare_output_multiple_k(runtmp): # test 'compare' when given multiple k-mer sizes -> should fail c = runtmp