From ec6ae9c2591d7d5065b206db773c7817b7e2db49 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 26 Nov 2017 05:40:30 -0800 Subject: [PATCH 1/7] add --scaled to sbt index --- sourmash_lib/commands.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index 9df24aa003..366b249098 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -607,7 +607,8 @@ def index(args): parser.add_argument('-s', '--sparseness', type=float, default=.0, help='What percentage of internal nodes will not be saved. ' 'Ranges from 0.0 (save all nodes) to 1.0 (no nodes saved)') - + parser.add_argument('--scaled', type=float, default=0, + help='downsample signatures to this scaled factor') sourmash_args.add_moltype_args(parser) args = parser.parse_args(args) @@ -628,12 +629,17 @@ def index(args): if args.sparseness < 0 or args.sparseness > 1.0: error('sparseness must be in range [0.0, 1.0].') + if args.scaled: + args.scaled = int(args.scaled) + notify('downsampling signatures to scaled={}', args.scaled) + notify('loading {} files into SBT', len(inp_files)) n = 0 ksizes = set() moltypes = set() for f in inp_files: + notify('\r...reading from {} ({} signatures so far)', f, n, end='') siglist = sig.load_signatures(f, ksize=args.ksize, select_moltype=moltype) @@ -642,6 +648,9 @@ def index(args): ksizes.add(ss.minhash.ksize) moltypes.add(sourmash_args.get_moltype(ss)) + if args.scaled: + ss.minhash.downsample_scaled(args.scaled) + leaf = sourmash_lib.sbtmh.SigLeaf(ss.md5sum(), ss) tree.add_node(leaf) n += 1 @@ -654,6 +663,8 @@ def index(args): ", ".join(map(str, ksizes)), ", ".join(moltypes)) sys.exit(-1) + notify('') + # did we load any!? if n == 0: error('no signatures found to load into tree!? failing.') From 1750e15b287aafa0171d70e597d9aa4129cba309 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 18 Feb 2018 20:30:38 -0800 Subject: [PATCH 2/7] add tests for sourmash index --scaled --- sourmash_lib/commands.py | 4 ++-- tests/test_sourmash.py | 45 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index c379137eae..eb04c97925 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -678,10 +678,10 @@ def index(args): ksizes.add(ss.minhash.ksize) moltypes.add(sourmash_args.get_moltype(ss)) nums.add(ss.minhash.num) - scaleds.add(ss.minhash.scaled) if args.scaled: - ss.minhash.downsample_scaled(args.scaled) + ss.minhash = ss.minhash.downsample_scaled(args.scaled) + scaleds.add(ss.minhash.scaled) leaf = sourmash_lib.sbtmh.SigLeaf(ss.md5sum(), ss) tree.add_node(leaf) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b735b9cd48..7d9f332b2d 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1210,6 +1210,51 @@ def test_do_sourmash_index_multiscaled_fail(): assert 'trying to build an SBT with incompatible signatures.' in err +def test_do_sourmash_index_multiscaled_rescale(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '--scaled', '10', testdata1], + in_directory=location) + status, out, err = utils.runscript('sourmash', + ['compute', '--scaled', '1', testdata2], + in_directory=location) + + status, out, err = utils.runscript('sourmash', + ['index', '-k', '31', 'zzz', + '--scaled', '10', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location, fail_ok=True) + + print(status, out, err) + assert status == 0 + + +def test_do_sourmash_index_multiscaled_rescale_fail(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '--scaled', '10', testdata1], + in_directory=location) + status, out, err = utils.runscript('sourmash', + ['compute', '--scaled', '1', testdata2], + in_directory=location) + + status, out, err = utils.runscript('sourmash', + ['index', '-k', '31', 'zzz', + '--scaled', '5', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location, fail_ok=True) + + print(status, out, err) + assert status == -1 + assert 'new scaled is lower than current sample scaled' in err + + def test_do_sourmash_sbt_search_output(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') From c4190e4ab128f35ed013ed1f34ee1292cdfcf642 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 18 Feb 2018 20:40:37 -0800 Subject: [PATCH 3/7] add a failing test that shows problems with search... sigh. --- tests/test_sourmash.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 7d9f332b2d..d8dab66664 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1580,6 +1580,34 @@ def test_search_metagenome_downsample_containment(): assert '12 matches; showing first 3:' in out +def test_search_metagenome_downsample_index(): + with utils.TempDirectory() as location: + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + # downscale during indexing, rather than during search. + cmd = ['index', 'gcf_all', '-k', '21', '--scaled', '100000'] + cmd.extend(testdata_sigs) + + status, out, err = utils.runscript('sourmash', cmd, + in_directory=location) + + assert os.path.exists(os.path.join(location, 'gcf_all.sbt.json')) + + cmd = 'search {} gcf_all -k 21 --containment' + cmd = cmd.format(query_sig) + status, out, err = utils.runscript('sourmash', cmd.split(' '), + in_directory=location) + + print(out) + print(err) + + assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out + assert '12 matches; showing first 3:' in out + + def test_mash_csv_to_sig(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa.msh.dump') From 2c0497ad71b50fa6d2920502434ab53c422421a5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 23 Feb 2018 06:04:02 -0800 Subject: [PATCH 4/7] add comment --- tests/test_sourmash.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index d8dab66664..3ee677fb4b 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1528,6 +1528,9 @@ def test_search_metagenome_traverse(): assert '13 matches; showing first 3:' in out +# explanation: you cannot downsample a scaled SBT to match a scaled +# signature, so make sure that when you try such a search, it fails! +# (you *can* downsample a signature to match an SBT.) def test_search_metagenome_downsample(): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF*.sig') From 4b28d19050d471a6868909e20519f5a332a43364 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 25 Dec 2018 07:02:41 -0800 Subject: [PATCH 5/7] refactor tests --- tests/test_sourmash.py | 70 +++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index dd98f52df5..172ad1ad33 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1328,49 +1328,43 @@ def test_do_sourmash_index_multiscaled_fail(): assert 'trying to build an SBT with incompatible signatures.' in err -def test_do_sourmash_index_multiscaled_rescale(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '--scaled', '10', testdata1], - in_directory=location) - status, out, err = utils.runscript('sourmash', - ['compute', '--scaled', '1', testdata2], - in_directory=location) - - status, out, err = utils.runscript('sourmash', - ['index', '-k', '31', 'zzz', - '--scaled', '10', - 'short.fa.sig', - 'short2.fa.sig'], - in_directory=location, fail_ok=True) +@utils.in_tempdir +def test_do_sourmash_index_multiscaled_rescale(c): + # test sourmash index --scaled + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') - print(status, out, err) - assert status == 0 + c.run_sourmash('compute', '--scaled', '10', testdata1) + c.run_sourmash('compute', '--scaled', '1', testdata2) + c.run_sourmash('index', '-k', '31', 'zzz', + '--scaled', '10', + 'short.fa.sig', + 'short2.fa.sig') -def test_do_sourmash_index_multiscaled_rescale_fail(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - status, out, err = utils.runscript('sourmash', - ['compute', '--scaled', '10', testdata1], - in_directory=location) - status, out, err = utils.runscript('sourmash', - ['compute', '--scaled', '1', testdata2], - in_directory=location) + print(c) + assert c.last_result.status == 0 - status, out, err = utils.runscript('sourmash', - ['index', '-k', '31', 'zzz', - '--scaled', '5', - 'short.fa.sig', - 'short2.fa.sig'], - in_directory=location, fail_ok=True) - print(status, out, err) - assert status == -1 - assert 'new scaled is lower than current sample scaled' in err +@utils.in_tempdir +def test_do_sourmash_index_multiscaled_rescale_fail(c): + # test sourmash index --scaled with invalid rescaling (10 -> 5) + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + + c.run_sourmash('compute', '--scaled', '10', testdata1) + c.run_sourmash('compute', '--scaled', '1', testdata2) + # this should fail: cannot go from a scaled value of 10 to 5 + + with pytest.raises(ValueError) as e: + c.run_sourmash('index', '-k', '31', 'zzz', + '--scaled', '5', + 'short.fa.sig', + 'short2.fa.sig') + + print(e.value) + assert c.last_result.status == -1 + assert 'new scaled 5 is lower than current sample scaled 10' in c.last_result.err def test_do_sourmash_sbt_search_output(): From 962a213ab5ee07e055ebfde93f2ee5a189fd6ccd Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 26 Dec 2018 05:42:54 -0800 Subject: [PATCH 6/7] downsample signature to match SBT on search --- sourmash/search.py | 15 +++++++++++- sourmash/sourmash_args.py | 1 + tests/test_sourmash.py | 50 +++++++++++++++++++-------------------- 3 files changed, 40 insertions(+), 26 deletions(-) diff --git a/sourmash/search.py b/sourmash/search.py index 16eca15847..fa6663c6e5 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -46,7 +46,20 @@ def search_databases(query, databases, threshold, do_containment, best_only, search_fn = SearchMinHashesFindBest().search tree = obj - for leaf in tree.find(search_fn, query, threshold): + + # figure out scaled value of tree, downsample query if needed. + leaf = next(iter(tree.leaves())) + tree_mh = leaf.data.minhash + + tree_query = query + if tree_mh.scaled and query.minhash.scaled and \ + tree_mh.scaled > query.minhash.scaled: + resampled_query_mh = tree_query.minhash + resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled) + tree_query = SourmashSignature(resampled_query_mh) + + # now, search! + for leaf in tree.find(search_fn, tree_query, threshold): similarity = query_match(leaf.data) # tree search should always/only return matches above threshold diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 02e9030fbe..9e10cbeb95 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -190,6 +190,7 @@ def check_signatures_are_compatible(query, subject): def check_tree_is_compatible(treename, tree, query, is_similarity_query): + # get a minhash from the tree leaf = next(iter(tree.leaves())) tree_mh = leaf.data.minhash diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 172ad1ad33..40121f19bd 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1785,32 +1785,32 @@ def test_search_metagenome_downsample_containment(): assert '12 matches; showing first 3:' in out -def test_search_metagenome_downsample_index(): - with utils.TempDirectory() as location: - testdata_glob = utils.get_test_data('gather/GCF*.sig') - testdata_sigs = glob.glob(testdata_glob) - - query_sig = utils.get_test_data('gather/combined.sig') - - # downscale during indexing, rather than during search. - cmd = ['index', 'gcf_all', '-k', '21', '--scaled', '100000'] - cmd.extend(testdata_sigs) - - status, out, err = utils.runscript('sourmash', cmd, - in_directory=location) - - assert os.path.exists(os.path.join(location, 'gcf_all.sbt.json')) - - cmd = 'search {} gcf_all -k 21 --containment' - cmd = cmd.format(query_sig) - status, out, err = utils.runscript('sourmash', cmd.split(' '), - in_directory=location) - - print(out) - print(err) +@utils.in_tempdir +def test_search_metagenome_downsample_index(c): + # does same search as search_metagenome_downsample_containment but + # rescales during indexing + # + # for now, this test should fail; we need to clean up some internal + # stuff before we can properly implement this! + # + testdata_glob = utils.get_test_data('gather/GCF*.sig') + testdata_sigs = glob.glob(testdata_glob) + + query_sig = utils.get_test_data('gather/combined.sig') + + # downscale during indexing, rather than during search. + c.run_sourmash('index', 'gcf_all', '-k', '21', '--scaled', '100000', + *testdata_sigs) + + assert os.path.exists(c.output('gcf_all.sbt.json')) + + c.run_sourmash('search', query_sig, 'gcf_all', '-k', '21', + '--containment') + print(c) - assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in out - assert '12 matches; showing first 3:' in out + assert ' 32.9% NC_003198.1 Salmonella enterica subsp. enterica serovar T...' in str(c) + assert ' 29.7% NC_003197.2 Salmonella enterica subsp. enterica serovar T...' in str(c) + assert '12 matches; showing first 3:' in str(c) def test_mash_csv_to_sig(): From 56c8f64f14ef3e091bd49deb773d6cc1e5f0335f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 26 Dec 2018 05:46:26 -0800 Subject: [PATCH 7/7] adjust indentation --- sourmash/commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index a6cf94b4f8..ecfce4c6d1 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -845,8 +845,8 @@ def search(args): # set up the search databases databases = sourmash_args.load_dbs_and_sigs(args.databases, query, - not args.containment, - args.traverse_directory) + not args.containment, + args.traverse_directory) if not len(databases): error('Nothing found to search!')