From 1d2ce25bd6d3502ee003c33675e50c30d266e815 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:31:49 -0700 Subject: [PATCH 1/7] add tests for --scaled 1 and 2 --- sourmash_lib/test_sourmash.py | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sourmash_lib/test_sourmash.py b/sourmash_lib/test_sourmash.py index 3eb77dba2f..e116e47bb5 100644 --- a/sourmash_lib/test_sourmash.py +++ b/sourmash_lib/test_sourmash.py @@ -339,6 +339,44 @@ def test_do_sourmash_compute_with_cardinality(): assert set(cards) == set([ 966, 986 ]) +def test_do_sourmash_compute_with_scaled_1(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '1', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == set(0) + + +def test_do_sourmash_compute_with_scaled_2(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '2', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == set([ int(2**64 /2.) ]) + + def test_do_sourmash_compute_with_scaled(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') From 11fde2a0f49cc138af9159e5ad388bc110ebcaba Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:38:05 -0700 Subject: [PATCH 2/7] fix problem with args.scaled defaulting to None --- sourmash_lib/commands.py | 6 ++---- sourmash_lib/test_sourmash.py | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index cca44848ec..c017c21f35 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -186,8 +186,8 @@ def compute(args): def make_estimators(): seed = args.seed max_hash = 0 - if args.scaled: - max_hash = 2**64 / float(args.scaled) + if args.scaled and args.scaled > 1: + max_hash = int(round(2**64 / float(args.scaled), 0)) # one estimator for each ksize Elist = [] @@ -230,8 +230,6 @@ def save_siglist(siglist, output_fp, filename=None): with open(filename, 'w') as fp: sig.save_signatures(siglist, fp) - notify('Computing signature for ksizes: {}', str(ksizes)) - if args.with_cardinality: print('Calculating k-mer cardinality of input sequences.', file=sys.stderr) diff --git a/sourmash_lib/test_sourmash.py b/sourmash_lib/test_sourmash.py index e116e47bb5..2b549e01da 100644 --- a/sourmash_lib/test_sourmash.py +++ b/sourmash_lib/test_sourmash.py @@ -355,7 +355,7 @@ def test_do_sourmash_compute_with_scaled_1(): max_hashes = [ x.estimator.max_hash for x in siglist ] assert len(max_hashes) == 2 - assert set(max_hashes) == set(0) + assert set(max_hashes) == { 0 } def test_do_sourmash_compute_with_scaled_2(): From 97c0c061027559c74c9a645157c654f7a43d018a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:57:45 -0700 Subject: [PATCH 3/7] add MAX_HASH into ext module --- sourmash_lib/__init__.py | 5 +++-- sourmash_lib/_minhash.pyx | 13 +++++++++++++ sourmash_lib/commands.py | 10 ++++++---- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/sourmash_lib/__init__.py b/sourmash_lib/__init__.py index d5b85fe06d..7211f788c0 100644 --- a/sourmash_lib/__init__.py +++ b/sourmash_lib/__init__.py @@ -5,7 +5,7 @@ from __future__ import print_function import re import math -from ._minhash import MinHash +from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash khmer_available = False try: @@ -14,7 +14,8 @@ except ImportError: pass -DEFAULT_SEED=MinHash(1,1).seed +DEFAULT_SEED = get_minhash_default_seed() +MAX_HASH = get_minhash_max_hash() class Estimators(object): """ diff --git a/sourmash_lib/_minhash.pyx b/sourmash_lib/_minhash.pyx index 5fc02a8b66..2d5f0a1bd0 100644 --- a/sourmash_lib/_minhash.pyx +++ b/sourmash_lib/_minhash.pyx @@ -11,9 +11,22 @@ from libc.stdint cimport uint32_t from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur +# default MurmurHash seed cdef uint32_t MINHASH_DEFAULT_SEED = 42 +def get_minhash_default_seed(): + return MINHASH_DEFAULT_SEED + + +# we use the 64-bit hash space of MurmurHash only +cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1 + + +def get_minhash_max_hash(): + return MINHASH_MAX_HASH + + cdef bytes to_bytes(s): if not isinstance(s, (basestring, bytes)): raise TypeError("Requires a string-like sequence") diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index c017c21f35..457fb0a2c9 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -187,7 +187,8 @@ def make_estimators(): seed = args.seed max_hash = 0 if args.scaled and args.scaled > 1: - max_hash = int(round(2**64 / float(args.scaled), 0)) + max_hash = sourmash_lib.MAX_HASH / float(args.scaled) + max_hash = int(round(max_hash, 0)) # one estimator for each ksize Elist = [] @@ -716,7 +717,7 @@ def sbt_gather(args): orig_mins = orig_query.estimator.get_hashes() # calculate the band size/resolution R for the genome - R_metagenome = 2**64 / float(orig_query.estimator.max_hash) + R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.estimator.max_hash) # define a function to do a 'best' search and get only top match. def find_best(tree, query): @@ -763,7 +764,8 @@ def build_new_signature(mins): # based either on an explicit --scaled parameter, or by calculating # using genome size est --with-cardinality. if best_leaf.estimator.max_hash: - R_genome = 2**64 / float(best_leaf.estimator.max_hash) + R_genome = sourmash_lib.MAX_HASH / \ + float(best_leaf.estimator.max_hash) elif best_leaf.estimator.hll: genome_size = best_leaf.estimator.hll.estimate_cardinality() genome_max_hash = max(found_mins) @@ -777,7 +779,7 @@ def build_new_signature(mins): # pick the highest R / lowest resolution R_comparison = max(R_metagenome, R_genome) - new_max_hash = 2**64 / float(R_comparison) + new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison) query_mins = set([ i for i in query_mins if i < new_max_hash ]) found_mins = set([ i for i in found_mins if i < new_max_hash ]) orig_mins = set([ i for i in orig_mins if i < new_max_hash ]) From 9e48e4c1c66aa6beb62c51e0afdc3b4d6343f033 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:31:49 -0700 Subject: [PATCH 4/7] add tests for --scaled 1 and 2 --- tests/test_sourmash.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 63bb3c91f5..9f54e5042f 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -320,6 +320,44 @@ def test_do_sourmash_compute_multik_outfile(): assert 31 in ksizes +def test_do_sourmash_compute_with_scaled_1(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '1', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == set(0) + + +def test_do_sourmash_compute_with_scaled_2(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '2', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == set([ int(2**64 /2.) ]) + + def test_do_sourmash_compute_with_scaled(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -944,10 +982,10 @@ def test_sbt_gather_metagenome(): testdata_sigs = glob.glob(testdata_glob) query_sig = utils.get_test_data('gather/combined.sig') - + cmd = ['sbt_index', 'gcf_all', '-k', '21'] cmd.extend(testdata_sigs) - + status, out, err = utils.runscript('sourmash', cmd, in_directory=location) From 9c3c7b87b1566f9bd7aab710b4bf0b6bdcac1405 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:38:05 -0700 Subject: [PATCH 5/7] fix problem with args.scaled defaulting to None --- sourmash_lib/commands.py | 4 ++-- tests/test_sourmash.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index 9ac6fca61d..43e253b544 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -179,8 +179,8 @@ def compute(args): def make_estimators(): seed = args.seed max_hash = 0 - if args.scaled: - max_hash = 2**64 / float(args.scaled) + if args.scaled and args.scaled > 1: + max_hash = int(round(2**64 / float(args.scaled), 0)) # one estimator for each ksize Elist = [] diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9f54e5042f..e1472c5cfc 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -336,7 +336,7 @@ def test_do_sourmash_compute_with_scaled_1(): max_hashes = [ x.estimator.max_hash for x in siglist ] assert len(max_hashes) == 2 - assert set(max_hashes) == set(0) + assert set(max_hashes) == { 0 } def test_do_sourmash_compute_with_scaled_2(): From 54f462dfc0c3f1b61ed2c07aad3ccc86b47e646b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 9 Apr 2017 07:57:45 -0700 Subject: [PATCH 6/7] add MAX_HASH into ext module --- sourmash_lib/__init__.py | 6 ++++-- sourmash_lib/_minhash.pyx | 13 +++++++++++++ sourmash_lib/commands.py | 10 ++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/sourmash_lib/__init__.py b/sourmash_lib/__init__.py index 7e6d74f7bd..a706daffbf 100644 --- a/sourmash_lib/__init__.py +++ b/sourmash_lib/__init__.py @@ -5,6 +5,8 @@ from __future__ import print_function import re import math -from ._minhash import MinHash, dotproduct +from ._minhash import (MinHash, dotproduct, get_minhash_default_seed, + get_minhash_max_hash) -DEFAULT_SEED=MinHash(1,1).seed +DEFAULT_SEED = get_minhash_default_seed() +MAX_HASH = get_minhash_max_hash() diff --git a/sourmash_lib/_minhash.pyx b/sourmash_lib/_minhash.pyx index 082d35a264..292953d9aa 100644 --- a/sourmash_lib/_minhash.pyx +++ b/sourmash_lib/_minhash.pyx @@ -12,9 +12,22 @@ from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur import math +# default MurmurHash seed cdef uint32_t MINHASH_DEFAULT_SEED = 42 +def get_minhash_default_seed(): + return MINHASH_DEFAULT_SEED + + +# we use the 64-bit hash space of MurmurHash only +cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1 + + +def get_minhash_max_hash(): + return MINHASH_MAX_HASH + + cdef bytes to_bytes(s): if not isinstance(s, (basestring, bytes)): raise TypeError("Requires a string-like sequence") diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index 43e253b544..2a1ceadce3 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -180,7 +180,8 @@ def make_estimators(): seed = args.seed max_hash = 0 if args.scaled and args.scaled > 1: - max_hash = int(round(2**64 / float(args.scaled), 0)) + max_hash = sourmash_lib.MAX_HASH / float(args.scaled) + max_hash = int(round(max_hash, 0)) # one estimator for each ksize Elist = [] @@ -705,7 +706,7 @@ def sbt_gather(args): orig_mins = orig_query.estimator.get_hashes() # calculate the band size/resolution R for the genome - R_metagenome = 2**64 / float(orig_query.estimator.max_hash) + R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.estimator.max_hash) # define a function to do a 'best' search and get only top match. def find_best(tree, query): @@ -752,7 +753,8 @@ def build_new_signature(mins): # based either on an explicit --scaled parameter, or on genome # cardinality (deprecated) if best_leaf.estimator.max_hash: - R_genome = 2**64 / float(best_leaf.estimator.max_hash) + R_genome = sourmash_lib.MAX_HASH / \ + float(best_leaf.estimator.max_hash) elif best_leaf.estimator.hll: genome_size = best_leaf.estimator.hll.estimate_cardinality() genome_max_hash = max(found_mins) @@ -765,7 +767,7 @@ def build_new_signature(mins): # pick the highest R / lowest resolution R_comparison = max(R_metagenome, R_genome) - new_max_hash = 2**64 / float(R_comparison) + new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison) query_mins = set([ i for i in query_mins if i < new_max_hash ]) found_mins = set([ i for i in found_mins if i < new_max_hash ]) orig_mins = set([ i for i in orig_mins if i < new_max_hash ]) From 2324409aaaaff11f00cabf4334ca8bb943e2d82d Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Mon, 10 Apr 2017 18:41:26 +0000 Subject: [PATCH 7/7] Remove misleading notify --- sourmash_lib/commands.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index 2a1ceadce3..ab6aa4cdc4 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -223,8 +223,6 @@ def save_siglist(siglist, output_fp, filename=None): with open(filename, 'w') as fp: sig.save_signatures(siglist, fp) - notify('Computing signature for ksizes: {}', str(ksizes)) - if args.track_abundance: print('Tracking abundance of input k-mers.', file=sys.stderr)