diff --git a/sourmash_lib/__init__.py b/sourmash_lib/__init__.py index 7e6d74f7bd..2be1f92719 100644 --- a/sourmash_lib/__init__.py +++ b/sourmash_lib/__init__.py @@ -5,6 +5,7 @@ from __future__ import print_function import re import math -from ._minhash import MinHash, dotproduct +from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash) -DEFAULT_SEED=MinHash(1,1).seed +DEFAULT_SEED = get_minhash_default_seed() +MAX_HASH = get_minhash_max_hash() diff --git a/sourmash_lib/_minhash.pyx b/sourmash_lib/_minhash.pyx index 082d35a264..292953d9aa 100644 --- a/sourmash_lib/_minhash.pyx +++ b/sourmash_lib/_minhash.pyx @@ -12,9 +12,22 @@ from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur import math +# default MurmurHash seed cdef uint32_t MINHASH_DEFAULT_SEED = 42 +def get_minhash_default_seed(): + return MINHASH_DEFAULT_SEED + + +# we use the 64-bit hash space of MurmurHash only +cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1 + + +def get_minhash_max_hash(): + return MINHASH_MAX_HASH + + cdef bytes to_bytes(s): if not isinstance(s, (basestring, bytes)): raise TypeError("Requires a string-like sequence") diff --git a/sourmash_lib/commands.py b/sourmash_lib/commands.py index 9ac6fca61d..ab6aa4cdc4 100644 --- a/sourmash_lib/commands.py +++ b/sourmash_lib/commands.py @@ -179,8 +179,9 @@ def compute(args): def make_estimators(): seed = args.seed max_hash = 0 - if args.scaled: - max_hash = 2**64 / float(args.scaled) + if args.scaled and args.scaled > 1: + max_hash = sourmash_lib.MAX_HASH / float(args.scaled) + max_hash = int(round(max_hash, 0)) # one estimator for each ksize Elist = [] @@ -222,8 +223,6 @@ def save_siglist(siglist, output_fp, filename=None): with open(filename, 'w') as fp: sig.save_signatures(siglist, fp) - notify('Computing signature for ksizes: {}', str(ksizes)) - if args.track_abundance: print('Tracking abundance of input k-mers.', file=sys.stderr) @@ -705,7 +704,7 @@ def sbt_gather(args): orig_mins = orig_query.estimator.get_hashes() # calculate the band size/resolution R for the genome - R_metagenome = 2**64 / float(orig_query.estimator.max_hash) + R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.estimator.max_hash) # define a function to do a 'best' search and get only top match. def find_best(tree, query): @@ -752,7 +751,8 @@ def build_new_signature(mins): # based either on an explicit --scaled parameter, or on genome # cardinality (deprecated) if best_leaf.estimator.max_hash: - R_genome = 2**64 / float(best_leaf.estimator.max_hash) + R_genome = sourmash_lib.MAX_HASH / \ + float(best_leaf.estimator.max_hash) elif best_leaf.estimator.hll: genome_size = best_leaf.estimator.hll.estimate_cardinality() genome_max_hash = max(found_mins) @@ -765,7 +765,7 @@ def build_new_signature(mins): # pick the highest R / lowest resolution R_comparison = max(R_metagenome, R_genome) - new_max_hash = 2**64 / float(R_comparison) + new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison) query_mins = set([ i for i in query_mins if i < new_max_hash ]) found_mins = set([ i for i in found_mins if i < new_max_hash ]) orig_mins = set([ i for i in orig_mins if i < new_max_hash ]) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 63bb3c91f5..a1077f6f1e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -320,6 +320,44 @@ def test_do_sourmash_compute_multik_outfile(): assert 31 in ksizes +def test_do_sourmash_compute_with_scaled_1(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '1', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == { 0 } + + +def test_do_sourmash_compute_with_scaled_2(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + outfile = os.path.join(location, 'FOO.xxx') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '21,31', + '--scaled', '2', + testdata1, '-o', outfile], + in_directory=location) + assert os.path.exists(outfile) + + siglist = list(signature.load_signatures(outfile)) + assert len(siglist) == 2 + + max_hashes = [ x.estimator.max_hash for x in siglist ] + assert len(max_hashes) == 2 + assert set(max_hashes) == set([ int(2**64 /2.) ]) + + def test_do_sourmash_compute_with_scaled(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -944,10 +982,10 @@ def test_sbt_gather_metagenome(): testdata_sigs = glob.glob(testdata_glob) query_sig = utils.get_test_data('gather/combined.sig') - + cmd = ['sbt_index', 'gcf_all', '-k', '21'] cmd.extend(testdata_sigs) - + status, out, err = utils.runscript('sourmash', cmd, in_directory=location) @@ -995,37 +1033,6 @@ def test_sbt_gather_error_no_cardinality_query(): assert "query signature needs to be created with --scaled" in err -def test_sbt_gather_error_no_cardinality(): - with utils.TempDirectory() as location: - testdata1 = utils.get_test_data('short.fa') - testdata2 = utils.get_test_data('short2.fa') - status, out, err = utils.runscript('sourmash', - ['compute', testdata1, testdata2], - in_directory=location) - - status, out, err = utils.runscript('sourmash', - ['compute', testdata2, - '--scaled', '10', - '-o', 'query.fa.sig'], - in_directory=location) - - status, out, err = utils.runscript('sourmash', - ['sbt_index', 'zzz', - 'short.fa.sig', - 'short2.fa.sig'], - in_directory=location) - - assert os.path.exists(os.path.join(location, 'zzz.sbt.json')) - - status, out, err = utils.runscript('sourmash', - ['sbt_gather', 'zzz', - 'query.fa.sig'], - in_directory=location, - fail_ok=True) - assert status == -1, (status, out, err) - assert "Best hash match in sbt_gather has no cardinality" in err - - def test_sbt_categorize(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')