Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Fix error when --scaled 1 is set. #156

Merged
merged 11 commits into from
Apr 10, 2017
5 changes: 3 additions & 2 deletions sourmash_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import print_function
import re
import math
from ._minhash import MinHash, dotproduct
from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash)

DEFAULT_SEED=MinHash(1,1).seed
DEFAULT_SEED = get_minhash_default_seed()
MAX_HASH = get_minhash_max_hash()
13 changes: 13 additions & 0 deletions sourmash_lib/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,22 @@ from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur
import math


# default MurmurHash seed
cdef uint32_t MINHASH_DEFAULT_SEED = 42


def get_minhash_default_seed():
return MINHASH_DEFAULT_SEED


# we use the 64-bit hash space of MurmurHash only
cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1


def get_minhash_max_hash():
return MINHASH_MAX_HASH


cdef bytes to_bytes(s):
if not isinstance(s, (basestring, bytes)):
raise TypeError("Requires a string-like sequence")
Expand Down
14 changes: 7 additions & 7 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,9 @@ def compute(args):
def make_estimators():
seed = args.seed
max_hash = 0
if args.scaled:
max_hash = 2**64 / float(args.scaled)
if args.scaled and args.scaled > 1:
max_hash = sourmash_lib.MAX_HASH / float(args.scaled)
max_hash = int(round(max_hash, 0))

# one estimator for each ksize
Elist = []
Expand Down Expand Up @@ -222,8 +223,6 @@ def save_siglist(siglist, output_fp, filename=None):
with open(filename, 'w') as fp:
sig.save_signatures(siglist, fp)

notify('Computing signature for ksizes: {}', str(ksizes))

if args.track_abundance:
print('Tracking abundance of input k-mers.',
file=sys.stderr)
Expand Down Expand Up @@ -705,7 +704,7 @@ def sbt_gather(args):
orig_mins = orig_query.estimator.get_hashes()

# calculate the band size/resolution R for the genome
R_metagenome = 2**64 / float(orig_query.estimator.max_hash)
R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.estimator.max_hash)

# define a function to do a 'best' search and get only top match.
def find_best(tree, query):
Expand Down Expand Up @@ -752,7 +751,8 @@ def build_new_signature(mins):
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
if best_leaf.estimator.max_hash:
R_genome = 2**64 / float(best_leaf.estimator.max_hash)
R_genome = sourmash_lib.MAX_HASH / \
float(best_leaf.estimator.max_hash)
elif best_leaf.estimator.hll:
genome_size = best_leaf.estimator.hll.estimate_cardinality()
genome_max_hash = max(found_mins)
Expand All @@ -765,7 +765,7 @@ def build_new_signature(mins):
# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)

new_max_hash = 2**64 / float(R_comparison)
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
Expand Down
73 changes: 40 additions & 33 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,44 @@ def test_do_sourmash_compute_multik_outfile():
assert 31 in ksizes


def test_do_sourmash_compute_with_scaled_1():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
outfile = os.path.join(location, 'FOO.xxx')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21,31',
'--scaled', '1',
testdata1, '-o', outfile],
in_directory=location)
assert os.path.exists(outfile)

siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.estimator.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == { 0 }


def test_do_sourmash_compute_with_scaled_2():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
outfile = os.path.join(location, 'FOO.xxx')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21,31',
'--scaled', '2',
testdata1, '-o', outfile],
in_directory=location)
assert os.path.exists(outfile)

siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.estimator.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /2.) ])


def test_do_sourmash_compute_with_scaled():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -944,10 +982,10 @@ def test_sbt_gather_metagenome():
testdata_sigs = glob.glob(testdata_glob)

query_sig = utils.get_test_data('gather/combined.sig')

cmd = ['sbt_index', 'gcf_all', '-k', '21']
cmd.extend(testdata_sigs)

status, out, err = utils.runscript('sourmash', cmd,
in_directory=location)

Expand Down Expand Up @@ -995,37 +1033,6 @@ def test_sbt_gather_error_no_cardinality_query():
assert "query signature needs to be created with --scaled" in err


def test_sbt_gather_error_no_cardinality():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
status, out, err = utils.runscript('sourmash',
['compute', testdata1, testdata2],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['compute', testdata2,
'--scaled', '10',
'-o', 'query.fa.sig'],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['sbt_index', 'zzz',
'short.fa.sig',
'short2.fa.sig'],
in_directory=location)

assert os.path.exists(os.path.join(location, 'zzz.sbt.json'))

status, out, err = utils.runscript('sourmash',
['sbt_gather', 'zzz',
'query.fa.sig'],
in_directory=location,
fail_ok=True)
assert status == -1, (status, out, err)
assert "Best hash match in sbt_gather has no cardinality" in err


def test_sbt_categorize():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
Expand Down