Skip to content

Commit

Permalink
Merge pull request #156 from dib-lab/fix/scaled_1
Browse files Browse the repository at this point in the history
[MRG] Fix error when --scaled 1 is set.
  • Loading branch information
ctb committed Apr 10, 2017
2 parents 2dcedd7 + b2e26ee commit 213e9cb
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 42 deletions.
5 changes: 3 additions & 2 deletions sourmash_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import print_function
import re
import math
from ._minhash import MinHash, dotproduct
from ._minhash import (MinHash, get_minhash_default_seed, get_minhash_max_hash)

DEFAULT_SEED=MinHash(1,1).seed
DEFAULT_SEED = get_minhash_default_seed()
MAX_HASH = get_minhash_max_hash()
13 changes: 13 additions & 0 deletions sourmash_lib/_minhash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,22 @@ from ._minhash cimport KmerMinHash, KmerMinAbundance, _hash_murmur
import math


# default MurmurHash seed
cdef uint32_t MINHASH_DEFAULT_SEED = 42


def get_minhash_default_seed():
return MINHASH_DEFAULT_SEED


# we use the 64-bit hash space of MurmurHash only
cdef uint64_t MINHASH_MAX_HASH = 2**64 - 1


def get_minhash_max_hash():
return MINHASH_MAX_HASH


cdef bytes to_bytes(s):
if not isinstance(s, (basestring, bytes)):
raise TypeError("Requires a string-like sequence")
Expand Down
14 changes: 7 additions & 7 deletions sourmash_lib/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,9 @@ def compute(args):
def make_estimators():
seed = args.seed
max_hash = 0
if args.scaled:
max_hash = 2**64 / float(args.scaled)
if args.scaled and args.scaled > 1:
max_hash = sourmash_lib.MAX_HASH / float(args.scaled)
max_hash = int(round(max_hash, 0))

# one estimator for each ksize
Elist = []
Expand Down Expand Up @@ -222,8 +223,6 @@ def save_siglist(siglist, output_fp, filename=None):
with open(filename, 'w') as fp:
sig.save_signatures(siglist, fp)

notify('Computing signature for ksizes: {}', str(ksizes))

if args.track_abundance:
print('Tracking abundance of input k-mers.',
file=sys.stderr)
Expand Down Expand Up @@ -705,7 +704,7 @@ def sbt_gather(args):
orig_mins = orig_query.estimator.get_hashes()

# calculate the band size/resolution R for the genome
R_metagenome = 2**64 / float(orig_query.estimator.max_hash)
R_metagenome = sourmash_lib.MAX_HASH / float(orig_query.estimator.max_hash)

# define a function to do a 'best' search and get only top match.
def find_best(tree, query):
Expand Down Expand Up @@ -752,7 +751,8 @@ def build_new_signature(mins):
# based either on an explicit --scaled parameter, or on genome
# cardinality (deprecated)
if best_leaf.estimator.max_hash:
R_genome = 2**64 / float(best_leaf.estimator.max_hash)
R_genome = sourmash_lib.MAX_HASH / \
float(best_leaf.estimator.max_hash)
elif best_leaf.estimator.hll:
genome_size = best_leaf.estimator.hll.estimate_cardinality()
genome_max_hash = max(found_mins)
Expand All @@ -765,7 +765,7 @@ def build_new_signature(mins):
# pick the highest R / lowest resolution
R_comparison = max(R_metagenome, R_genome)

new_max_hash = 2**64 / float(R_comparison)
new_max_hash = sourmash_lib.MAX_HASH / float(R_comparison)
query_mins = set([ i for i in query_mins if i < new_max_hash ])
found_mins = set([ i for i in found_mins if i < new_max_hash ])
orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
Expand Down
73 changes: 40 additions & 33 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,44 @@ def test_do_sourmash_compute_multik_outfile():
assert 31 in ksizes


def test_do_sourmash_compute_with_scaled_1():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
outfile = os.path.join(location, 'FOO.xxx')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21,31',
'--scaled', '1',
testdata1, '-o', outfile],
in_directory=location)
assert os.path.exists(outfile)

siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.estimator.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == { 0 }


def test_do_sourmash_compute_with_scaled_2():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
outfile = os.path.join(location, 'FOO.xxx')
status, out, err = utils.runscript('sourmash',
['compute', '-k', '21,31',
'--scaled', '2',
testdata1, '-o', outfile],
in_directory=location)
assert os.path.exists(outfile)

siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.estimator.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /2.) ])


def test_do_sourmash_compute_with_scaled():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
Expand Down Expand Up @@ -944,10 +982,10 @@ def test_sbt_gather_metagenome():
testdata_sigs = glob.glob(testdata_glob)

query_sig = utils.get_test_data('gather/combined.sig')

cmd = ['sbt_index', 'gcf_all', '-k', '21']
cmd.extend(testdata_sigs)

status, out, err = utils.runscript('sourmash', cmd,
in_directory=location)

Expand Down Expand Up @@ -995,37 +1033,6 @@ def test_sbt_gather_error_no_cardinality_query():
assert "query signature needs to be created with --scaled" in err


def test_sbt_gather_error_no_cardinality():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
status, out, err = utils.runscript('sourmash',
['compute', testdata1, testdata2],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['compute', testdata2,
'--scaled', '10',
'-o', 'query.fa.sig'],
in_directory=location)

status, out, err = utils.runscript('sourmash',
['sbt_index', 'zzz',
'short.fa.sig',
'short2.fa.sig'],
in_directory=location)

assert os.path.exists(os.path.join(location, 'zzz.sbt.json'))

status, out, err = utils.runscript('sourmash',
['sbt_gather', 'zzz',
'query.fa.sig'],
in_directory=location,
fail_ok=True)
assert status == -1, (status, out, err)
assert "Best hash match in sbt_gather has no cardinality" in err


def test_sbt_categorize():
with utils.TempDirectory() as location:
testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
Expand Down

0 comments on commit 213e9cb

Please sign in to comment.