Skip to content

Commit

Permalink
[MRG] Fix max hash deprecation warnings (#1301)
Browse files Browse the repository at this point in the history
* swizzle property max_hash to _max_hash

* replace use of max_hash with scaled

* remove max_hash terminology

* clean up and refactor some of the stale max_hash stuff in minhash

* update screed req to v1.0.5

Co-authored-by: Luiz Irber <luizirber@users.noreply.github.com>
  • Loading branch information
ctb and luizirber authored Feb 5, 2021
1 parent 3c1241b commit a49c997
Show file tree
Hide file tree
Showing 10 changed files with 46 additions and 44 deletions.
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ packages = find:
platforms = any
include_package_data = True
install_requires =
screed>=1.0
screed>=1.0.5
cffi>=1.14.0
numpy
matplotlib
Expand Down
12 changes: 6 additions & 6 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ def compare(args):
notify('loaded {} signatures total.'.format(len(siglist)))

# check to make sure they're potentially compatible - either using
# max_hash/scaled, or not.
scaled_sigs = [s.minhash.max_hash for s in siglist]
# scaled, or not.
scaled_sigs = [s.minhash.scaled for s in siglist]
is_scaled = all(scaled_sigs)
is_scaled_2 = any(scaled_sigs)

Expand Down Expand Up @@ -427,7 +427,7 @@ def search(args):

# downsample if requested
if args.scaled:
if query.minhash.max_hash == 0:
if not query.minhash.scaled:
error('cannot downsample a signature not created with --scaled')
sys.exit(-1)

Expand Down Expand Up @@ -581,7 +581,7 @@ def gather(args):
sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.scaled == 0:
if not query.minhash.scaled:
error('query signature needs to be created with --scaled')
sys.exit(-1)

Expand Down Expand Up @@ -609,7 +609,7 @@ def gather(args):

found = []
weighted_missed = 1
new_max_hash = query.minhash.max_hash
new_max_hash = query.minhash._max_hash
next_query = query

for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
Expand Down Expand Up @@ -726,7 +726,7 @@ def multigather(args):
query.minhash.ksize, sourmash_args.get_moltype(query))

# verify signature was computed right.
if query.minhash.max_hash == 0:
if not query.minhash.scaled:
error('query signature needs to be created with --scaled; skipping')
continue

Expand Down
10 changes: 5 additions & 5 deletions src/sourmash/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def compare_serial(siglist, ignore_abundance, downsample=False):
If the sketches are abundance weighted, calculate the angular
similarity.
:param boolean downsample by max_hash if True
:param boolean downsample by scaled if True
:return: np.array similarity matrix
"""
import numpy as np
Expand All @@ -45,7 +45,7 @@ def compare_serial_containment(siglist, downsample=False):
process. Best to use when there are few signatures.
:param list siglist: list of signatures to compare
:param boolean downsample by max_hash if True
:param boolean downsample by scaled if True
:return: np.array similarity matrix
"""
import numpy as np
Expand Down Expand Up @@ -83,7 +83,7 @@ def get_similarities_at_index(index, ignore_abundance, downsample, siglist):
If the sketches are abundance weighted, calculate the angular
similarity.
:param boolean downsample by max_hash if True
:param boolean downsample by scaled if True
:param siglist list of signatures
:return: list of similarities for the combinations of signature at index
with rest of the signatures from index+1
Expand Down Expand Up @@ -114,7 +114,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs):
If the sketches are abundance weighted, calculate the angular
similarity.
:param boolean downsample by max_hash if True
:param boolean downsample by scaled if True
:param int n_jobs number of processes to run the similarity calculations on
:return: np.array similarity matrix
"""
Expand Down Expand Up @@ -194,7 +194,7 @@ def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None):
If the sketches are abundance weighted, calculate the angular
similarity.
:param boolean downsample by max_hash if True
:param boolean downsample by scaled if True
:param int n_jobs number of processes to run the similarity calculations on,
if number of jobs is None or 1, compare serially, otherwise in parallel.
:return: np.array similarity matrix
Expand Down
19 changes: 10 additions & 9 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def __copy__(self):
hp=self.hp,
track_abundance=self.track_abundance,
seed=self.seed,
max_hash=self.max_hash,
max_hash=self._max_hash,
)
a.merge(self)
return a
Expand All @@ -233,7 +233,7 @@ def __getstate__(self):
self.hashes,
None,
self.track_abundance,
self.max_hash,
self._max_hash,
self.seed,
)

Expand Down Expand Up @@ -274,7 +274,7 @@ def copy_and_clear(self):
self.hp,
self.track_abundance,
self.seed,
self.max_hash,
self._max_hash,
)
return a

Expand Down Expand Up @@ -401,6 +401,11 @@ def ksize(self):
def max_hash(self):
return self._methodcall(lib.kmerminhash_max_hash)

# a non-deprecated `max_hash` property for internal use and testing only
@property
def _max_hash(self):
return self._methodcall(lib.kmerminhash_max_hash)

@property
def track_abundance(self):
return self._methodcall(lib.kmerminhash_track_abundance)
Expand Down Expand Up @@ -458,11 +463,7 @@ def downsample(self, num=None, scaled=None):
elif scaled is not None:
if self.num:
raise ValueError("num != 0 - cannot downsample a standard MinHash")
max_hash = self.max_hash
if max_hash is None:
raise ValueError("no max_hash available - cannot downsample")

old_scaled = _get_scaled_for_max_hash(self.max_hash)
old_scaled = self.scaled
if old_scaled > scaled:
raise ValueError(
"new scaled {} is lower than current sample scaled {}".format(
Expand Down Expand Up @@ -492,7 +493,7 @@ def flatten(self):
# create new object:
a = MinHash(
self.num, self.ksize, self.is_protein, self.dayhoff, self.hp,
False, self.seed, self.max_hash
False, self.seed, self._max_hash
)
a.add_many(self)

Expand Down
18 changes: 9 additions & 9 deletions tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def test_scaled(track_abundance):
scaled = _get_scaled_for_max_hash(35)
print('XX', scaled, _get_max_hash_for_scaled(scaled))
mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
assert mh.max_hash == 35
assert mh._max_hash == 35

mh.add_hash(10)
mh.add_hash(20)
Expand Down Expand Up @@ -1177,7 +1177,7 @@ def test_mh_copy_and_clear(track_abundance):
b = a.copy_and_clear()
assert a.ksize == b.ksize
assert b.num == a.num
assert b.max_hash == 0
assert b._max_hash == 0
assert not b.is_protein
assert b.track_abundance == track_abundance
assert b.seed == a.seed
Expand All @@ -1196,7 +1196,7 @@ def test_mh_copy_and_clear_with_max_hash(track_abundance):
b = a.copy_and_clear()
assert a.ksize == b.ksize
assert b.num == a.num
assert b.max_hash == 20
assert b._max_hash == 20
assert not b.is_protein
assert b.track_abundance == track_abundance
assert b.seed == a.seed
Expand All @@ -1220,8 +1220,8 @@ def test_pickle_max_hash(track_abundance):
b = pickle.loads(pickle.dumps(a))
assert a.ksize == b.ksize
assert b.num == a.num
assert b.max_hash == a.max_hash
assert b.max_hash == 20
assert b._max_hash == a._max_hash
assert b._max_hash == 20
assert not b.is_protein
assert b.track_abundance == track_abundance
assert b.seed == a.seed
Expand All @@ -1239,8 +1239,8 @@ def test_pickle_scaled(track_abundance):
b = pickle.loads(pickle.dumps(a))
assert a.ksize == b.ksize
assert b.num == a.num
assert b.max_hash == a.max_hash
assert b.max_hash == 20
assert b._max_hash == a._max_hash
assert b._max_hash == 20
assert not b.is_protein
assert b.track_abundance == track_abundance
assert b.seed == a.seed
Expand Down Expand Up @@ -1401,7 +1401,7 @@ def test_flatten():
# test behavior with scaled
scaled = _get_scaled_for_max_hash(35)
mh = MinHash(0, 4, track_abundance=True, scaled=scaled)
assert mh.max_hash == 35
assert mh._max_hash == 35

mh.add_hash(10)
mh.add_hash(10)
Expand Down Expand Up @@ -1512,7 +1512,7 @@ def test_downsample_scaled(track_abundance):
assert list(sorted(mh.hashes)) == list(mins)

mh2 = mh.downsample(scaled=2)
print(mh.max_hash, mh2.max_hash)
print(mh._max_hash, mh2._max_hash)

assert len(mh2) == 3
assert list(sorted(mh2.hashes)) == list(mins[:3])
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,7 +1180,7 @@ def test_sig_downsample_2_num_to_scaled(c):
actual_mins = actual_downsample_sig.minhash.hashes.keys()

# select those mins that are beneath the new max hash...
max_hash = actual_downsample_sig.minhash.max_hash
max_hash = actual_downsample_sig.minhash._max_hash
test_mins_down = { k for k in test_mins if k < max_hash }
assert test_mins_down == set(actual_mins)

Expand Down
3 changes: 2 additions & 1 deletion tests/test_jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ def test_pickle(track_abundance):
assert e1.num == e2.num
assert e1.ksize == e2.ksize
assert e1.is_protein == e2.is_protein
assert e1.max_hash == e2.max_hash
assert e1.scaled == e2.scaled
assert e1.scaled == 0
assert e1.seed == e2.seed


Expand Down
4 changes: 2 additions & 2 deletions tests/test_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def test_roundtrip_empty(track_abundance):
assert sig2.similarity(sig) == 0


def test_roundtrip_max_hash(track_abundance):
def test_roundtrip_scaled(track_abundance):
e = sourmash.MinHash(n=0, ksize=20, track_abundance=track_abundance,
max_hash=10)
e.add_hash(5)
Expand All @@ -139,7 +139,7 @@ def test_roundtrip_max_hash(track_abundance):
sig2 = siglist[0]
e2 = sig2.minhash

assert e.max_hash == e2.max_hash
assert e.scaled == e2.scaled

assert sig.similarity(sig2) == 1.0
assert sig2.similarity(sig) == 1.0
Expand Down
10 changes: 5 additions & 5 deletions tests/test_sourmash_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,9 +580,9 @@ def test_do_sourmash_compute_with_scaled_1():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == { sourmash.MAX_HASH }
scaled_vals = [ x.minhash.scaled for x in siglist ]
assert len(scaled_vals) == 2
assert set(scaled_vals) == { 1 }


def test_do_sourmash_compute_with_scaled_2():
Expand All @@ -599,7 +599,7 @@ def test_do_sourmash_compute_with_scaled_2():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
max_hashes = [ x.minhash._max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /2.) ])

Expand All @@ -618,7 +618,7 @@ def test_do_sourmash_compute_with_scaled():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
max_hashes = [ x.minhash._max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /100.) ])

Expand Down
10 changes: 5 additions & 5 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,9 +638,9 @@ def test_do_sourmash_sketchdna_with_scaled_1():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == { sourmash.MAX_HASH }
scaled_vals = [ x.minhash.scaled for x in siglist ]
assert len(scaled_vals) == 2
assert set(scaled_vals) == { 1 }


def test_do_sourmash_sketchdna_with_scaled_2():
Expand All @@ -657,7 +657,7 @@ def test_do_sourmash_sketchdna_with_scaled_2():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
max_hashes = [ x.minhash._max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /2.) ])

Expand All @@ -676,7 +676,7 @@ def test_do_sourmash_sketchdna_with_scaled():
siglist = list(signature.load_signatures(outfile))
assert len(siglist) == 2

max_hashes = [ x.minhash.max_hash for x in siglist ]
max_hashes = [ x.minhash._max_hash for x in siglist ]
assert len(max_hashes) == 2
assert set(max_hashes) == set([ int(2**64 /100.) ])

Expand Down

0 comments on commit a49c997

Please sign in to comment.