Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] define get_mins in terms of .hashes, not the other way around #1154

Merged
merged 5 commits into from
Aug 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions doc/api-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ First, load two signatures:
Then, get the hashes, and (e.g.) compute the union:

```
>>> hashes1 = set(sig1.minhash.get_mins())
>>> hashes2 = set(sig2.minhash.get_mins())
>>> hashes1 = set(sig1.minhash.hashes.keys())
>>> hashes2 = set(sig2.minhash.hashes.keys())
>>> hash_union = hashes1.union(hashes2)
>>> print('{} hashes in union of {} and {}'.format(len(hash_union), len(hashes1), len(hashes2)))
1000 hashes in union of 500 and 500
Expand Down Expand Up @@ -242,7 +242,7 @@ sections.
MinHash objects have the following methods and attributes:

* `ksize`, `num`, and `scaled` - the basic parameters used to create a MinHash object.
* `get_mins()` - retrieve all of the hashes contained in this object.
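* `hashes` - a read-only mapping of all of the hashes contained in this object; keys are the hash values and values are their abundances (always 1 when abundance is not tracked). Use `hashes.keys()` to get just the hash values.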
* `add_sequence(seq)` - hash sequence and add hash values.
* `add(hash)` and `add_many(hashvals)` - add hash values directly.
* `similarity(other)` - calculate Jaccard similarity with the other MinHash object.
Expand Down Expand Up @@ -279,7 +279,7 @@ We can downsample this to 500 by extracting the hashes and using
`add_many` to add them to a new MinHash like so:

```
>>> hashvals = larger.get_mins()
>>> hashvals = larger.hashes.keys()
>>> smaller = sourmash.MinHash(n=500, ksize=31)
>>> smaller.add_many(hashvals)
>>> len(smaller)
Expand All @@ -304,7 +304,7 @@ The same can be done with scaled MinHashes:
>>> len(large_scaled)
459
>>> small_scaled = sourmash.MinHash(n=0, ksize=31, scaled=500)
>>> small_scaled.add_many(large_scaled.get_mins())
>>> small_scaled.add_many(large_scaled.hashes.keys())
>>> len(small_scaled)
69

Expand Down Expand Up @@ -341,7 +341,7 @@ your MinHash, and then extract the hash values:
```
>>> num_mh = sourmash.MinHash(n=1000, ksize=31)
>>> num_mh.add_sequence(sequence)
>>> hashvals = num_mh.get_mins()
>>> hashvals = num_mh.hashes.keys()

```

Expand All @@ -359,7 +359,7 @@ The same works in reverse, of course:
```
>>> scaled_mh = sourmash.MinHash(n=0, ksize=31, scaled=50)
>>> scaled_mh.add_sequence(sequence)
>>> hashvals = scaled_mh.get_mins()
>>> hashvals = scaled_mh.hashes.keys()
>>> num_mh = sourmash.MinHash(n=500, ksize=31)
>>> num_mh.add_many(hashvals)

Expand Down
46 changes: 24 additions & 22 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import math
import copy
import collections
from collections.abc import Mapping

from . import VERSION
from ._lowlevel import ffi, lib
Expand Down Expand Up @@ -82,7 +82,7 @@ def translate_codon(codon):
raise ValueError(e.message)


class _HashesWrapper(collections.Mapping):
class _HashesWrapper(Mapping):
"A read-only view of the hashes contained by a MinHash object."
def __init__(self, h):
self._data = h
Expand Down Expand Up @@ -214,7 +214,7 @@ def __getstate__(self):
self.is_protein,
self.dayhoff,
self.hp,
self.get_mins(with_abundance=self.track_abundance),
self.hashes,
None,
self.track_abundance,
self.max_hash,
Expand Down Expand Up @@ -290,39 +290,41 @@ def get_mins(self, with_abundance=False):
"""Return list of hashes or if ``with_abundance`` a list
of (hash, abund).
"""
mins = self.hashes
if not with_abundance:
return mins.keys()
return mins


@deprecated(deprecated_in="3.5", removed_in="5.0",
ctb marked this conversation as resolved.
Show resolved Hide resolved
current_version=VERSION,
details='Use .hashes property instead.')
def get_hashes(self):
"Return the list of hashes."
return self.hashes.keys()

@property
def hashes(self):
size = ffi.new("uintptr_t *")
mins_ptr = self._methodcall(lib.kmerminhash_get_mins, size)
size = size[0]

try:
if with_abundance and self.track_abundance:
if self.track_abundance:
size_abunds = ffi.new("uintptr_t *")
abunds_ptr = self._methodcall(lib.kmerminhash_get_abunds, size_abunds)
size_abunds = size_abunds[0]
assert size == size_abunds
result = dict(zip(ffi.unpack(mins_ptr, size), ffi.unpack(abunds_ptr, size)))
lib.kmerminhash_slice_free(abunds_ptr, size)
return _HashesWrapper(result)
else:
result = ffi.unpack(mins_ptr, size)
d = ffi.unpack(mins_ptr, size)
return _HashesWrapper({ k : 1 for k in d })

finally:
lib.kmerminhash_slice_free(mins_ptr, size)

return result

@deprecated(deprecated_in="3.5", removed_in="5.0",
ctb marked this conversation as resolved.
Show resolved Hide resolved
current_version=VERSION,
details='Use .hashes property instead.')
def get_hashes(self):
"Return the list of hashes."
return self.get_mins()

@property
def hashes(self):
if self.track_abundance:
return _HashesWrapper(self.get_mins(with_abundance=True))
else:
d = self.get_mins()
return _HashesWrapper({ k : 1 for k in d })

@property
def seed(self):
Expand Down Expand Up @@ -446,7 +448,7 @@ def downsample(self, num=None, scaled=None):
)
# copy over hashes:
if self.track_abundance:
a.set_abundances(self.get_mins(with_abundance=True))
a.set_abundances(self.hashes)
else:
a.add_many(self)

Expand Down
6 changes: 3 additions & 3 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
# track original query information for later usage.
track_abundance = query.minhash.track_abundance and not ignore_abundance
orig_query_mh = query.minhash
orig_query_mins = orig_query_mh.get_hashes()
orig_query_mins = orig_query_mh.hashes.keys()

# do we pay attention to abundances?
orig_query_abunds = { k: 1 for k in orig_query_mins }
Expand All @@ -137,8 +137,8 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
break

# subtract found hashes from search hashes, construct new search
query_mins = set(query.minhash.get_hashes())
found_mins = best_match.minhash.get_hashes()
query_mins = set(query.minhash.hashes.keys())
found_mins = best_match.minhash.hashes.keys()

# Is the best match computed with scaled? Die if not.
match_scaled = best_match.minhash.scaled
Expand Down
16 changes: 9 additions & 7 deletions tests/test__minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -1459,12 +1459,13 @@ def test_get_mins_deprecated(track_abundance):
mh.add_many(mins)
mh.add_many(mins)

assert set(mh.get_mins()) == set(mins)
if track_abundance:
d = mh.get_mins(with_abundance=True)
for k in mins:
assert d[k] == 4
assert len(d) == len(mins)
with pytest.warns(DeprecationWarning):
assert set(mh.get_mins()) == set(mins)
if track_abundance:
d = mh.get_mins(with_abundance=True)
for k in mins:
assert d[k] == 4
assert len(d) == len(mins)


def test_get_hashes_deprecated(track_abundance):
Expand All @@ -1476,7 +1477,8 @@ def test_get_hashes_deprecated(track_abundance):
mh.add_many(mins)
mh.add_many(mins)

assert set(mh.get_hashes()) == set(mins)
with pytest.warns(DeprecationWarning):
assert set(mh.get_hashes()) == set(mins)


def test_downsample_num(track_abundance):
Expand Down
4 changes: 2 additions & 2 deletions tests/test__minhash_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def test_set_abundance_num_hypothesis(hashes, abundances, sketch_size):

a.set_abundances(oracle)

mins = a.get_mins(with_abundance=True)
mins = a.hashes
size = min(sum(1 for v in oracle.values() if v > 0), sketch_size)
assert len(mins) == size

Expand All @@ -38,7 +38,7 @@ def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
max_hash = _get_max_hash_for_scaled(scaled)
below_max_hash = sum(1 for (k, v) in oracle.items() if k <= max_hash and v > 0)

mins = a.get_mins(with_abundance=True)
mins = a.hashes
assert len(mins) == below_max_hash

for k, v in mins.items():
Expand Down
36 changes: 18 additions & 18 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,11 @@ def test_sig_filter_2(c):
filtered_sig = sourmash.load_one_signature(out)
test_sig = sourmash.load_one_signature(sig47)

abunds = test_sig.minhash.get_mins(True)
abunds = test_sig.minhash.hashes
abunds = { k: v for (k, v) in abunds.items() if v >= 2 and v <= 5 }
assert abunds

assert filtered_sig.minhash.get_mins(True) == abunds
assert filtered_sig.minhash.hashes == abunds


@utils.in_tempdir
Expand All @@ -228,11 +228,11 @@ def test_sig_filter_3(c):
filtered_sig = sourmash.load_one_signature(out)
test_sig = sourmash.load_one_signature(sig47)

abunds = test_sig.minhash.get_mins(True)
abunds = test_sig.minhash.hashes
abunds = { k: v for (k, v) in abunds.items() if v >= 2 }
assert abunds

assert filtered_sig.minhash.get_mins(True) == abunds
assert filtered_sig.minhash.hashes == abunds


@utils.in_tempdir
Expand All @@ -247,11 +247,11 @@ def test_sig_filter_3_ksize_select(c):
filtered_sig = sourmash.load_one_signature(out)
test_sig = sourmash.load_one_signature(psw_mag, ksize=31)

abunds = test_sig.minhash.get_mins(True)
abunds = test_sig.minhash.hashes
abunds = { k: v for (k, v) in abunds.items() if v >= 2 }
assert abunds

assert filtered_sig.minhash.get_mins(True) == abunds
assert filtered_sig.minhash.hashes == abunds


@utils.in_tempdir
Expand Down Expand Up @@ -356,8 +356,8 @@ def test_sig_intersect_3(c):
# actually do an intersection ourselves for the test
mh47 = sourmash.load_one_signature(sig47).minhash
mh63 = sourmash.load_one_signature(sig63).minhash
mh47_abunds = mh47.get_mins(with_abundance=True)
mh63_mins = set(mh63.get_mins())
mh47_abunds = mh47.hashes
mh63_mins = set(mh63.hashes.keys())

# get the set of mins that are in common
mh63_mins.intersection_update(mh47_abunds)
Expand Down Expand Up @@ -388,8 +388,8 @@ def test_sig_intersect_4(c):
# actually do an intersection ourselves for the test
mh47 = sourmash.load_one_signature(sig47).minhash
mh63 = sourmash.load_one_signature(sig63).minhash
mh47_abunds = mh47.get_mins(with_abundance=True)
mh63_mins = set(mh63.get_mins())
mh47_abunds = mh47.hashes
mh63_mins = set(mh63.hashes.keys())

# get the set of mins that are in common
mh63_mins.intersection_update(mh47_abunds)
Expand Down Expand Up @@ -486,10 +486,10 @@ def test_sig_subtract_1(c):
test2_sig = sourmash.load_one_signature(sig63)
actual_subtract_sig = sourmash.load_one_signature(out)

mins = set(test1_sig.minhash.get_mins())
mins -= set(test2_sig.minhash.get_mins())
mins = set(test1_sig.minhash.hashes.keys())
mins -= set(test2_sig.minhash.hashes.keys())

assert set(actual_subtract_sig.minhash.get_mins()) == set(mins)
assert set(actual_subtract_sig.minhash.hashes.keys()) == set(mins)


@utils.in_tempdir
Expand All @@ -504,7 +504,7 @@ def test_sig_subtract_1_multisig(c):

actual_subtract_sig = sourmash.load_one_signature(out)

assert not set(actual_subtract_sig.minhash.get_mins())
assert not set(actual_subtract_sig.minhash.hashes.keys())


@utils.in_tempdir
Expand Down Expand Up @@ -1067,12 +1067,12 @@ def test_sig_downsample_1_scaled_to_num(c):
out = c.last_result.out

actual_downsample_sig = sourmash.load_one_signature(out)
actual_mins = actual_downsample_sig.minhash.get_mins()
actual_mins = actual_downsample_sig.minhash.hashes.keys()
actual_mins = list(actual_mins)
actual_mins.sort()

test_downsample_sig = sourmash.load_one_signature(sig47)
test_mins = test_downsample_sig.minhash.get_mins()
test_mins = test_downsample_sig.minhash.hashes.keys()
test_mins = list(test_mins)
test_mins.sort()
test_mins = test_mins[:500] # take 500 smallest
Expand Down Expand Up @@ -1130,8 +1130,8 @@ def test_sig_downsample_2_num_to_scaled(c):
select_moltype='DNA')
actual_downsample_sig = sourmash.load_one_signature(out)

test_mins = test_downsample_sig.minhash.get_mins()
actual_mins = actual_downsample_sig.minhash.get_mins()
test_mins = test_downsample_sig.minhash.hashes.keys()
actual_mins = actual_downsample_sig.minhash.hashes.keys()

# select those mins that are beneath the new max hash...
max_hash = actual_downsample_sig.minhash.max_hash
Expand Down
4 changes: 2 additions & 2 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ def test_linear_gather_threshold_1():
# now construct query signatures with specific numbers of hashes --
# note, these signatures all have scaled=1000.

mins = list(sorted(sig2.minhash.get_mins()))
mins = list(sorted(sig2.minhash.hashes.keys()))
new_mh = sig2.minhash.copy_and_clear()

# query with empty hashes
Expand Down Expand Up @@ -289,7 +289,7 @@ def test_linear_gather_threshold_5():
# now construct query signatures with specific numbers of hashes --
# note, these signatures all have scaled=1000.

mins = list(sorted(sig2.minhash.get_mins()))
mins = list(sorted(sig2.minhash.hashes.keys()))
new_mh = sig2.minhash.copy_and_clear()

# add five hashes
Expand Down
15 changes: 7 additions & 8 deletions tests/test_jaccard.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,10 @@ def test_dna_mh(track_abundance):
for i in range(len(seq) - 3):
e2.add_kmer(seq[i:i + 4])

assert e1.get_mins() == e2.get_mins()
print(e1.get_mins())
assert 726311917625663847 in e1.get_mins()
assert 3697418565283905118 in e1.get_mins()
assert e1.hashes.keys() == e2.hashes.keys()
print(e1.hashes.keys())
assert 726311917625663847 in e1.hashes.keys()
assert 3697418565283905118 in e1.hashes.keys()


def test_protein_mh(track_abundance):
Expand All @@ -95,8 +95,8 @@ def test_protein_mh(track_abundance):
kmer = seq[i:i + 6]
e2.add_kmer(kmer)

assert e1.get_mins() == e2.get_mins()
assert 901193879228338100 in e1.get_mins()
assert e1.hashes.keys() == e2.hashes.keys()
assert 901193879228338100 in e1.hashes.keys()


def test_pickle(track_abundance):
Expand All @@ -116,8 +116,7 @@ def test_pickle(track_abundance):
fp2 = BytesIO(fp.getvalue())
e2 = pickle.load(fp2)

assert e1.get_mins(with_abundance=track_abundance) == \
e2.get_mins(with_abundance=track_abundance)
assert e1.hashes == e2.hashes
assert e1.num == e2.num
assert e1.ksize == e2.ksize
assert e1.is_protein == e2.is_protein
Expand Down
Loading