Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] rename covered_bp property to better reflect function #2050

Merged
merged 15 commits into from
May 13, 2022
16 changes: 9 additions & 7 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,10 +932,14 @@ def std_abundance(self):
return None

@property
def covered_bp(self):
def unique_dataset_hashes(self):
"""
Approximate total number of hashes (num_hashes * scaled).
"""
if not self.scaled:
raise TypeError("can only calculate bp for scaled MinHashes")
return len(self.hashes) * self.scaled
raise TypeError("can only approximate unique_dataset_hashes for scaled MinHashes")
# TODO: replace set_size with HLL estimate when that gets implemented
return len(self.hashes) * self.scaled # + (self.ksize - 1) for bp estimation

def size_is_accurate(self, relative_error=0.05, confidence=0.95):
"""
Expand All @@ -947,10 +951,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95):
"""
if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
raise ValueError("Error: relative error and confidence values must be between 0 and 1.")

# TODO: replace set_size with HLL estimate when that gets implemented
set_size = len(self.hashes) * self.scaled
probability = set_size_chernoff(set_size, self.scaled, relative_error=relative_error)
# TODO: replace unique_dataset_hashes with HLL estimation when it gets implemented
probability = set_size_chernoff(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
return probability >= confidence


Expand Down
12 changes: 6 additions & 6 deletions src/sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -353,10 +353,10 @@ def init_sigcomparison(self):
else:
raise TypeError("Error: prefetch and gather results must be between scaled signatures.")
self.get_cmpinfo() # grab comparison metadata
self.intersect_bp = self.cmp.intersect_bp
self.intersect_bp = self.cmp.total_unique_intersect_hashes
self.max_containment = self.cmp.max_containment
self.query_bp = self.mh1.covered_bp
self.match_bp = self.mh2.covered_bp
self.query_bp = self.mh1.unique_dataset_hashes
self.match_bp = self.mh2.unique_dataset_hashes
self.threshold = self.threshold_bp
self.estimate_containment_ani()

Expand Down Expand Up @@ -458,7 +458,7 @@ def build_gather_result(self):
self.query_n_hashes = self.orig_query_len

# calculate intersection with query hashes:
self.unique_intersect_bp = self.gather_comparison.intersect_bp
self.unique_intersect_bp = self.gather_comparison.total_unique_intersect_hashes

# calculate fraction of subject match with orig query
self.f_match_orig = self.cmp.mh2_containment
Expand All @@ -473,7 +473,7 @@ def build_gather_result(self):
self.f_unique_to_query = len(self.gather_comparison.intersect_mh)/self.orig_query_len

# here, need to make sure to use the mh1_cmp (bc was downsampled to cmp_scaled)
self.remaining_bp = (self.gather_comparison.mh1_cmp.covered_bp - self.gather_comparison.intersect_bp)
self.remaining_bp = (self.gather_comparison.mh1_cmp.unique_dataset_hashes - self.gather_comparison.total_unique_intersect_hashes)

# calculate stats on abundances, if desired.
self.average_abund, self.median_abund, self.std_abund = None, None, None
Expand Down Expand Up @@ -643,7 +643,7 @@ def __init__(self, query, counters, *,
# track original query information for later usage?
track_abundance = query.minhash.track_abundance and not ignore_abundance
self.orig_query = query
self.orig_query_bp = len(query.minhash) * query.minhash.scaled
self.orig_query_bp = query.minhash.unique_dataset_hashes
self.orig_query_filename = query.filename
self.orig_query_name = query.name
self.orig_query_md5 = query.md5sum()[:8]
Expand Down
11 changes: 8 additions & 3 deletions src/sourmash/sketchcomparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,16 @@ def __post_init__(self):

@property
def pass_threshold(self):
return self.intersect_bp >= self.threshold_bp
return self.total_unique_intersect_hashes >= self.threshold_bp

@property
def intersect_bp(self):
return len(self.intersect_mh) * self.cmp_scaled
def total_unique_intersect_hashes(self):
"""
Approximately equal to intersect_bp.
To get true bp estimates, we would need to add `(k-1)`. However, this complicates
the iterative gather algorithm, so let's stick with hashes.
"""
return len(self.intersect_mh) * self.cmp_scaled # + (ksize-1) #for bp estimation

@property
def mh1_containment(self):
Expand Down
10 changes: 5 additions & 5 deletions tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -2804,19 +2804,19 @@ def test_std_abundance(track_abundance):
assert not mh2.std_abundance


def test_covered_bp(track_abundance):
"test covered_bp"
def test_unique_dataset_hashes(track_abundance):
"test unique_dataset_hashes approximation"
mh1 = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
mh2 = MinHash(4, 21, track_abundance=track_abundance)

mh1.add_many((1, 2, 3, 4))
mh1.add_many((1, 2))
mh2.add_many((1, 5))

assert mh1.covered_bp == 4 # hmmm...
assert mh1.unique_dataset_hashes == 4
with pytest.raises(TypeError) as exc:
mh2.covered_bp
assert "can only calculate bp for scaled MinHashes" in str(exc)
mh2.unique_dataset_hashes
assert "can only approximate unique_dataset_hashes for scaled MinHashes" in str(exc)


def test_containment_ANI():
Expand Down
68 changes: 34 additions & 34 deletions tests/test_sketchcomparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,21 @@

# can we parameterize scaled too (so don't need separate downsample tests?)
def test_FracMinHashComparison(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
else:
a.add_many(a_values.keys())
b.add_many(b_values.keys())

# build FracMinHashComparison
# build FracMinHashComparison
cmp = FracMinHashComparison(a, b)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -42,7 +42,7 @@ def test_FracMinHashComparison(track_abundance):
assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
intersect_mh = a.flatten().intersection(b.flatten())
assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten())
assert cmp.intersect_bp == 4
assert cmp.total_unique_intersect_hashes == 4
assert cmp.pass_threshold # default threshold is 0; this should pass
if track_abundance:
assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
Expand All @@ -62,16 +62,16 @@ def test_FracMinHashComparison(track_abundance):
assert "Error: Angular (cosine) similarity requires both sketches to track hash abundance." in str(exc)
assert cmp.weighted_intersection(from_mh=cmp.mh1).hashes == intersect_mh.hashes
assert cmp.weighted_intersection(from_mh=cmp.mh2).hashes == intersect_mh.hashes


def test_FracMinHashComparison_downsample(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
Expand All @@ -83,7 +83,7 @@ def test_FracMinHashComparison_downsample(track_abundance):
ds_a = a.downsample(scaled=cmp_scaled)
ds_b = b.downsample(scaled=cmp_scaled)

# build FracMinHashComparison
# build FracMinHashComparison
cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -100,7 +100,7 @@ def test_FracMinHashComparison_downsample(track_abundance):
assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
assert cmp.intersect_bp == 8
assert cmp.total_unique_intersect_hashes == 8
assert cmp.pass_threshold # default threshold is 0; this should pass
if track_abundance:
assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
Expand All @@ -123,13 +123,13 @@ def test_FracMinHashComparison_downsample(track_abundance):


def test_FracMinHashComparison_autodownsample(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
b = MinHash(0, 21, scaled=2, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
Expand All @@ -141,7 +141,7 @@ def test_FracMinHashComparison_autodownsample(track_abundance):
ds_a = a.downsample(scaled=cmp_scaled)
ds_b = b.downsample(scaled=cmp_scaled)

# build FracMinHashComparison
# build FracMinHashComparison
cmp = FracMinHashComparison(a, b)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -158,7 +158,7 @@ def test_FracMinHashComparison_autodownsample(track_abundance):
assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
assert cmp.intersect_bp == 8
assert cmp.total_unique_intersect_hashes == 8
assert cmp.pass_threshold # default threshold is 0; this should pass
if track_abundance:
assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
Expand Down Expand Up @@ -188,19 +188,19 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance):
a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
intersection_w_abund = {1:8, 3:5, 5:3, 8:3}

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
else:
a.add_many(a_values.keys())
b.add_many(b_values.keys())

cmp_scaled = 2
ds_a = a.flatten().downsample(scaled=cmp_scaled)
ds_b = b.flatten().downsample(scaled=cmp_scaled)

# build FracMinHashComparison
# build FracMinHashComparison
cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, ignore_abundance=True)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -215,7 +215,7 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance):
assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
assert cmp.intersect_bp == 8
assert cmp.total_unique_intersect_hashes == 8
assert cmp.pass_threshold # default threshold is 0; this should pass
# with ignore_abundance = True, all of these should not be usable. Do we want errors, or ""/None?
with pytest.raises(TypeError) as exc:
Expand All @@ -233,7 +233,7 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance):


def test_FracMinHashComparison_fail_threshold(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
b = MinHash(0, 21, scaled=1, track_abundance=track_abundance)

Expand All @@ -252,7 +252,7 @@ def test_FracMinHashComparison_fail_threshold(track_abundance):
ds_b = b.flatten().downsample(scaled=cmp_scaled)

# build FracMinHashComparison
cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, threshold_bp=10)
cmp = FracMinHashComparison(a, b, cmp_scaled = cmp_scaled, threshold_bp=40)
assert cmp.mh1 == a
assert cmp.mh2 == b
assert cmp.ignore_abundance == False
Expand All @@ -266,8 +266,8 @@ def test_FracMinHashComparison_fail_threshold(track_abundance):
assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
assert cmp.intersect_bp == 8
assert not cmp.pass_threshold # threshold is 10; this should fail
assert cmp.total_unique_intersect_hashes == 8
assert not cmp.pass_threshold # threshold is 40; this should fail


def test_FracMinHashComparison_potential_false_negative():
Expand Down Expand Up @@ -423,23 +423,23 @@ def test_FracMinHashComparison_redownsample_without_scaled(track_abundance):


def test_NumMinHashComparison(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
b = MinHash(10, 21, scaled=0, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
else:
a.add_many(a_values.keys())
b.add_many(b_values.keys())

assert a.num and b.num and not a.scaled and not b.scaled
# build NumMinHashComparison

# build NumMinHashComparison
cmp = NumMinHashComparison(a, b)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -465,26 +465,26 @@ def test_NumMinHashComparison(track_abundance):


def test_NumMinHashComparison_downsample(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
b = MinHash(10, 21, scaled=0, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
else:
a.add_many(a_values.keys())
b.add_many(b_values.keys())

assert a.num and b.num and not a.scaled and not b.scaled

cmp_num = 5
ds_a = a.downsample(num=cmp_num)
ds_b = b.downsample(num=cmp_num)
# build NumMinHashComparison
# build NumMinHashComparison
cmp = NumMinHashComparison(a, b, cmp_num = cmp_num)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand All @@ -510,26 +510,26 @@ def test_NumMinHashComparison_downsample(track_abundance):


def test_NumMinHashComparison_autodownsample(track_abundance):
# build FracMinHash Comparison and check values
# build FracMinHash Comparison and check values
a = MinHash(10, 21, scaled=0, track_abundance=track_abundance)
b = MinHash(5, 21, scaled=0, track_abundance=track_abundance)

a_values = { 1:5, 3:3, 5:2, 8:2}
b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }

if track_abundance:
a.set_abundances(a_values)
b.set_abundances(b_values)
else:
a.add_many(a_values.keys())
b.add_many(b_values.keys())

assert a.num and b.num and not a.scaled and not b.scaled

cmp_num = 5
ds_a = a.downsample(num=cmp_num)
ds_b = b.downsample(num=cmp_num)
# build NumMinHashComparison
# build NumMinHashComparison
cmp = NumMinHashComparison(a, b)
assert cmp.mh1 == a
assert cmp.mh2 == b
Expand Down