
Commit

fix k -1
bluegenes committed May 4, 2022
1 parent 73bcd1d commit f4f5935
Showing 7 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/sourmash/minhash.py
@@ -921,7 +921,7 @@ def std_abundance(self):
    def unique_covered_bp(self):
        if not self.scaled:
            raise TypeError("can only calculate bp for scaled MinHashes")
-        return len(self.hashes) * self.scaled + (self.ksize + 1)
+        return len(self.hashes) * self.scaled + (self.ksize - 1)


class FrozenMinHash(MinHash):
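Why (ksize - 1): a window of length L contains L - k + 1 k-mers, so a scaled sketch with n retained hashes is expected to cover roughly n * scaled + (k - 1) bp; the old formula added k + 1 and overshot by 2. The same correction is applied to query_bp in search.py and intersect_bp in sketchcomparison.py below. A minimal sketch of the corrected arithmetic, using made-up values for n_hashes, scaled, and ksize (not taken from sourmash or its tests):

def estimated_covered_bp(n_hashes, scaled, ksize):
    # each retained hash stands in for roughly `scaled` distinct k-mers,
    # and the last k-mer extends ksize - 1 bases beyond its start position
    return n_hashes * scaled + (ksize - 1)

print(estimated_covered_bp(n_hashes=3, scaled=1000, ksize=31))  # 3030; the old formula gave 3032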
2 changes: 1 addition & 1 deletion src/sourmash/search.py
@@ -454,7 +454,7 @@ def build_gather_result(self):
        # this affects estimation of original query information, and requires us to pass in orig_query_len and orig_query_abunds.
        # we also need to overwrite self.query_bp, self.query_n_hashes, and self.query_abundance
        # todo: find a better solution?
-        self.query_bp = self.orig_query_len * self.query.minhash.scaled + self.ksize + 1
+        self.query_bp = self.orig_query_len * self.query.minhash.scaled + self.ksize - 1
        self.query_n_hashes = self.orig_query_len

        # calculate intersection with query hashes:
2 changes: 1 addition & 1 deletion src/sourmash/sketchcomparison.py
@@ -104,7 +104,7 @@ def pass_threshold(self):

    @property
    def intersect_bp(self):
-        return (len(self.intersect_mh) * self.cmp_scaled) + (self.ksize + 1)
+        return (len(self.intersect_mh) * self.cmp_scaled) + (self.ksize - 1)

    @property
    def mh1_containment(self):
2 changes: 1 addition & 1 deletion tests/test_minhash.py
@@ -2813,7 +2813,7 @@ def test_unique_covered_bp(track_abundance):
    mh1.add_many((1, 2))
    mh2.add_many((1, 5))

-    assert mh1.unique_covered_bp == 26
+    assert mh1.unique_covered_bp == 24
    with pytest.raises(TypeError) as exc:
        mh2.unique_covered_bp
    assert "can only calculate bp for scaled MinHashes" in str(exc)
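The test updates in this commit all follow mechanically from the formula change: since (ksize + 1) - (ksize - 1) = 2, every expected bp value drops by exactly 2, independent of ksize and scaled (26 -> 24, 30 -> 28, 902 -> 900, 942 -> 940, 2529032 -> 2529030, 5177032 -> 5177030). A quick, self-contained check of that constant offset (illustrative only, not part of the sourmash test suite):

def old_bp(n, scaled, k):
    return n * scaled + (k + 1)

def new_bp(n, scaled, k):
    return n * scaled + (k - 1)

# the two estimates differ by 2 for any combination of inputs
assert all(old_bp(n, s, k) - new_bp(n, s, k) == 2
           for n in (2, 5, 471) for s in (1, 2, 1000) for k in (21, 31, 51))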
2 changes: 1 addition & 1 deletion tests/test_prefetch.py
@@ -193,7 +193,7 @@ def test_prefetch_csv_out(runtmp, linear_gather):
    assert c.last_result.status == 0
    assert os.path.exists(csvout)

-    expected_intersect_bp = [2529032, 5177032]
+    expected_intersect_bp = [2529030, 5177030]
    with open(csvout, 'rt', newline="") as fp:
        r = csv.DictReader(fp)
        for (row, expected) in zip(r, expected_intersect_bp):
10 changes: 5 additions & 5 deletions tests/test_sketchcomparison.py
@@ -42,7 +42,7 @@ def test_FracMinHashComparison(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = a.flatten().intersection(b.flatten())
    assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten())
-    assert cmp.intersect_bp == 26
+    assert cmp.intersect_bp == 24
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
@@ -100,7 +100,7 @@ def test_FracMinHashComparison_downsample(track_abundance):
    assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
@@ -158,7 +158,7 @@ def test_FracMinHashComparison_autodownsample(track_abundance):
    assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
@@ -215,7 +215,7 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    # with ignore_abundance = True, all of these should not be usable. Do we want errors, or ""/None?
    with pytest.raises(TypeError) as exc:
@@ -266,7 +266,7 @@ def test_FracMinHashComparison_fail_threshold(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert not cmp.pass_threshold # threshold is 40; this should fail


24 changes: 12 additions & 12 deletions tests/test_sourmash.py
@@ -2872,8 +2872,8 @@ def test_gather_csv(runtmp, linear_gather, prefetch_gather):
        reader = csv.DictReader(fp)
        row = next(reader)
        print(row)
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -2885,7 +2885,7 @@ def test_gather_csv(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'


def test_gather_abund_x_abund(runtmp, prefetch_gather, linear_gather):
@@ -2974,7 +2974,7 @@ def test_gather_multiple_sbts_save_prefetch_csv(runtmp, linear_gather):
    with open(runtmp.output('prefetch.csv')) as f:
        output = f.read()
        print((output,))
-        assert '902,0.925531914893617,0.9666666666666667' in output
+        assert '900,0.925531914893617,0.9666666666666667' in output


def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather):
@@ -3004,7 +3004,7 @@ def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather):
    with open(runtmp.output('prefetch.csv')) as f:
        output = f.read()
        print((output,))
-        assert '902,0.925531914893617,0.9666666666666667' in output
+        assert '900,0.925531914893617,0.9666666666666667' in output
    assert os.path.exists(runtmp.output('out.zip'))


@@ -3048,7 +3048,7 @@ def test_gather_file_output(runtmp, linear_gather, prefetch_gather):
    with open(runtmp.output('foo.out')) as f:
        output = f.read()
        print((output,))
-        assert '942,1.0,1.0' in output
+        assert '940,1.0,1.0' in output


def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather):
@@ -5628,8 +5628,8 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather):
        print(row)
        assert gather_result_names == list(row.keys())
        assert gather_result_names_ci != list(row.keys())
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -5641,7 +5641,7 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'
        assert row['query_containment_ani']== '1.0'
        assert row['match_containment_ani'] == '1.0'
        assert row['average_containment_ani'] == '1.0'
@@ -5675,8 +5675,8 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
        row = next(reader)
        print(row)
        assert gather_result_names == list(row.keys())
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -5688,7 +5688,7 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'
        assert row['query_containment_ani']== '1.0'
        assert row['query_containment_ani_low']== ''
        assert row['query_containment_ani_high']== ''
