
Commit

fix k -1
bluegenes committed May 4, 2022
1 parent 73bcd1d commit f4f5935
Showing 7 changed files with 22 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/sourmash/minhash.py
@@ -921,7 +921,7 @@ def std_abundance(self):
    def unique_covered_bp(self):
        if not self.scaled:
            raise TypeError("can only calculate bp for scaled MinHashes")
-        return len(self.hashes) * self.scaled + (self.ksize + 1)
+        return len(self.hashes) * self.scaled + (self.ksize - 1)


class FrozenMinHash(MinHash):
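Why (ksize - 1): a window of length L contains L - k + 1 k-mers, so a scaled sketch with n retained hashes is expected to cover roughly n * scaled + (k - 1) bp; the old formula added k + 1 and overshot by 2. The same correction is applied to query_bp in search.py and intersect_bp in sketchcomparison.py below. A minimal sketch of the corrected arithmetic, using made-up values for n_hashes, scaled, and ksize (not taken from sourmash or its tests):

def estimated_covered_bp(n_hashes, scaled, ksize):
    # each retained hash stands in for roughly `scaled` distinct k-mers,
    # and the last k-mer extends ksize - 1 bases beyond its start position
    return n_hashes * scaled + (ksize - 1)

print(estimated_covered_bp(n_hashes=3, scaled=1000, ksize=31))  # 3030; the old formula gave 3032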
2 changes: 1 addition & 1 deletion src/sourmash/search.py
@@ -454,7 +454,7 @@ def build_gather_result(self):
        # this affects estimation of original query information, and requires us to pass in orig_query_len and orig_query_abunds.
        # we also need to overwrite self.query_bp, self.query_n_hashes, and self.query_abundance
        # todo: find a better solution?
-        self.query_bp = self.orig_query_len * self.query.minhash.scaled + self.ksize + 1
+        self.query_bp = self.orig_query_len * self.query.minhash.scaled + self.ksize - 1
        self.query_n_hashes = self.orig_query_len

        # calculate intersection with query hashes:
2 changes: 1 addition & 1 deletion src/sourmash/sketchcomparison.py
@@ -104,7 +104,7 @@ def pass_threshold(self):

    @property
    def intersect_bp(self):
-        return (len(self.intersect_mh) * self.cmp_scaled) + (self.ksize + 1)
+        return (len(self.intersect_mh) * self.cmp_scaled) + (self.ksize - 1)

    @property
    def mh1_containment(self):
2 changes: 1 addition & 1 deletion tests/test_minhash.py
@@ -2813,7 +2813,7 @@ def test_unique_covered_bp(track_abundance):
    mh1.add_many((1, 2))
    mh2.add_many((1, 5))

-    assert mh1.unique_covered_bp == 26
+    assert mh1.unique_covered_bp == 24
    with pytest.raises(TypeError) as exc:
        mh2.unique_covered_bp
    assert "can only calculate bp for scaled MinHashes" in str(exc)
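The test updates in this commit all follow mechanically from the formula change: since (ksize + 1) - (ksize - 1) = 2, every expected bp value drops by exactly 2, independent of ksize and scaled (26 -> 24, 30 -> 28, 902 -> 900, 942 -> 940, 2529032 -> 2529030, 5177032 -> 5177030). A quick, self-contained check of that constant offset (illustrative only, not part of the sourmash test suite):

def old_bp(n, scaled, k):
    return n * scaled + (k + 1)

def new_bp(n, scaled, k):
    return n * scaled + (k - 1)

# the two estimates differ by 2 for any combination of inputs
assert all(old_bp(n, s, k) - new_bp(n, s, k) == 2
           for n in (2, 5, 471) for s in (1, 2, 1000) for k in (21, 31, 51))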
2 changes: 1 addition & 1 deletion tests/test_prefetch.py
@@ -193,7 +193,7 @@ def test_prefetch_csv_out(runtmp, linear_gather):
    assert c.last_result.status == 0
    assert os.path.exists(csvout)

-    expected_intersect_bp = [2529032, 5177032]
+    expected_intersect_bp = [2529030, 5177030]
    with open(csvout, 'rt', newline="") as fp:
        r = csv.DictReader(fp)
        for (row, expected) in zip(r, expected_intersect_bp):
10 changes: 5 additions & 5 deletions tests/test_sketchcomparison.py
@@ -42,7 +42,7 @@ def test_FracMinHashComparison(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = a.flatten().intersection(b.flatten())
    assert cmp.intersect_mh == intersect_mh == b.flatten().intersection(a.flatten())
-    assert cmp.intersect_bp == 26
+    assert cmp.intersect_bp == 24
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == a.angular_similarity(b) == b.angular_similarity(a)
@@ -100,7 +100,7 @@ def test_FracMinHashComparison_downsample(track_abundance):
    assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
@@ -158,7 +158,7 @@ def test_FracMinHashComparison_autodownsample(track_abundance):
    assert cmp.jaccard == ds_a.jaccard(ds_b) == ds_b.jaccard(ds_a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    if track_abundance:
        assert cmp.angular_similarity == ds_a.angular_similarity(ds_b) == ds_b.angular_similarity(ds_a)
@@ -215,7 +215,7 @@ def test_FracMinHashComparison_ignore_abundance(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert cmp.pass_threshold # default threshold is 0; this should pass
    # with ignore_abundance = True, all of these should not be usable. Do we want errors, or ""/None?
    with pytest.raises(TypeError) as exc:
@@ -266,7 +266,7 @@ def test_FracMinHashComparison_fail_threshold(track_abundance):
    assert cmp.jaccard == a.jaccard(b) == b.jaccard(a)
    intersect_mh = ds_a.flatten().intersection(ds_b.flatten())
    assert cmp.intersect_mh == intersect_mh == ds_b.flatten().intersection(ds_a.flatten())
-    assert cmp.intersect_bp == 30
+    assert cmp.intersect_bp == 28
    assert not cmp.pass_threshold # threshold is 40; this should fail


24 changes: 12 additions & 12 deletions tests/test_sourmash.py
@@ -2872,8 +2872,8 @@ def test_gather_csv(runtmp, linear_gather, prefetch_gather):
        reader = csv.DictReader(fp)
        row = next(reader)
        print(row)
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -2885,7 +2885,7 @@ def test_gather_csv(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'


def test_gather_abund_x_abund(runtmp, prefetch_gather, linear_gather):
@@ -2974,7 +2974,7 @@ def test_gather_multiple_sbts_save_prefetch_csv(runtmp, linear_gather):
    with open(runtmp.output('prefetch.csv')) as f:
        output = f.read()
        print((output,))
-        assert '902,0.925531914893617,0.9666666666666667' in output
+        assert '900,0.925531914893617,0.9666666666666667' in output


def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather):
@@ -3004,7 +3004,7 @@ def test_gather_multiple_sbts_save_prefetch_and_prefetch_csv(runtmp, linear_gather):
    with open(runtmp.output('prefetch.csv')) as f:
        output = f.read()
        print((output,))
-        assert '902,0.925531914893617,0.9666666666666667' in output
+        assert '900,0.925531914893617,0.9666666666666667' in output
    assert os.path.exists(runtmp.output('out.zip'))


@@ -3048,7 +3048,7 @@ def test_gather_file_output(runtmp, linear_gather, prefetch_gather):
    with open(runtmp.output('foo.out')) as f:
        output = f.read()
        print((output,))
-        assert '942,1.0,1.0' in output
+        assert '940,1.0,1.0' in output


def test_gather_f_match_orig(runtmp, linear_gather, prefetch_gather):
@@ -5628,8 +5628,8 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather):
        print(row)
        assert gather_result_names == list(row.keys())
        assert gather_result_names_ci != list(row.keys())
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -5641,7 +5641,7 @@ def test_gather_ani_csv(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'
        assert row['query_containment_ani']== '1.0'
        assert row['match_containment_ani'] == '1.0'
        assert row['average_containment_ani'] == '1.0'
@@ -5675,8 +5675,8 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
        row = next(reader)
        print(row)
        assert gather_result_names == list(row.keys())
-        assert float(row['intersect_bp']) == 942
-        assert float(row['unique_intersect_bp']) == 942
+        assert float(row['intersect_bp']) == 940
+        assert float(row['unique_intersect_bp']) == 940
        assert float(row['remaining_bp']) == 0
        assert float(row['f_orig_query']) == 1.0
        assert float(row['f_unique_to_query']) == 1.0
@@ -5688,7 +5688,7 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
        assert row['query_filename'].endswith('short2.fa')
        assert row['query_name'] == 'tr1 4'
        assert row['query_md5'] == 'c9d5a795'
-        assert row['query_bp'] == '942'
+        assert row['query_bp'] == '940'
        assert row['query_containment_ani']== '1.0'
        assert row['query_containment_ani_low']== ''
        assert row['query_containment_ani_high']== ''
