[MRG] replace chernoff bounds with exact probabilities (#2268)

* replace chernoff bounds with exact probabilities * fix failing tests (though there are some depreciation warnings that aren't showing up) #2268 #2267
sourmash-bio · Sep 8, 2022 · a8bd648 · a8bd648
1 parent 569d212
commit a8bd648
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 24 deletions.
diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
@@ -6,6 +6,7 @@
 from dataclasses import dataclass, field
 from scipy.optimize import brentq
 from scipy.stats import norm as scipy_norm
+from scipy.stats import binom
 import numpy as np
 from math import log, exp
 
@@ -168,7 +169,8 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
     Computes the probability that the estimate: sketch_size * scaled deviates from the true
     set_size by more than relative_error. This relies on the fact that the sketch_size
     is binomially distributed with parameters sketch_size and 1/scale. The two-sided Chernoff
-    bounds are used.
+    bounds are used. This is depreciated in favor of set_size_exact_prob due to the later
+    being accurate even for very small set sizes
     @param set_size: The number of distinct k-mers in the given set
     @param relative_error: the desired relative error (defaults to 5%)
     @return: float (the upper bound probability)
@@ -177,6 +179,28 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
     return upper_bound
 
 
+def set_size_exact_prob(set_size, scaled, *, relative_error=0.05):
+    """
+    Computes the exact probability that the estimate: sketch_size * scaled deviates from the true
+    set_size by more than relative_error. This relies on the fact that the sketch_size
+    is binomially distributed with parameters sketch_size and 1/scale. The CDF of the binomial distribution
+    is used.
+    @param set_size: The number of distinct k-mers in the given set
+    @param relative_error: the desired relative error (defaults to 5%)
+    @return: float (the upper bound probability)
+    """
+    # Need to check if the edge case is an integer or not. If not, don't include it in the equation
+    pmf_arg = -set_size/scaled * (relative_error - 1)
+    if pmf_arg == int(pmf_arg):
+        prob = binom.cdf(set_size/scaled * (relative_error + 1), set_size, 1/scaled) - \
+               binom.cdf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + \
+               binom.pmf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled)
+    else:
+        prob = binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) - \
+               binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled)
+    return prob
+
+
 def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
     """helper function
     Note that scaled here needs to be between 0 and 1

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
@@ -6,7 +6,7 @@ class MinHash - core MinHash class.
 class FrozenMinHash - read-only MinHash class.
 """
 from __future__ import unicode_literals, division
-from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_chernoff
+from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_exact_prob
 from .logging import notify
 
 import numpy as np
@@ -1051,7 +1051,7 @@ def size_is_accurate(self, relative_error=0.20, confidence=0.95):
         if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
             raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
         # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented 
-        probability = set_size_chernoff(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
+        probability = set_size_exact_prob(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
         return probability >= confidence
 
 

diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py
@@ -6,7 +6,7 @@
 from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common,
                                     handle_seqlen_nkmers, jaccard_to_distance,
                                     ANIResult, ciANIResult, jaccardANIResult, var_n_mutated,
-                                    set_size_chernoff)
+                                    set_size_chernoff, set_size_exact_prob)
 
 def test_aniresult():
     res = ANIResult(0.4, 0.1)
@@ -416,4 +416,37 @@ def test_set_size_chernoff():
     set_size = 10
     s = 1/.01
     value_from_mathematica = -1
-    assert np.abs(set_size_chernoff(set_size, s,relative_error=rel_error) - value_from_mathematica) < eps
+    assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps
+
+
+def test_set_size_exact_prob():
+    # values obtained from Mathematica
+    # specifically: Probability[Abs[X*s - n]/n <= delta,
+    #   X \[Distributed] BinomialDistribution[n, 1/s]] // N
+    set_size = 100
+    scaled = 2
+    relative_error = 0.05
+    prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
+    true_prob = 0.382701
+    np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
+
+    set_size = 200
+    scaled = 5
+    relative_error = 0.15
+    prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
+    true_prob = 0.749858
+    np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
+
+    set_size = 10
+    scaled = 10
+    relative_error = 0.10
+    prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
+    true_prob = 0.38742
+    np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
+
+    set_size = 1000
+    scaled = 10
+    relative_error = 0.10
+    prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
+    true_prob = 0.73182
+    np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
diff --git a/tests/test_minhash.py b/tests/test_minhash.py
@@ -36,6 +36,7 @@
 import itertools
 import pickle
 import math
+import numpy as np
 
 import pytest
 
@@ -3144,9 +3145,9 @@ def test_containment_ani_ci_tiny_testdata():
 
     m2_cani_m1 = mh2.containment_ani(mh1, estimate_ci=True)
     print(m2_cani_m1)
-    assert m2_cani_m1.ani == None
+    # from the formula ANI = c^(1/k) for c=3/4 and k=21
+    np.testing.assert_almost_equal(m2_cani_m1.ani, 0.986394259982259, decimal=3)
     m2_cani_m1.size_is_inaccurate = False
-    assert m2_cani_m1.ani == 0.986394259982259
     assert m2_cani_m1.ani_low == None
     assert m2_cani_m1.ani_high == None
 
@@ -3207,7 +3208,7 @@ def test_minhash_set_size_estimate_is_accurate():
     f2 = utils.get_test_data('2+63.fa.sig')
     mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
     mh2 = sourmash.load_one_signature(f2).minhash
-    mh1_ds = mh1.downsample(scaled=10000)
+    mh1_ds = mh1.downsample(scaled=100000)
     # check accuracy using default thresholds (rel_err= 0.2, confidence=0.95)
     assert mh1.size_is_accurate() == True
     assert mh1_ds.size_is_accurate() == False
@@ -3219,7 +3220,7 @@ def test_minhash_set_size_estimate_is_accurate():
 
     # change prob
     assert mh1.size_is_accurate(confidence=0.5) == True
-    assert mh1.size_is_accurate(confidence=1) == False
+    assert mh1.size_is_accurate(relative_error=0.001, confidence=1) == False
 
     # check that relative error and confidence must be between 0 and 1
     with pytest.raises(ValueError) as exc:
@@ -3236,15 +3237,16 @@ def test_minhash_set_size_estimate_is_accurate():
 
 
 def test_minhash_ani_inaccurate_size_est():
+    # TODO: It's actually really tricky to get the set size to be inaccurate. Eg. For a scale factor of 10000,
+    # you would need
     f1 = utils.get_test_data('2.fa.sig')
     f2 = utils.get_test_data('2+63.fa.sig')
     mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
     mh2 = sourmash.load_one_signature(f2).minhash
     # downsample
-    mh1_ds = mh1.downsample(scaled=10000)
-    mh2_ds = mh2.downsample(scaled=10000)
-
-    assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == False
+    mh1_ds = mh1.downsample(scaled=100000)
+    mh2_ds = mh2.downsample(scaled=100000)
+    assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == True
     assert mh1.size_is_accurate() == True
     assert mh1_ds.size_is_accurate() == False
     assert mh2.size_is_accurate() == True

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
@@ -5946,9 +5946,9 @@ def test_search_ani_containment_fail(runtmp):
         print(row)
         assert search_result_names == list(row.keys())
         assert round(float(row['similarity']), 3) == 0.967
-        assert row['ani'] == ""
-
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
+        assert row['ani'] == "0.998906999319701"
+    # With PR #2268, this error message should not appear
+    #assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
 
 
 def test_search_ani_containment_estimate_ci(runtmp):
@@ -6185,14 +6185,14 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
         assert row['query_name'] == 'tr1 4'
         assert row['query_md5'] == 'c9d5a795'
         assert row['query_bp'] == '910'
-        assert row['query_containment_ani']== ''
-        assert row['query_containment_ani_low']== ''
-        assert row['query_containment_ani_high']== ''
-        assert row['match_containment_ani'] == ''
-        assert row['match_containment_ani_low'] == ''
-        assert row['match_containment_ani_high'] == ''
-        assert row['average_containment_ani'] == ''
-        assert row['max_containment_ani'] ==''
+        assert row['query_containment_ani'] == '1.0'
+        assert row['query_containment_ani_low'] == '1.0'
+        assert row['query_containment_ani_high'] == '1.0'
+        assert row['match_containment_ani'] == '1.0'
+        assert row['match_containment_ani_low'] == '1.0'
+        assert row['match_containment_ani_high'] == '1.0'
+        assert row['average_containment_ani'] == '1.0'
+        assert row['max_containment_ani'] == '1.0'
         assert row['potential_false_negative'] == 'False'