Skip to content

Commit

Permalink
[MRG] replace chernoff bounds with exact probabilities (#2268)
Browse files Browse the repository at this point in the history
* replace chernoff bounds with exact probabilities

* fix failing tests (though there are some depreciation warnings that
aren't showing up) #2268 #2267
  • Loading branch information
dkoslicki authored Sep 8, 2022
1 parent 569d212 commit a8bd648
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 24 deletions.
26 changes: 25 additions & 1 deletion src/sourmash/distance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dataclasses import dataclass, field
from scipy.optimize import brentq
from scipy.stats import norm as scipy_norm
from scipy.stats import binom
import numpy as np
from math import log, exp

Expand Down Expand Up @@ -168,7 +169,8 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
Computes the probability that the estimate: sketch_size * scaled deviates from the true
set_size by more than relative_error. This relies on the fact that the sketch_size
is binomially distributed with parameters sketch_size and 1/scale. The two-sided Chernoff
bounds are used.
bounds are used. This is depreciated in favor of set_size_exact_prob due to the later
being accurate even for very small set sizes
@param set_size: The number of distinct k-mers in the given set
@param relative_error: the desired relative error (defaults to 5%)
@return: float (the upper bound probability)
Expand All @@ -177,6 +179,28 @@ def set_size_chernoff(set_size, scaled, *, relative_error=0.05):
return upper_bound


def set_size_exact_prob(set_size, scaled, *, relative_error=0.05):
"""
Computes the exact probability that the estimate: sketch_size * scaled deviates from the true
set_size by more than relative_error. This relies on the fact that the sketch_size
is binomially distributed with parameters sketch_size and 1/scale. The CDF of the binomial distribution
is used.
@param set_size: The number of distinct k-mers in the given set
@param relative_error: the desired relative error (defaults to 5%)
@return: float (the upper bound probability)
"""
# Need to check if the edge case is an integer or not. If not, don't include it in the equation
pmf_arg = -set_size/scaled * (relative_error - 1)
if pmf_arg == int(pmf_arg):
prob = binom.cdf(set_size/scaled * (relative_error + 1), set_size, 1/scaled) - \
binom.cdf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled) + \
binom.pmf(-set_size/scaled * (relative_error - 1), set_size, 1/scaled)
else:
prob = binom.cdf(set_size / scaled * (relative_error + 1), set_size, 1 / scaled) - \
binom.cdf(-set_size / scaled * (relative_error - 1), set_size, 1 / scaled)
return prob


def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
"""helper function
Note that scaled here needs to be between 0 and 1
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class MinHash - core MinHash class.
class FrozenMinHash - read-only MinHash class.
"""
from __future__ import unicode_literals, division
from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_chernoff
from .distance_utils import jaccard_to_distance, containment_to_distance, set_size_exact_prob
from .logging import notify

import numpy as np
Expand Down Expand Up @@ -1051,7 +1051,7 @@ def size_is_accurate(self, relative_error=0.20, confidence=0.95):
if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
# to do: replace unique_dataset_hashes with HLL estimation when it gets implemented
probability = set_size_chernoff(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
probability = set_size_exact_prob(self.unique_dataset_hashes, self.scaled, relative_error=relative_error)
return probability >= confidence


Expand Down
37 changes: 35 additions & 2 deletions tests/test_distance_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sourmash.distance_utils import (containment_to_distance, get_exp_probability_nothing_common,
handle_seqlen_nkmers, jaccard_to_distance,
ANIResult, ciANIResult, jaccardANIResult, var_n_mutated,
set_size_chernoff)
set_size_chernoff, set_size_exact_prob)

def test_aniresult():
res = ANIResult(0.4, 0.1)
Expand Down Expand Up @@ -416,4 +416,37 @@ def test_set_size_chernoff():
set_size = 10
s = 1/.01
value_from_mathematica = -1
assert np.abs(set_size_chernoff(set_size, s,relative_error=rel_error) - value_from_mathematica) < eps
assert np.abs(set_size_chernoff(set_size, s, relative_error=rel_error) - value_from_mathematica) < eps


def test_set_size_exact_prob():
# values obtained from Mathematica
# specifically: Probability[Abs[X*s - n]/n <= delta,
# X \[Distributed] BinomialDistribution[n, 1/s]] // N
set_size = 100
scaled = 2
relative_error = 0.05
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.382701
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 200
scaled = 5
relative_error = 0.15
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.749858
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 10
scaled = 10
relative_error = 0.10
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.38742
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)

set_size = 1000
scaled = 10
relative_error = 0.10
prob = set_size_exact_prob(set_size, scaled, relative_error=relative_error)
true_prob = 0.73182
np.testing.assert_array_almost_equal(true_prob, prob, decimal=3)
18 changes: 10 additions & 8 deletions tests/test_minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import itertools
import pickle
import math
import numpy as np

import pytest

Expand Down Expand Up @@ -3144,9 +3145,9 @@ def test_containment_ani_ci_tiny_testdata():

m2_cani_m1 = mh2.containment_ani(mh1, estimate_ci=True)
print(m2_cani_m1)
assert m2_cani_m1.ani == None
# from the formula ANI = c^(1/k) for c=3/4 and k=21
np.testing.assert_almost_equal(m2_cani_m1.ani, 0.986394259982259, decimal=3)
m2_cani_m1.size_is_inaccurate = False
assert m2_cani_m1.ani == 0.986394259982259
assert m2_cani_m1.ani_low == None
assert m2_cani_m1.ani_high == None

Expand Down Expand Up @@ -3207,7 +3208,7 @@ def test_minhash_set_size_estimate_is_accurate():
f2 = utils.get_test_data('2+63.fa.sig')
mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
mh2 = sourmash.load_one_signature(f2).minhash
mh1_ds = mh1.downsample(scaled=10000)
mh1_ds = mh1.downsample(scaled=100000)
# check accuracy using default thresholds (rel_err= 0.2, confidence=0.95)
assert mh1.size_is_accurate() == True
assert mh1_ds.size_is_accurate() == False
Expand All @@ -3219,7 +3220,7 @@ def test_minhash_set_size_estimate_is_accurate():

# change prob
assert mh1.size_is_accurate(confidence=0.5) == True
assert mh1.size_is_accurate(confidence=1) == False
assert mh1.size_is_accurate(relative_error=0.001, confidence=1) == False

# check that relative error and confidence must be between 0 and 1
with pytest.raises(ValueError) as exc:
Expand All @@ -3236,15 +3237,16 @@ def test_minhash_set_size_estimate_is_accurate():


def test_minhash_ani_inaccurate_size_est():
# TODO: It's actually really tricky to get the set size to be inaccurate. Eg. For a scale factor of 10000,
# you would need
f1 = utils.get_test_data('2.fa.sig')
f2 = utils.get_test_data('2+63.fa.sig')
mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
mh2 = sourmash.load_one_signature(f2).minhash
# downsample
mh1_ds = mh1.downsample(scaled=10000)
mh2_ds = mh2.downsample(scaled=10000)

assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == False
mh1_ds = mh1.downsample(scaled=100000)
mh2_ds = mh2.downsample(scaled=100000)
assert mh1.size_is_accurate(relative_error=0.05, confidence=0.95) == True
assert mh1.size_is_accurate() == True
assert mh1_ds.size_is_accurate() == False
assert mh2.size_is_accurate() == True
Expand Down
22 changes: 11 additions & 11 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -5946,9 +5946,9 @@ def test_search_ani_containment_fail(runtmp):
print(row)
assert search_result_names == list(row.keys())
assert round(float(row['similarity']), 3) == 0.967
assert row['ani'] == ""

assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
assert row['ani'] == "0.998906999319701"
# With PR #2268, this error message should not appear
#assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err


def test_search_ani_containment_estimate_ci(runtmp):
Expand Down Expand Up @@ -6185,14 +6185,14 @@ def test_gather_ani_csv_estimate_ci(runtmp, linear_gather, prefetch_gather):
assert row['query_name'] == 'tr1 4'
assert row['query_md5'] == 'c9d5a795'
assert row['query_bp'] == '910'
assert row['query_containment_ani']== ''
assert row['query_containment_ani_low']== ''
assert row['query_containment_ani_high']== ''
assert row['match_containment_ani'] == ''
assert row['match_containment_ani_low'] == ''
assert row['match_containment_ani_high'] == ''
assert row['average_containment_ani'] == ''
assert row['max_containment_ani'] ==''
assert row['query_containment_ani'] == '1.0'
assert row['query_containment_ani_low'] == '1.0'
assert row['query_containment_ani_high'] == '1.0'
assert row['match_containment_ani'] == '1.0'
assert row['match_containment_ani_low'] == '1.0'
assert row['match_containment_ani_high'] == '1.0'
assert row['average_containment_ani'] == '1.0'
assert row['max_containment_ani'] == '1.0'
assert row['potential_false_negative'] == 'False'


Expand Down

0 comments on commit a8bd648

Please sign in to comment.