From f728db1c62b7ca4b3621c189d69573a468fda4a9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 13:06:49 -0800 Subject: [PATCH 01/38] init dist utils --- src/sourmash/distance_utils.py | 133 ++++++++++++++++++++++ tests/test_distance_utils.py | 194 +++++++++++++++++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 src/sourmash/distance_utils.py create mode 100644 tests/test_distance_utils.py diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py new file mode 100644 index 0000000000..a22bf9def2 --- /dev/null +++ b/src/sourmash/distance_utils.py @@ -0,0 +1,133 @@ +""" +Utility functions for jaccard/containment --> distance estimates +From https://github.com/KoslickiLab/mutation-rate-ci-calculator +https://doi.org/10.1101/2022.01.11.475870 +""" +from scipy.optimize import brentq #, fsolve, newton +from scipy.stats import norm as scipy_norm +from scipy.special import hyp2f1 +from numpy import sqrt + +#FROM mrcc.kmer_mutation_formulas_thm5 +def r1_to_q(k,r1): + r1 = float(r1) + q = 1-(1-r1)**k + return float(q) + + +def var_n_mutated(L,k,r1,q=None): + # there are computational issues in the variance formula that we solve here + # by the use of higher-precision arithmetic; the problem occurs when r is + # very small; for example, with L=10,k=2,r1=1e-6 standard precision + # gives varN<0 which is nonsense; by using the mpf type, we get the correct + # answer which is about 0.000038. + if (r1 == 0): return 0.0 + r1 = float(r1) + if (q == None): # we assume that if q is provided, it is correct for r1 + q = r1_to_q(k,r1) + varN = L*(1-q)*(q*(2*k+(2/r1)-1)-2*k) \ + + k*(k-1)*(1-q)**2 \ + + (2*(1-q)/(r1**2))*((1+(k-1)*(1-q))*r1-q) + assert (varN>=0.0) + return float(varN) + + +def exp_n_mutated(L,k,r1): + q = r1_to_q(k,r1) + return L*q + + +# FROM mrcc.third_moment_calculator +def exp_n_mutated_squared(L, k, p): + return var_n_mutated(L, k, p) + exp_n_mutated(L, k, p) ** 2 + + +def third_moment_nmut_exact(L,k,p): + t1 = (L * (-2 + 3*L) * p**2 + 3 * (1 - p)**(2*k) * (2 + (-1 + k - L) * p * (2 + k * p - L * p)) - (1 - p)**k * (6 + p * (-6 + L * (-6 + p + 6 * L * p))))/(p**2) + t2 = (-2 + 2 * k - L) * (-1 + 2 * k - L) * (2 * k - L) * (-1 + (1 - p)**k)**3 + t3 = (1/(p**3))*(-6 * (-1 + k)**2 * (k - L) * p**3 + 6 * (1 - p)**(3 * k) * (2 + (-2 + 2 * k - L) * p) + (1 - p)**(2 * k) * (-12 + 6 * (2 * k + L) * p + 6 * (4 * k**2 + 2 * (1 + L) - 3 * k * (2 + L)) * p**2 - (-1 + k) * k * (-2 + 4 * k - 3 * L) * p**3) + 6 * (-1 + k) * (1 - p)**k * p * (-2 + p * (2 - k + 2 * L + (k * (-2 + 3 * k - 3 * L) + L) * p))) + t4 = 6 * (-1 + (1 - p)**k) * ((k + k**2 - 2 * k * L + (-1 + L) * L) * (-1 + 2 * (1 - p)**k) * hyp2f1(1, 2 + k - L, k - L, 1) + (k + k**2 - 2 * k * L + (-1 + L) * L) * (1 - p)**k * (-1 + p) * hyp2f1(1, 2 + k - L, k - L, 1 - p) - (-2 * k + 4 * k**2 + L - 4 * k * L + L**2) * ((-1 + 2 * (1 - p)**k) * hyp2f1(1, 1 + 2 * k - L, -1 + 2 * k - L, 1)- (-1 + p)**(2 * k) * hyp2f1(1, 1 + 2 * k - L, -1 + 2 * k - L, 1 - p))) + return t1+t2+t3+t4 + + +def exp_n_mutated_cubed(L, k, p): + return third_moment_nmut_exact(L, k, p) + + +#from mrcc.p_from_scaled_containment +def probit(p): + return scipy_norm.ppf(p) + + +def containment_to_distance(containment, n_unique_kmers, ksize, scaled, confidence=0.95): + """ + Containment --> distance CI (one step) + """ + if containment <= 0.0001: # changed from 0.0, to mirror jaccard_to_distance_CI_one_step + sol2 = sol1 = distance_point_estimate = 1.0 + elif containment >= 0.9999: # changed from 1.0, to mirror 
jaccard_to_distance_CI_one_step + sol1 = sol2 = distance_point_estimate = 0.0 + else: + alpha = 1 - confidence + z_alpha = probit(1-alpha/2) + f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 + + bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers + + term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2) + term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) + term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2) + + var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) + + f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment + f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment + + sol1 = brentq(f1, 0.0000001, 0.9999999) + sol2 = brentq(f2, 0.0000001, 0.9999999) + + distance_point_estimate = 1.0-containment**(1.0/ksize) + #distance_using_CImidpoint = (sol2+sol1)/2.0 + return distance_point_estimate,sol2,sol1 + + +#from from mrcc.p_from_scaled_jaccard +def variance_scaled_jaccard(L, p, k, s): + exp_n_mut = exp_n_mutated(L, k, p) + exp_n_mut_squared = exp_n_mutated_squared(L, k, p) + exp_n_mut_cubed = exp_n_mutated_cubed(L, k, p) + bias_factor = 1 - (1 - s) ** ( int(L + exp_n_mut) ) + + factor1 = (1-s)/(s * bias_factor**2) + factor2 = (2 * L * exp_n_mut - 2 * exp_n_mut_squared) / (L ** 3 + 3*L*exp_n_mut_squared + 3*L*L*exp_n_mut + exp_n_mut_cubed) + term1 = factor1 * factor2 + term2 = (L**2 - 2 * L * exp_n_mut + exp_n_mut_squared) / (L**2 + 2 * L * exp_n_mut + exp_n_mut_squared) + term3 = ((L - exp_n_mut) / (L + exp_n_mut))**2 + + return term1 + term2 - term3 + + +def jaccard_to_distance(jaccard, n_unique_kmers, ksize, scaled,confidence=0.95): + """ + Scaled Jaccard to distance estimate and CI (one step) + """ + if jaccard <= 0.0001: + sol2 = sol1 = distance_point_estimate = 1.0 + elif jaccard >= 0.9999: + sol1 = sol2 = distance_point_estimate = 0.0 + else: + alpha = 1 - confidence + z_alpha = probit(1-alpha/2) + f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 + + var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled) + + f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard + f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard + + sol1 = brentq(f1, 0.0001, 0.9999) + sol2 = brentq(f2, 0.0001, 0.9999) + + distance_point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) + + return distance_point_estimate,sol2,sol1 \ No newline at end of file diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py new file mode 100644 index 0000000000..27cb153ec6 --- /dev/null +++ b/tests/test_distance_utils.py @@ -0,0 +1,194 @@ +""" +Tests for distance utils. 
+""" +from sourmash.distance_utils import containment_to_distance, jaccard_to_distance + + +def test_containment_to_distance_zero(): + contain = 0 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + # check results + exp_dist, exp_low,exp_high = 1.0,1.0,1.0 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_containment_to_distance_one(): + contain = 1 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + # check results + exp_dist, exp_low,exp_high = 0.0,0.0,0.0 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_containment_to_distance_scaled1(): + contain = 0.5 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.032468221476108394,0.028709912966405623,0.03647860197289783 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_containment_to_distance_2(): + contain = 0.1 + scaled = 100 + nkmers = 10000 + ksize=31 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.07158545548052564,0.05320779238601372,0.09055547672455365 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_containment_to_distance_scaled100(): + contain = 0.5 + scaled = 100 + nkmers = 10000 + ksize=21 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.032468221476108394,0.023712063916639017,0.04309960543965866 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_containment_to_distance_k10(): + jaccard = 0.5 + scaled = 100 + nkmers = 10000 + ksize=10 + dist,low,high = containment_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.06696700846319259,0.04982777541057476,0.08745108232411622 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_zero(): + jaccard = 0 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 1.0,1.0,1.0 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_one(): + jaccard = 1 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.0,0.0,0.0 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_scaled1(): + jaccard = 0.5 + scaled = 1 + nkmers = 10000 + ksize=21 + dist,low,high = 
jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.019122659390482077,0.017549816764592427,0.02085752009333101 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_scaled100(): + jaccard = 0.5 + scaled = 100 + nkmers = 10000 + ksize=21 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.019122659390482077,0.014156518319318126,0.025095471100086125 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_k31(): + jaccard = 0.5 + scaled = 100 + nkmers = 10000 + ksize=31 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.012994354410710174,0.009559840649526866,0.017172145712325716 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + +def test_jaccard_to_distance_k31_2(): + jaccard = 0.1 + scaled = 100 + nkmers = 10000 + ksize=31 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.0535071608231702,0.040739632227821516,0.06673746391115623 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high \ No newline at end of file From 70d2ae2941e23d90e6b8dc058cd0c55f58869400 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 13:22:25 -0800 Subject: [PATCH 02/38] add confidence tests --- tests/test_distance_utils.py | 54 ++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index 27cb153ec6..d50fdad146 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -97,6 +97,33 @@ def test_containment_to_distance_k10(): assert low == exp_low assert high == exp_high +def test_containment_to_distance_confidence(): + contain = 0.1 + scaled = 100 + nkmers = 10000 + ksize=31 + confidence=0.99 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled,confidence) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.07158545548052564,0.04802880300938562,0.09619930040790341 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + confidence=0.90 + dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled,confidence) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.07158545548052564,0.05599435479247415,0.08758718871990222 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + def test_jaccard_to_distance_zero(): jaccard = 0 @@ -191,4 +218,31 @@ def test_jaccard_to_distance_k31_2(): exp_dist, exp_low,exp_high = 0.0535071608231702,0.040739632227821516,0.06673746391115623 assert dist == exp_dist assert low == exp_low + assert high == exp_high + +def test_jaccard_to_distance_confidence(): + jaccard = 0.1 + scaled = 100 + nkmers = 10000 + ksize=31 + confidence=0.99 + dist,low,high = 
jaccard_to_distance(jaccard,nkmers,ksize,scaled,confidence) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.0535071608231702,0.03702518586582857,0.07080999238232429 + assert dist == exp_dist + assert low == exp_low + assert high == exp_high + + confidence=0.90 + dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,confidence) + print("\nDIST:", dist) + print("CI:", low, " - ", high) + print(f"{dist},{low},{high}") + # check results + exp_dist, exp_low,exp_high = 0.0535071608231702,0.042708361726101415,0.0646280650023921 + assert dist == exp_dist + assert low == exp_low assert high == exp_high \ No newline at end of file From 6c061a059bc4d7576c613134abf4491cab7700f7 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 14:06:14 -0800 Subject: [PATCH 03/38] optionally return sim/ident instead --- src/sourmash/distance_utils.py | 44 ++++++++++++++++++------- tests/test_distance_utils.py | 59 +++++++++++++++++++++++++++++++++- 2 files changed, 90 insertions(+), 13 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index a22bf9def2..e2aefc469e 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -3,7 +3,7 @@ From https://github.com/KoslickiLab/mutation-rate-ci-calculator https://doi.org/10.1101/2022.01.11.475870 """ -from scipy.optimize import brentq #, fsolve, newton +from scipy.optimize import brentq from scipy.stats import norm as scipy_norm from scipy.special import hyp2f1 from numpy import sqrt @@ -14,7 +14,7 @@ def r1_to_q(k,r1): q = 1-(1-r1)**k return float(q) - + def var_n_mutated(L,k,r1,q=None): # there are computational issues in the variance formula that we solve here # by the use of higher-precision arithmetic; the problem occurs when r is @@ -59,14 +59,14 @@ def probit(p): return scipy_norm.ppf(p) -def containment_to_distance(containment, n_unique_kmers, ksize, scaled, confidence=0.95): +def containment_to_distance(containment, n_unique_kmers, ksize, scaled, confidence=0.95, return_identity=False): """ Containment --> distance CI (one step) """ if containment <= 0.0001: # changed from 0.0, to mirror jaccard_to_distance_CI_one_step - sol2 = sol1 = distance_point_estimate = 1.0 + sol2 = sol1 = point_estimate = 1.0 elif containment >= 0.9999: # changed from 1.0, to mirror jaccard_to_distance_CI_one_step - sol1 = sol2 = distance_point_estimate = 0.0 + sol1 = sol2 = point_estimate = 0.0 else: alpha = 1 - confidence z_alpha = probit(1-alpha/2) @@ -86,9 +86,11 @@ def containment_to_distance(containment, n_unique_kmers, ksize, scaled, confiden sol1 = brentq(f1, 0.0000001, 0.9999999) sol2 = brentq(f2, 0.0000001, 0.9999999) - distance_point_estimate = 1.0-containment**(1.0/ksize) + point_estimate = 1.0-containment**(1.0/ksize) + if return_identity: + point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) #distance_using_CImidpoint = (sol2+sol1)/2.0 - return distance_point_estimate,sol2,sol1 + return point_estimate,sol2,sol1 #from from mrcc.p_from_scaled_jaccard @@ -107,14 +109,14 @@ def variance_scaled_jaccard(L, p, k, s): return term1 + term2 - term3 -def jaccard_to_distance(jaccard, n_unique_kmers, ksize, scaled,confidence=0.95): +def jaccard_to_distance(jaccard, n_unique_kmers, ksize, scaled, confidence=0.95, return_identity=False): """ Scaled Jaccard to distance estimate and CI (one step) """ if jaccard <= 0.0001: - sol2 = sol1 = distance_point_estimate = 1.0 + sol2 = sol1 = 
point_estimate = 1.0 elif jaccard >= 0.9999: - sol1 = sol2 = distance_point_estimate = 0.0 + sol1 = sol2 = point_estimate = 0.0 else: alpha = 1 - confidence z_alpha = probit(1-alpha/2) @@ -128,6 +130,24 @@ def jaccard_to_distance(jaccard, n_unique_kmers, ksize, scaled,confidence=0.95): sol1 = brentq(f1, 0.0001, 0.9999) sol2 = brentq(f2, 0.0001, 0.9999) - distance_point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) + point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) + if return_identity: + point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) + + return point_estimate,sol2,sol1 + - return distance_point_estimate,sol2,sol1 \ No newline at end of file +def distance_to_identity(dist,d_low=None,d_high=None): + """ + ANI = 1-distance + """ + for d in [dist,d_low,d_high]: + if not 0 <= d <= 1: + raise ValueError(f"Error: distance value {d} is not between 0 and 1!") + id = 1-dist + id_low,id_high=None,None + if d_low is not None: # need to be explicit so will work on 0 value + id_high = 1-d_low + if d_high is not None: # need to be explicit so will work on 0 value + id_low = 1-d_high + return id,id_low,id_high \ No newline at end of file diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index d50fdad146..10d2eac225 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -1,7 +1,24 @@ """ Tests for distance utils. """ -from sourmash.distance_utils import containment_to_distance, jaccard_to_distance + +import pytest +from sourmash.distance_utils import containment_to_distance, jaccard_to_distance,distance_to_identity + +def test_distance_to_identity(): + id,id_low,id_high = distance_to_identity(0.5,0.4,0.6) + assert id == 0.5 + assert id_low == 0.4 + assert id_high ==0.6 + + +def test_distance_to_identity_fail(): + with pytest.raises(Exception) as exc: + id,id_low,id_high = distance_to_identity(1.1,0.4,0.6) + assert "distance value 1.1 is not between 0 and 1!" in str(exc.value) + with pytest.raises(Exception) as exc: + id,id_low,id_high = distance_to_identity(-0.1,0.4,0.6) + assert "distance value -0.1 is not between 0 and 1!" 
in str(exc.value) def test_containment_to_distance_zero(): @@ -17,6 +34,12 @@ def test_containment_to_distance_zero(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0 + ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_containment_to_distance_one(): @@ -32,6 +55,12 @@ def test_containment_to_distance_one(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0 + ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_containment_to_distance_scaled1(): @@ -48,6 +77,13 @@ def test_containment_to_distance_scaled1(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 0.9675317785238916,0.9635213980271021,0.9712900870335944 + ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + print(f"{ident},{low},{high}") + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_containment_to_distance_2(): @@ -139,6 +175,13 @@ def test_jaccard_to_distance_zero(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0 + ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + print(f"{ident},{low},{high}") + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_jaccard_to_distance_one(): @@ -155,6 +198,13 @@ def test_jaccard_to_distance_one(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0 + ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + print(f"{ident},{low},{high}") + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_jaccard_to_distance_scaled1(): @@ -171,6 +221,13 @@ def test_jaccard_to_distance_scaled1(): assert dist == exp_dist assert low == exp_low assert high == exp_high + # return identity instead + exp_id, exp_idlow,exp_idhigh = 0.9808773406095179,0.979142479906669,0.9824501832354076 + ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + print(f"{ident},{low},{high}") + assert ident == exp_id + assert low == exp_idlow + assert high == exp_idhigh def test_jaccard_to_distance_scaled100(): From 6da758e2394f788789186649d47f66bdcff9c638 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Thu, 13 Jan 2022 15:24:47 -0800 Subject: [PATCH 04/38] add ani estimation to signature --- src/sourmash/distance_utils.py | 9 ++++++--- src/sourmash/signature.py | 30 ++++++++++++++++++++++++++++++ tests/test_signature.py | 30 ++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 3 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index e2aefc469e..7193bf934c 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -8,6 +8,8 @@ from scipy.special import hyp2f1 from numpy import sqrt +from .logging import notify + #FROM mrcc.kmer_mutation_formulas_thm5 def r1_to_q(k,r1): r1 = float(r1) @@ -24,11 +26,12 @@ def var_n_mutated(L,k,r1,q=None): if (r1 == 0): return 0.0 r1 = float(r1) if (q == None): # we assume that if q is provided, it is correct for r1 - q = r1_to_q(k,r1) + q = r1_to_q(k,r1) varN = L*(1-q)*(q*(2*k+(2/r1)-1)-2*k) \ + k*(k-1)*(1-q)**2 \ - + (2*(1-q)/(r1**2))*((1+(k-1)*(1-q))*r1-q) - assert (varN>=0.0) + + (2*(1-q)/(r1**2))*((1+(k-1)*(1-q))*r1-q) + if (varN<0.0): + raise ValueError('Error: varN <0.0!') return float(varN) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 386fdb1733..e1f95c0f0c 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -10,6 +10,7 @@ from .logging import error from . import MinHash from .minhash import to_bytes, FrozenMinHash +from .distance_utils import jaccard_to_distance,containment_to_distance from ._lowlevel import ffi, lib from .utils import RustObject, rustcall, decode_str @@ -142,14 +143,43 @@ def jaccard(self, other): return self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=False) + def jaccard_ani(self, other): + "Compute Jaccard similarity with the other MinHash signature." + jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, + downsample=False) + avg_scaled_kmers = (len(self.minhash.hashes) + len(other.minhash.hashes))/2 + avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, + self.minhash.ksize, self.minhash.scaled, + return_identity=True) + return j_ani + def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." return self.minhash.contained_by(other.minhash, downsample) + def containment_ani(self, other, downsample=False): + "Estimate ANI from containment with the other MinHash signature." + containment = self.minhash.contained_by(other.minhash, downsample) + n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate + c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, + self.minhash.ksize, self.minhash.scaled, + return_identity=True) + return c_ani + def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample) + def max_containment_ani(self, other, downsample=False): + "Estimate ANI from max containment w/other signature. Note: ignores abundance." 
+ max_containment = self.minhash.max_containment(other.minhash, downsample) + n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate + c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, + self.minhash.ksize, self.minhash.scaled, + return_identity=True) + return c_ani + def add_sequence(self, sequence, force=False): self._methodcall(lib.signature_add_sequence, to_bytes(sequence), force) diff --git a/tests/test_signature.py b/tests/test_signature.py index 92467b5c5e..487ef06058 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -426,3 +426,33 @@ def test_max_containment_equal(): assert ss2.contained_by(ss1) == 1 assert ss1.max_containment(ss2) == 1 assert ss2.max_containment(ss1) == 1 + + +def test_containment_ANI(): + ## todo: use sigs with known ANI vals + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + ss1 = load_one_signature(f1, ksize=31) + ss2 = load_one_signature(f2) + + print("\nss1 contained by ss2", ss1.containment_ani(ss2)) + print("ss2 contained by ss1",ss2.containment_ani(ss1)) + print("ss1 max containment", ss1.max_containment_ani(ss2)) + print("ss2 max containment", ss2.max_containment_ani(ss1)) + + assert ss1.containment_ani(ss2) == 1.0 + assert ss2.containment_ani(ss1) == 0.9658183324254062 + assert ss1.max_containment_ani(ss2) == 1.0 + assert ss2.max_containment_ani(ss1) == 1.0 + + +def test_jaccard_ANI(): + ## todo: use sigs with known ANI vals + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + ss1 = load_one_signature(f1, ksize=31) + ss2 = load_one_signature(f2) + + print("\nJACCARD_ANI", ss1.jaccard_ani(ss2)) + + assert ss1.jaccard_ani(ss2) == 0.9783711630110239 From bd6485ddc5bef7b691b26ea6490665d238355ff5 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 15:38:30 -0800 Subject: [PATCH 05/38] return confidence interval in addition to point estimate --- src/sourmash/signature.py | 8 ++++---- tests/test_signature.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index e1f95c0f0c..f457e36b5b 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -10,7 +10,7 @@ from .logging import error from . import MinHash from .minhash import to_bytes, FrozenMinHash -from .distance_utils import jaccard_to_distance,containment_to_distance +from .distance_utils import jaccard_to_distance, containment_to_distance from ._lowlevel import ffi, lib from .utils import RustObject, rustcall, decode_str @@ -152,7 +152,7 @@ def jaccard_ani(self, other): j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, self.minhash.ksize, self.minhash.scaled, return_identity=True) - return j_ani + return j_ani, ani_low, ani_high def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." @@ -165,7 +165,7 @@ def containment_ani(self, other, downsample=False): c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, self.minhash.ksize, self.minhash.scaled, return_identity=True) - return c_ani + return c_ani, ani_low, ani_high def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." 
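
(An aside on the API shape at this point in the series: with this patch, jaccard_ani,
containment_ani, and max_containment_ani each return a (point_estimate, ci_low, ci_high)
tuple instead of a bare float. A minimal consumer sketch -- not part of the patch; the
.sig paths follow the sourmash test data used in tests/test_signature.py:

    from sourmash import load_one_signature

    # two scaled signatures, as in tests/test_signature.py
    ss1 = load_one_signature('tests/test-data/2.fa.sig', ksize=31)
    ss2 = load_one_signature('tests/test-data/2+63.fa.sig')

    # each *_ani method now returns (point_estimate, ci_low, ci_high),
    # with the CI width set by the default confidence=0.95
    c_ani, ani_low, ani_high = ss2.containment_ani(ss1)
    print(f"containment ANI: {c_ani:.4f}, 95% CI [{ani_low:.4f}, {ani_high:.4f}]")
)
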
@@ -178,7 +178,7 @@ def max_containment_ani(self, other, downsample=False): c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, self.minhash.ksize, self.minhash.scaled, return_identity=True) - return c_ani + return c_ani, ani_low, ani_high def add_sequence(self, sequence, force=False): self._methodcall(lib.signature_add_sequence, to_bytes(sequence), force) diff --git a/tests/test_signature.py b/tests/test_signature.py index 487ef06058..ab852adc72 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -440,10 +440,10 @@ def test_containment_ANI(): print("ss1 max containment", ss1.max_containment_ani(ss2)) print("ss2 max containment", ss2.max_containment_ani(ss1)) - assert ss1.containment_ani(ss2) == 1.0 - assert ss2.containment_ani(ss1) == 0.9658183324254062 - assert ss1.max_containment_ani(ss2) == 1.0 - assert ss2.max_containment_ani(ss1) == 1.0 + assert ss1.containment_ani(ss2) == (1.0, 1.0, 1.0) + assert ss2.containment_ani(ss1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) + assert ss1.max_containment_ani(ss2) == (1.0, 1.0, 1.0) + assert ss2.max_containment_ani(ss1) == (1.0, 1.0, 1.0) def test_jaccard_ANI(): @@ -455,4 +455,4 @@ def test_jaccard_ANI(): print("\nJACCARD_ANI", ss1.jaccard_ani(ss2)) - assert ss1.jaccard_ani(ss2) == 0.9783711630110239 + assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) From 217784357f8026debb9e7e962bc162b6c779e662 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 16:44:56 -0800 Subject: [PATCH 06/38] return ANI from compare --- src/sourmash/cli/compare.py | 4 ++++ src/sourmash/commands.py | 19 +++++++++++---- src/sourmash/compare.py | 46 +++++++++++++++++++++++++------------ src/sourmash/signature.py | 12 ++++++++-- tests/test_compare.py | 40 ++++++++++++++++++++++++++++++++ tests/test_signature.py | 2 -- 6 files changed, 99 insertions(+), 24 deletions(-) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index 839b2e9f37..b8df7a226f 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -58,6 +58,10 @@ def subparser(subparsers): '--max-containment', action='store_true', help='calculate max containment instead of similarity' ) + subparser.add_argument( + '--estimate-ani', action='store_true', + help='return ANI estimated from jaccard, containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870' + ) subparser.add_argument( '--from-file', help='a text file containing a list of files to load signatures from' diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 96760fb6d0..c3275cedf2 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -108,11 +108,20 @@ def compare(args): error('must use scaled signatures with --containment and --max-containment') sys.exit(-1) + # complain if --ani and not is_scaled + return_ani = False + if args.estimate_ani: + return_ani = True + + if return_ani and not is_scaled: + error('must use scaled signatures with --estimate-ani') + sys.exit(-1) + # notify about implicit --ignore-abundance: - if is_containment: + if is_containment or return_ani: track_abundances = any(( s.minhash.track_abundance for s in siglist )) if track_abundances: - notify('NOTE: --containment and --max-containment ignore signature abundances.') + notify('NOTE: --containment, --max-containment, and --estimate-ani ignore signature abundances.') # if using --scaled, downsample appropriately printed_scaled_msg = False @@ -138,12 +147,12 @@ def 
compare(args): labeltext = [str(item) for item in siglist] if args.containment: - similarity = compare_serial_containment(siglist) + similarity = compare_serial_containment(siglist, return_ANI=return_ani) elif args.max_containment: - similarity = compare_serial_max_containment(siglist) + similarity = compare_serial_max_containment(siglist, return_ANI=return_ani) else: similarity = compare_all_pairs(siglist, args.ignore_abundance, - n_jobs=args.processes) + n_jobs=args.processes, return_ANI=return_ani) if len(siglist) < 30: for i, E in enumerate(siglist): diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index fba5bbcb7b..05243210f1 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -9,7 +9,7 @@ from sourmash.np_utils import to_memmap -def compare_serial(siglist, ignore_abundance, downsample=False): +def compare_serial(siglist, ignore_abundance, downsample=False, return_ANI=False): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations serially on a single process. Best to use when there is few signatures. @@ -34,12 +34,15 @@ def compare_serial(siglist, ignore_abundance, downsample=False): similarities = np.ones((n, n)) for i, j in iterator: - similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance, downsample) + if return_ANI: + similarities[i][j] = similarities[j][i] = siglist[i].jaccard_ani(siglist[j], downsample)[0] + else: + similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance, downsample) return similarities -def compare_serial_containment(siglist, downsample=False): +def compare_serial_containment(siglist, downsample=False, return_ANI=False): """Compare all combinations of signatures and return a matrix of containments. Processes combinations serially on a single process. Best to only use when there are few signatures. @@ -55,13 +58,17 @@ def compare_serial_containment(siglist, downsample=False): containments = np.ones((n, n)) for i in range(n): for j in range(n): - containments[i][j] = siglist[j].contained_by(siglist[i], + if return_ANI: + containments[i][j] = siglist[j].containment_ani(siglist[i], + downsample=downsample)[0] + else: + containments[i][j] = siglist[j].contained_by(siglist[i], downsample=downsample) return containments -def compare_serial_max_containment(siglist, downsample=False): +def compare_serial_max_containment(siglist, downsample=False, return_ANI=False): """Compare all combinations of signatures and return a matrix of max_containments. Processes combinations serially on a single process. Best to only use when there are few signatures. @@ -77,22 +84,30 @@ def compare_serial_max_containment(siglist, downsample=False): containments = np.ones((n, n)) for i in range(n): for j in range(n): - containments[i][j] = siglist[j].max_containment(siglist[i], + if return_ANI: + containments[i][j] = siglist[j].max_containment_ani(siglist[i], + downsample=downsample)[0] + else: + containments[i][j] = siglist[j].max_containment(siglist[i], downsample=downsample) return containments -def similarity_args_unpack(args, ignore_abundance, downsample): +def similarity_args_unpack(args, ignore_abundance, downsample, return_ANI=False): """Helper function to unpack the arguments. 
Written to use in pool.imap as it can only be given one argument.""" sig1, sig2 = args - return sig1.similarity(sig2, + if return_ANI: + return sig1.jaccard_ani(sig2, + downsample=downsample)[0] + else: + return sig1.similarity(sig2, ignore_abundance=ignore_abundance, downsample=downsample) -def get_similarities_at_index(index, ignore_abundance, downsample, siglist): +def get_similarities_at_index(index, ignore_abundance, downsample, siglist, return_ANI=False): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't redundantly calculate signatures with all the other indices prior to @@ -114,14 +129,14 @@ def get_similarities_at_index(index, ignore_abundance, downsample, siglist): sig_iterator = itertools.product([siglist[index]], siglist[index + 1:]) func = partial(similarity_args_unpack, ignore_abundance=ignore_abundance, - downsample=downsample) + downsample=downsample, return_ANI=return_ANI) similarity_list = list(map(func, sig_iterator)) notify( f"comparison for index {index} done in {time.time() - startt:.5f} seconds", end='\r') return similarity_list -def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): +def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI=False): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations parallely on number of processes given by n_jobs @@ -163,7 +178,8 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): get_similarities_at_index, siglist=siglist, ignore_abundance=ignore_abundance, - downsample=downsample) + downsample=downsample, + return_ANI=return_ANI) notify("Created similarity func") # Initialize multiprocess.pool @@ -198,7 +214,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs): return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist)) -def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None): +def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, return_ANI=False): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations either serially or based on parallely on number of processes given by n_jobs @@ -216,7 +232,7 @@ def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None): :return: np.array similarity matrix """ if n_jobs is None or n_jobs == 1: - similarities = compare_serial(siglist, ignore_abundance, downsample) + similarities = compare_serial(siglist, ignore_abundance, downsample, return_ANI) else: - similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs) + similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI) return similarities diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index f457e36b5b..d6c6b411fb 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -143,10 +143,10 @@ def jaccard(self, other): return self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=False) - def jaccard_ani(self, other): + def jaccard_ani(self, other, downsample=False): "Compute Jaccard similarity with the other MinHash signature." 
jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True,
-                                          downsample=False)
+                                          downsample=downsample)
         avg_scaled_kmers = (len(self.minhash.hashes) + len(other.minhash.hashes))/2
         avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate
         j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers,
@@ -167,6 +167,14 @@ def containment_ani(self, other, downsample=False):
                                                          return_identity=True)
         return c_ani, ani_low, ani_high
 
+    #def containment_distance(self, other, downsample=False):
+    #    "Estimate distance (1-ANI) from containment with the other MinHash signature."
+    #    containment = self.minhash.contained_by(other.minhash, downsample)
+    #    n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate
+    #    c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers,
+    #                                                     self.minhash.ksize, self.minhash.scaled)
+    #    return c_ani, ani_low, ani_high
+
     def max_containment(self, other, downsample=False):
         "Compute max containment w/other signature. Note: ignores abundance."
         return self.minhash.max_containment(other.minhash, downsample)
diff --git a/tests/test_compare.py b/tests/test_compare.py
index 5c7b6eee6b..a662e348cb 100644
--- a/tests/test_compare.py
+++ b/tests/test_compare.py
@@ -19,6 +19,16 @@ def siglist():
         sigs.extend(sourmash.load_file_as_signatures(filename))
     return sigs
 
+@pytest.fixture()
+def scaled_siglist():
+    demo_path = utils.get_test_data("scaled")
+    filenames = sorted(glob.glob(os.path.join(demo_path, "*.sig")))
+    sigs = []
+    for filename in filenames:
+        these_sigs = sourmash.load_file_as_signatures(filename)
+        scaled_sigs = [s for s in these_sigs if s.minhash.scaled != 0]
+        sigs.extend(scaled_sigs)
+    return sigs
 
 @pytest.fixture()
 def ignore_abundance(track_abundance):
@@ -59,3 +69,33 @@ def test_compare_all_pairs(siglist, ignore_abundance):
     similarities_parallel = compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=2)
     similarities_serial = compare_serial(siglist, ignore_abundance, downsample=False)
     np.testing.assert_array_equal(similarities_parallel, similarities_serial)
+
+
+def test_compare_serial_ANI(scaled_siglist, ignore_abundance):
+    similarities = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True)
+
+    true_similarities = np.array(
+        [[1., 0.942, 0.988, 0.986, 0.],
+        [0.942, 1., 0.960, 0., 0.],
+        [0.988, 0.960, 1., 0., 0.],
+        [0.986, 0., 0., 1., 0.],
+        [0., 0., 0., 0., 1.]])
+
+    np.testing.assert_array_almost_equal(similarities, true_similarities, decimal=3)
+
+def test_compare_parallel_ANI(scaled_siglist, ignore_abundance):
+    similarities = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True)
+
+    true_similarities = np.array(
+        [[1., 0.942, 0.988, 0.986, 0.],
+        [0.942, 1., 0.960, 0., 0.],
+        [0.988, 0.960, 1., 0., 0.],
+        [0.986, 0., 0., 1., 0.],
+        [0., 0., 0., 0., 1.]])
+
+    np.testing.assert_array_almost_equal(similarities, true_similarities, decimal=3)
+
+def test_compare_all_pairs_ANI(scaled_siglist, ignore_abundance):
+    similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True)
+    similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True)
+    np.testing.assert_array_equal(similarities_parallel, similarities_serial)
\ No newline at end of file
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 92467b5c5e..f21f5c9ad9 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
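
(All of the compare-matrix variants above follow one pattern: when return_ANI is set,
each cell stores element [0] -- the ANI point estimate -- of the corresponding *_ani
tuple, dropping the confidence bounds. A condensed sketch of the serial case, as an
illustration rather than code from the patch, assuming a list of loaded scaled
signatures:

    import itertools
    import numpy as np

    def ani_matrix(siglist, downsample=False):
        # mirrors compare_serial(..., return_ANI=True): a symmetric matrix of
        # jaccard_ani point estimates, with 1.0 on the diagonal
        n = len(siglist)
        mat = np.ones((n, n))
        for i, j in itertools.combinations(range(n), 2):
            # keep only the point estimate from (ani, ci_low, ci_high)
            mat[i][j] = mat[j][i] = siglist[i].jaccard_ani(siglist[j], downsample)[0]
        return mat
)
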
@@ -429,7 +429,6 @@ def test_max_containment_equal(): def test_containment_ANI(): - ## todo: use sigs with known ANI vals f1 = utils.get_test_data('2.fa.sig') f2 = utils.get_test_data('2+63.fa.sig') ss1 = load_one_signature(f1, ksize=31) @@ -447,7 +446,6 @@ def test_containment_ANI(): def test_jaccard_ANI(): - ## todo: use sigs with known ANI vals f1 = utils.get_test_data('2.fa.sig') f2 = utils.get_test_data('2+63.fa.sig') ss1 = load_one_signature(f1, ksize=31) From 4d61a381b1186eee52aaa030f26ec6977364a538 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 20:27:27 -0800 Subject: [PATCH 07/38] add containment ANI tests to test_compare --- tests/test_compare.py | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tests/test_compare.py b/tests/test_compare.py index a662e348cb..147852c808 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -6,7 +6,7 @@ import sourmash from sourmash.compare import (compare_all_pairs, compare_parallel, - compare_serial) + compare_serial, compare_serial_containment, compare_serial_max_containment) import sourmash_tst_utils as utils @@ -71,7 +71,7 @@ def test_compare_all_pairs(siglist, ignore_abundance): np.testing.assert_array_equal(similarities_parallel, similarities_serial) -def test_compare_serial_ANI(scaled_siglist, ignore_abundance): +def test_compare_serial_jaccardANI(scaled_siglist, ignore_abundance): similarities = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True) true_similarities = np.array( @@ -83,19 +83,46 @@ def test_compare_serial_ANI(scaled_siglist, ignore_abundance): np.testing.assert_array_almost_equal(similarities, true_similarities, decimal=3) -def test_compare_parallel_ANI(scaled_siglist, ignore_abundance): + +def test_compare_parallel_jaccardANI(scaled_siglist, ignore_abundance): similarities = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True) - true_similarities = np.array( + true_containment = np.array( [[1., 0.942, 0.988, 0.986, 0.], [0.942, 1., 0.960, 0., 0.], [0.988, 0.960, 1., 0., 0.], [0.986, 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]) - np.testing.assert_array_almost_equal(similarities, true_similarities, decimal=3) + np.testing.assert_array_almost_equal(similarities, true_containment, decimal=3) + -def test_compare_all_pairs_ANI(scaled_siglist, ignore_abundance): +def test_compare_all_pairs_jaccardANI(scaled_siglist, ignore_abundance): similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True) similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True) - np.testing.assert_array_equal(similarities_parallel, similarities_serial) \ No newline at end of file + np.testing.assert_array_equal(similarities_parallel, similarities_serial) + + +def test_compare_serial_containmentANI(scaled_siglist, ignore_abundance): + containment = compare_serial_containment(scaled_siglist, ignore_abundance, return_ANI=True) + + true_containment = np.array( + [[1., 1., 1., 1., 0.], + [0.92391599, 1., 0.94383993, 0., 0.], + [0.97889056, 1., 1., 0., 0.], + [0.97685474, 0., 0., 1., 0.], + [0., 0., 0., 0., 1.]]) + + np.testing.assert_array_almost_equal(containment, true_containment, decimal=3) + + # check max_containment ANI + max_containment = compare_serial_max_containment(scaled_siglist, ignore_abundance, return_ANI=True) + + true_max_containment = np.array( + [[1., 1., 1., 1., 0.], + [1., 
1., 1., 0., 0.], + [1., 1., 1., 0., 0.], + [1., 0., 0., 1., 0.], + [0., 0., 0., 0., 1.,]]) + + np.testing.assert_array_almost_equal(max_containment, true_max_containment, decimal=3) From 0e351b890a8d445cc6327cb985b73eb8fdaf892e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 13 Jan 2022 21:18:55 -0800 Subject: [PATCH 08/38] init compare ANI tests --- tests/test_sourmash.py | 158 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 60846792b1..7670a1c32e 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -473,6 +473,86 @@ def test_compare_containment(c): assert containment == mat_val, (i, j) +@utils.in_tempdir +def test_compare_containment_ani(c): + import numpy + + testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_sigs = glob.glob(testdata_glob) + + c.run_sourmash('compare', '--containment', '-k', '31', + '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + + # load the matrix output of compare --containment --estimate-ani + with open(c.output('output.csv'), 'rt') as fp: + r = iter(csv.reader(fp)) + headers = next(r) + + mat = numpy.zeros((len(headers), len(headers))) + for i, row in enumerate(r): + for j, val in enumerate(row): + mat[i][j] = float(val) + + print(mat) + + # load in all the input signatures + idx_to_sig = dict() + for idx, filename in enumerate(testdata_sigs): + ss = sourmash.load_one_signature(filename, ksize=31) + idx_to_sig[idx] = ss + + # check explicit containment against output of compare + for i in range(len(idx_to_sig)): + ss_i = idx_to_sig[i] + for j in range(len(idx_to_sig)): + ss_j = idx_to_sig[j] + containment_ani, ci_low, ci_high = ss_j.containment_ani(ss_i) + containment_ani = round(containment_ani, 3) + mat_val = round(mat[i][j], 3) + + assert containment_ani == mat_val, (i, j) + + +@utils.in_tempdir +def test_compare_jaccard_ani(c): + import numpy + + testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_sigs = glob.glob(testdata_glob) + + c.run_sourmash('compare', '-k', '31', '--estimate-ani', + '--csv', 'output.csv', *testdata_sigs) + + # load the matrix output of compare --estimate-ani + with open(c.output('output.csv'), 'rt') as fp: + r = iter(csv.reader(fp)) + headers = next(r) + + mat = numpy.zeros((len(headers), len(headers))) + for i, row in enumerate(r): + for j, val in enumerate(row): + mat[i][j] = float(val) + + print(mat) + + # load in all the input signatures + idx_to_sig = dict() + for idx, filename in enumerate(testdata_sigs): + ss = sourmash.load_one_signature(filename, ksize=31) + idx_to_sig[idx] = ss + + # check explicit containment against output of compare + for i in range(len(idx_to_sig)): + ss_i = idx_to_sig[i] + for j in range(len(idx_to_sig)): + ss_j = idx_to_sig[j] + jaccard_ani, ci_low, ci_high = ss_j.jaccard_ani(ss_i) + jaccard_ani = round(jaccard_ani, 3) + mat_val = round(mat[i][j], 3) + + assert jaccard_ani == mat_val, (i, j) + + @utils.in_tempdir def test_compare_max_containment(c): import numpy @@ -513,6 +593,46 @@ def test_compare_max_containment(c): assert containment == mat_val, (i, j) +@utils.in_tempdir +def test_compare_max_containment_ani(c): + import numpy + + testdata_glob = utils.get_test_data('scaled/*.sig') + testdata_sigs = glob.glob(testdata_glob) + + c.run_sourmash('compare', '--max-containment', '-k', '31', + '--estimate-ani', '--csv', 'output.csv', *testdata_sigs) + + # load the matrix output of compare --max-containment --estimate-ani + with 
open(c.output('output.csv'), 'rt') as fp: + r = iter(csv.reader(fp)) + headers = next(r) + + mat = numpy.zeros((len(headers), len(headers))) + for i, row in enumerate(r): + for j, val in enumerate(row): + mat[i][j] = float(val) + + print(mat) + + # load in all the input signatures + idx_to_sig = dict() + for idx, filename in enumerate(testdata_sigs): + ss = sourmash.load_one_signature(filename, ksize=31) + idx_to_sig[idx] = ss + + # check explicit containment against output of compare + for i in range(len(idx_to_sig)): + ss_i = idx_to_sig[i] + for j in range(len(idx_to_sig)): + ss_j = idx_to_sig[j] + containment_ani, ci_low, ci_high = ss_j.max_containment_ani(ss_i) + containment_ani = round(containment_ani, 3) + mat_val = round(mat[i][j], 3) + + assert containment_ani == mat_val, (i, j) + + @utils.in_tempdir def test_compare_max_containment_and_containment(c): testdata_glob = utils.get_test_data('scaled/*.sig') @@ -536,7 +656,20 @@ def test_compare_containment_abund_flatten(c): print(c.last_result.out) print(c.last_result.err) - assert 'NOTE: --containment and --max-containment ignore signature abundances.' in \ + assert 'NOTE: --containment, --max-containment, and --estimate-ani ignore signature abundances.' in \ + c.last_result.err + + +@utils.in_tempdir +def test_compare_ani_abund_flatten(c): + s47 = utils.get_test_data('track_abund/47.fa.sig') + s63 = utils.get_test_data('track_abund/63.fa.sig') + + c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63) + print(c.last_result.out) + print(c.last_result.err) + + assert 'NOTE: --containment, --max-containment, and --estimate-ani ignore signature abundances.' in \ c.last_result.err @@ -554,6 +687,29 @@ def test_compare_containment_require_scaled(c): assert c.last_result.status != 0 +@utils.in_tempdir +def test_compare_ANI_require_scaled(c): + s47 = utils.get_test_data('num/47.fa.sig') + s63 = utils.get_test_data('num/63.fa.sig') + + # containment and estimate ANI will give this error + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('compare', '--containment', '--estimate-ani', '-k', '31', s47, s63, + fail_ok=True) + assert 'must use scaled signatures with --containment and --max-containment' in \ + c.last_result.err + assert c.last_result.status != 0 + + # jaccard + estimate ANI will give this error + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('compare', '--estimate-ani', '-k', '31', s47, s63, + fail_ok=True) + + assert 'must use scaled signatures with --estimate-ani' in \ + c.last_result.err + assert c.last_result.status != 0 + + @utils.in_tempdir def test_do_plot_comparison(c): testdata1 = utils.get_test_data('short.fa') From 66a570f7e25d5f762304df12c60829d67c9b4953 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Thu, 13 Jan 2022 21:19:57 -0800 Subject: [PATCH 09/38] add option --- src/sourmash/cli/compare.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index b8df7a226f..f64bb46393 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -59,7 +59,7 @@ def subparser(subparsers): help='calculate max containment instead of similarity' ) subparser.add_argument( - '--estimate-ani', action='store_true', + '--estimate-ani', '--estimate-ANI', action='store_true', help='return ANI estimated from jaccard, containment, or max containment; see https://doi.org/10.1101/2022.01.11.475870' ) subparser.add_argument( From 530ab760c0b9c260dfdd22edb4e64deeb6002ced Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 14 Jan 2022 10:50:43 -0800 Subject: [PATCH 10/38] ani in prefetch and gather --- src/sourmash/commands.py | 49 ++++++++++++++++++++------------------- src/sourmash/search.py | 26 +++++++++++++++++---- src/sourmash/signature.py | 17 ++++++++------ 3 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index c3275cedf2..3bad6954fb 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -16,7 +16,7 @@ from .logging import notify, error, print_results, set_quiet from .sourmash_args import (FileOutput, FileOutputCSV, SaveSignaturesToLocation) -from .search import prefetch_database, PrefetchResult, calculate_prefetch_info +from .search import prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info from .index import LazyLinearIndex WATERMARK_SIZE = 10000 @@ -682,10 +682,13 @@ def gather(args): prefetch_csvout_fp = None prefetch_csvout_w = None if args.save_prefetch_csv: - fieldnames = ['intersect_bp', 'jaccard', - 'max_containment', 'f_query_match', 'f_match_query', - 'match_filename', 'match_name', 'match_md5', 'match_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] + fieldnames = PrefetchResult._fields +# fieldnames = ['intersect_bp', 'jaccard', +# 'max_containment', 'f_query_match', 'f_match_query', +# 'match_filename', 'match_name', 'match_md5', 'match_bp', +# 'query_filename', 'query_name', 'query_md5', 'query_bp', +# 'jaccard_ani', 'max_containment_ani', 'query_containment_ani', +# 'match_containment_ani'] prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open() prefetch_csvout_w = csv.DictWriter(prefetch_csvout_fp, fieldnames=fieldnames) @@ -801,12 +804,14 @@ def gather(args): # save CSV? 
if found and args.output: - fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', 'f_unique_weighted', - 'average_abund', 'median_abund', 'std_abund', 'name', - 'filename', 'md5', 'f_match_orig', 'unique_intersect_bp', - 'gather_result_rank', 'remaining_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] + fieldnames = GatherResult._fields +# fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', +# 'f_unique_to_query', 'f_unique_weighted', +# 'average_abund', 'median_abund', 'std_abund', 'name', +# 'filename', 'md5', 'f_match_orig', 'unique_intersect_bp', +# 'gather_result_rank', 'remaining_bp', +# 'query_filename', 'query_name', 'query_md5', 'query_bp', +# 'match_containment_ani'] with FileOutputCSV(args.output) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) @@ -973,14 +978,14 @@ def multigather(args): output_base = os.path.basename(query_filename) output_csv = output_base + '.csv' - - fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', - 'f_unique_to_query', 'f_unique_weighted', - 'average_abund', 'median_abund', 'std_abund', 'name', - 'filename', 'md5', 'f_match_orig', - 'unique_intersect_bp', 'gather_result_rank', - 'remaining_bp', 'query_filename', 'query_name', - 'query_md5', 'query_bp'] + fieldnames = GatherResult._fields + #fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', + # 'f_unique_to_query', 'f_unique_weighted', + # 'average_abund', 'median_abund', 'std_abund', 'name', + # 'filename', 'md5', 'f_match_orig', + # 'unique_intersect_bp', 'gather_result_rank', + # 'remaining_bp', 'query_filename', 'query_name', + # 'query_md5', 'query_bp', 'match_containment_ani'] with FileOutputCSV(output_csv) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) w.writeheader() @@ -1175,11 +1180,7 @@ def prefetch(args): csvout_fp = None csvout_w = None if args.output: - fieldnames = ['intersect_bp', 'jaccard', - 'max_containment', 'f_query_match', 'f_match_query', - 'match_filename', 'match_name', 'match_md5', 'match_bp', - 'query_filename', 'query_name', 'query_md5', 'query_bp'] - + fieldnames = PrefetchResult._fields csvout_fp = FileOutput(args.output, 'wt').open() csvout_w = csv.DictWriter(csvout_fp, fieldnames=fieldnames) csvout_w.writeheader() diff --git a/src/sourmash/search.py b/src/sourmash/search.py index e2af14818e..da2c763dac 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -5,6 +5,8 @@ from enum import Enum import numpy as np +from sourmash.distance_utils import containment_to_distance + from .signature import SourmashSignature @@ -241,7 +243,7 @@ def search_databases_with_abund_query(query, databases, **kwargs): ### GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp') + 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp, match_containment_ani') def _find_best(counters, query, threshold_bp): @@ -402,6 +404,10 @@ def __next__(self): # calculate fraction of subject match with orig query f_match_orig = found_mh.contained_by(orig_query_mh) + # calculate ani using match containment by query + match_containment_ani = containment_to_distance(f_match_orig, 
len(found_mh) * scaled, + found_mh.ksize, scaled, return_identity=True) + # calculate scores weighted by abundances f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes )) f_unique_weighted /= sum_abunds @@ -450,6 +456,7 @@ def __next__(self): query_filename=self.orig_query_filename, query_name=self.orig_query_name, query_md5=self.orig_query_md5, + match_containment_ani=match_containment_ani, ) self.result_n += 1 self.query = new_query @@ -463,7 +470,7 @@ def __next__(self): ### PrefetchResult = namedtuple('PrefetchResult', - 'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp') + 'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp, jaccard_ani, max_containment_ani, query_containment_ani, match_containment_ani') def calculate_prefetch_info(query, match, scaled, threshold): @@ -481,13 +488,20 @@ def calculate_prefetch_info(query, match, scaled, threshold): f_query_match = db_mh.contained_by(query_mh) f_match_query = query_mh.contained_by(db_mh) max_containment = max(f_query_match, f_match_query) + jaccard=db_mh.jaccard(query_mh) + + # passing in jaccard/containment avoids recalc (but it better be the right one :) + jaccard_ani = query.jaccard_ani(match, jaccard=jaccard)[0] + query_containment_ani = query.containment_ani(match, containment=f_match_query)[0] + match_containment_ani = match.containment_ani(query, containment=f_query_match)[0] + max_containment_ani = max(query_containment_ani, match_containment_ani) # build a result namedtuple result = PrefetchResult( intersect_bp=len(intersect_mh) * scaled, query_bp = len(query_mh) * scaled, match_bp = len(db_mh) * scaled, - jaccard=db_mh.jaccard(query_mh), + jaccard=jaccard, max_containment=max_containment, f_query_match=f_query_match, f_match_query=f_match_query, @@ -498,7 +512,11 @@ def calculate_prefetch_info(query, match, scaled, threshold): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + jaccard_ani=jaccard_ani, + max_containment_ani=max_containment_ani, + query_containment_ani=query_containment_ani, + match_containment_ani=match_containment_ani, ) return result diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index d6c6b411fb..26fee4ac65 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -143,10 +143,11 @@ def jaccard(self, other): return self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=False) - def jaccard_ani(self, other, downsample=False): + def jaccard_ani(self, other, downsample=False, jaccard=None): "Compute Jaccard similarity with the other MinHash signature." - jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, - downsample=downsample) + if not jaccard: + jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, + downsample=downsample) avg_scaled_kmers = (len(self.minhash.hashes) + len(other.minhash.hashes))/2 avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, @@ -158,9 +159,10 @@ def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." 
return self.minhash.contained_by(other.minhash, downsample) - def containment_ani(self, other, downsample=False): + def containment_ani(self, other, downsample=False, containment=None): "Estimate ANI from containment with the other MinHash signature." - containment = self.minhash.contained_by(other.minhash, downsample) + if not containment: + containment = self.minhash.contained_by(other.minhash, downsample) n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, self.minhash.ksize, self.minhash.scaled, @@ -179,9 +181,10 @@ def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample) - def max_containment_ani(self, other, downsample=False): + def max_containment_ani(self, other, downsample=False, max_containment=None): "Estimate ANI from max containment w/other signature. Note: ignores abundance." - max_containment = self.minhash.max_containment(other.minhash, downsample) + if not max_containment: + max_containment = self.minhash.max_containment(other.minhash, downsample) n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, self.minhash.ksize, self.minhash.scaled, From 361bb4d506ee2ee23f599e0470020d434a3ec546 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 14 Jan 2022 11:10:41 -0800 Subject: [PATCH 11/38] test ani from precalculated score --- tests/test_signature.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_signature.py b/tests/test_signature.py index f21f5c9ad9..4992ae43f9 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -444,6 +444,16 @@ def test_containment_ANI(): assert ss1.max_containment_ani(ss2) == (1.0, 1.0, 1.0) assert ss2.max_containment_ani(ss1) == (1.0, 1.0, 1.0) + # precalc containments and assert same results + s1c = ss1.contained_by(ss2) + s2c = ss2.contained_by(ss1) + mc = max(s1c, s2c) + assert ss1.containment_ani(ss2, containment=s1c) == (1.0, 1.0, 1.0) + assert ss2.containment_ani(ss1, containment=s2c) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) + assert ss1.max_containment_ani(ss2, max_containment=mc) == (1.0, 1.0, 1.0) + assert ss2.max_containment_ani(ss1, max_containment=mc) == (1.0, 1.0, 1.0) + + def test_jaccard_ANI(): f1 = utils.get_test_data('2.fa.sig') @@ -454,3 +464,9 @@ def test_jaccard_ANI(): print("\nJACCARD_ANI", ss1.jaccard_ani(ss2)) assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + + # precalc jaccard and assert same result + jaccard = ss1.jaccard(ss2) + print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard)) + + assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) From b012f5a2cfb92531fa68a92ab50daecd59dac44f Mon Sep 17 00:00:00 2001 From: Tessa Pierce Ward Date: Fri, 14 Jan 2022 11:34:02 -0800 Subject: [PATCH 12/38] return_ANI --> return_ani Co-authored-by: C. 
Titus Brown --- src/sourmash/commands.py | 6 +++--- src/sourmash/compare.py | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 3bad6954fb..911fc0796f 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -147,12 +147,12 @@ def compare(args): labeltext = [str(item) for item in siglist] if args.containment: - similarity = compare_serial_containment(siglist, return_ANI=return_ani) + similarity = compare_serial_containment(siglist, return_ani=return_ani) elif args.max_containment: - similarity = compare_serial_max_containment(siglist, return_ANI=return_ani) + similarity = compare_serial_max_containment(siglist, return_ani=return_ani) else: similarity = compare_all_pairs(siglist, args.ignore_abundance, - n_jobs=args.processes, return_ANI=return_ani) + n_jobs=args.processes, return_ani=return_ani) if len(siglist) < 30: for i, E in enumerate(siglist): diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 05243210f1..071bddce66 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -9,7 +9,7 @@ from sourmash.np_utils import to_memmap -def compare_serial(siglist, ignore_abundance, downsample=False, return_ANI=False): +def compare_serial(siglist, ignore_abundance, downsample=False, return_ani=False): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations serially on a single process. Best to use when there are few signatures. @@ -94,11 +94,11 @@ def compare_serial_max_containment(siglist, downsample=False, return_ANI=False): return containments -def similarity_args_unpack(args, ignore_abundance, downsample, return_ANI=False): +def similarity_args_unpack(args, ignore_abundance, downsample, return_ani=False): """Helper function to unpack the arguments. Written to use in pool.imap as it can only be given one argument.""" sig1, sig2 = args - if return_ANI: + if return_ani: return sig1.jaccard_ani(sig2, downsample=downsample)[0] else: @@ -107,7 +107,7 @@ def similarity_args_unpack(args, ignore_abundance, downsample, return_ANI=False) downsample=downsample) -def get_similarities_at_index(index, ignore_abundance, downsample, siglist, return_ANI=False): +def get_similarities_at_index(index, ignore_abundance, downsample, siglist, return_ani=False): """Returns similarities of all the combinations of signature at index in the siglist with the rest of the indices starting at index + 1. Doesn't redundantly calculate signatures with all the other indices prior to @@ -129,14 +129,14 @@ def get_similarities_at_index(index, ignore_abundance, downsample, siglist, retu sig_iterator = itertools.product([siglist[index]], siglist[index + 1:]) func = partial(similarity_args_unpack, ignore_abundance=ignore_abundance, - downsample=downsample, return_ANI=return_ANI) + downsample=downsample, return_ani=return_ani) similarity_list = list(map(func, sig_iterator)) notify( f"comparison for index {index} done in {time.time() - startt:.5f} seconds", end='\r') return similarity_list -def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI=False): +def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ani=False): """Compare all combinations of signatures and return a matrix of similarities.
Processes combinations in parallel, using the number of processes given by n_jobs @@ -179,7 +179,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI=F siglist=siglist, ignore_abundance=ignore_abundance, downsample=downsample, - return_ANI=return_ANI) + return_ani=return_ani) notify("Created similarity func") # Initialize multiprocess.pool @@ -214,7 +214,7 @@ def compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI=F return np.memmap(filename, dtype=np.float64, shape=(length_siglist, length_siglist)) -def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, return_ANI=False): +def compare_all_pairs(siglist, ignore_abundance, downsample=False, n_jobs=None, return_ani=False): """Compare all combinations of signatures and return a matrix of similarities. Processes combinations either serially or in parallel, based on the number of processes given by n_jobs :param list siglist: list of signatures to compare :param boolean ignore_abundance: If the sketches are not abundance weighted, or ignore_abundance=True, compute Jaccard similarity. If the sketches are abundance weighted, calculate the angular similarity. :param boolean downsample: downsample by scaled if True :param int n_jobs: number of processes to run the similarity calculations on :return: np.array similarity matrix """ if n_jobs is None or n_jobs == 1: - similarities = compare_serial(siglist, ignore_abundance, downsample, return_ANI) + similarities = compare_serial(siglist, ignore_abundance, downsample, return_ani) else: - similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ANI) + similarities = compare_parallel(siglist, ignore_abundance, downsample, n_jobs, return_ani) return similarities From 48b5ccf4c9ab078f963fa6ddf554b882c96eedf0 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 14 Jan 2022 12:12:46 -0800 Subject: [PATCH 13/38] remaining return_ANI changes --- src/sourmash/commands.py | 22 ---------------------- src/sourmash/compare.py | 10 +++++----- tests/test_compare.py | 12 ++++++------ 3 files changed, 11 insertions(+), 33 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 911fc0796f..0f0539b666 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -683,13 +683,6 @@ def gather(args): prefetch_csvout_w = None if args.save_prefetch_csv: fieldnames = PrefetchResult._fields -# fieldnames = ['intersect_bp', 'jaccard', -# 'max_containment', 'f_query_match', 'f_match_query', -# 'match_filename', 'match_name', 'match_md5', 'match_bp', -# 'query_filename', 'query_name', 'query_md5', 'query_bp', -# 'jaccard_ani', 'max_containment_ani', 'query_containment_ani', -# 'match_containment_ani'] - prefetch_csvout_fp = FileOutput(args.save_prefetch_csv, 'wt').open() prefetch_csvout_w = csv.DictWriter(prefetch_csvout_fp, fieldnames=fieldnames) prefetch_csvout_w.writeheader() @@ -805,14 +798,6 @@ def gather(args): # save CSV?
if found and args.output: fieldnames = GatherResult._fields -# fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', -# 'f_unique_to_query', 'f_unique_weighted', -# 'average_abund', 'median_abund', 'std_abund', 'name', -# 'filename', 'md5', 'f_match_orig', 'unique_intersect_bp', -# 'gather_result_rank', 'remaining_bp', -# 'query_filename', 'query_name', 'query_md5', 'query_bp', -# 'match_containment_ani'] - with FileOutputCSV(args.output) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) w.writeheader() @@ -979,13 +964,6 @@ def multigather(args): output_base = os.path.basename(query_filename) output_csv = output_base + '.csv' fieldnames = GatherResult._fields - #fieldnames = ['intersect_bp', 'f_orig_query', 'f_match', - # 'f_unique_to_query', 'f_unique_weighted', - # 'average_abund', 'median_abund', 'std_abund', 'name', - # 'filename', 'md5', 'f_match_orig', - # 'unique_intersect_bp', 'gather_result_rank', - # 'remaining_bp', 'query_filename', 'query_name', - # 'query_md5', 'query_bp', 'match_containment_ani'] with FileOutputCSV(output_csv) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) w.writeheader() diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 071bddce66..a18ba38f06 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -34,7 +34,7 @@ def compare_serial(siglist, ignore_abundance, downsample=False, return_ani=False similarities = np.ones((n, n)) for i, j in iterator: - if return_ANI: + if return_ani: similarities[i][j] = similarities[j][i] = siglist[i].jaccard_ani(siglist[j], downsample)[0] else: similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance, downsample) @@ -42,7 +42,7 @@ def compare_serial(siglist, ignore_abundance, downsample=False, return_ani=False return similarities -def compare_serial_containment(siglist, downsample=False, return_ANI=False): +def compare_serial_containment(siglist, downsample=False, return_ani=False): """Compare all combinations of signatures and return a matrix of containments. Processes combinations serially on a single process. Best to only use when there are few signatures. @@ -58,7 +58,7 @@ def compare_serial_containment(siglist, downsample=False, return_ANI=False): containments = np.ones((n, n)) for i in range(n): for j in range(n): - if return_ANI: + if return_ani: containments[i][j] = siglist[j].containment_ani(siglist[i], downsample=downsample)[0] else: @@ -68,7 +68,7 @@ def compare_serial_containment(siglist, downsample=False, return_ANI=False): return containments -def compare_serial_max_containment(siglist, downsample=False, return_ANI=False): +def compare_serial_max_containment(siglist, downsample=False, return_ani=False): """Compare all combinations of signatures and return a matrix of max_containments. Processes combinations serially on a single process. Best to only use when there are few signatures. 
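A minimal usage sketch (not part of the patch series) of the renamed keyword, assuming two hypothetical signature files sketched at ksize 31:

    import sourmash
    from sourmash.compare import compare_serial, compare_serial_containment

    sigs = [sourmash.load_one_signature(f, ksize=31)
            for f in ('a.sig', 'b.sig')]  # hypothetical input files
    ani = compare_serial(sigs, ignore_abundance=True, return_ani=True)
    c_ani = compare_serial_containment(sigs, return_ani=True)
    print(ani)    # pairwise ANI point estimates via jaccard_ani()[0]
    print(c_ani)  # directional ANI point estimates via containment_ani()[0]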
@@ -84,7 +84,7 @@ def compare_serial_max_containment(siglist, downsample=False, return_ANI=False): containments = np.ones((n, n)) for i in range(n): for j in range(n): - if return_ANI: + if return_ani: containments[i][j] = siglist[j].max_containment_ani(siglist[i], downsample=downsample)[0] else: diff --git a/tests/test_compare.py b/tests/test_compare.py index 147852c808..9bac10a607 100644 --- a/tests/test_compare.py +++ b/tests/test_compare.py @@ -72,7 +72,7 @@ def test_compare_all_pairs(siglist, ignore_abundance): def test_compare_serial_jaccardANI(scaled_siglist, ignore_abundance): - similarities = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True) + similarities = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) true_similarities = np.array( [[1., 0.942, 0.988, 0.986, 0.], @@ -85,7 +85,7 @@ def test_compare_serial_jaccardANI(scaled_siglist, ignore_abundance): def test_compare_parallel_jaccardANI(scaled_siglist, ignore_abundance): - similarities = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True) + similarities = compare_parallel(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) true_containment = np.array( [[1., 0.942, 0.988, 0.986, 0.], @@ -98,13 +98,13 @@ def test_compare_parallel_jaccardANI(scaled_siglist, ignore_abundance): def test_compare_all_pairs_jaccardANI(scaled_siglist, ignore_abundance): - similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ANI=True) - similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ANI=True) + similarities_parallel = compare_all_pairs(scaled_siglist, ignore_abundance, downsample=False, n_jobs=2, return_ani=True) + similarities_serial = compare_serial(scaled_siglist, ignore_abundance, downsample=False, return_ani=True) np.testing.assert_array_equal(similarities_parallel, similarities_serial) def test_compare_serial_containmentANI(scaled_siglist, ignore_abundance): - containment = compare_serial_containment(scaled_siglist, ignore_abundance, return_ANI=True) + containment = compare_serial_containment(scaled_siglist, ignore_abundance, return_ani=True) true_containment = np.array( [[1., 1., 1., 1., 0.], @@ -116,7 +116,7 @@ def test_compare_serial_containmentANI(scaled_siglist, ignore_abundance): np.testing.assert_array_almost_equal(containment, true_containment, decimal=3) # check max_containment ANI - max_containment = compare_serial_max_containment(scaled_siglist, ignore_abundance, return_ANI=True) + max_containment = compare_serial_max_containment(scaled_siglist, ignore_abundance, return_ani=True) true_max_containment = np.array( [[1., 1., 1., 1., 0.], From 5da01cc1430746a52498146959af59a21ccb6f96 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 14 Jan 2022 13:18:01 -0800 Subject: [PATCH 14/38] cleanup unused fn --- src/sourmash/signature.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 26fee4ac65..2b777c0d79 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -169,13 +169,6 @@ def containment_ani(self, other, downsample=False, containment=None): return_identity=True) return c_ani, ani_low, ani_high - #def containment_distance(self, other, downsample=False): - # "Estimate distance (1-ANI) from containment with the other MinHash signature." 
- #    containment = self.minhash.contained_by(other.minhash, downsample) - #    n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate - #    c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, - #    self.minhash.ksize, self.minhash.scaled) - #    return c_ani, ani_low, ani_high def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample) From f452d9ebf34d94e0298d857ebbd08a30b54c96e9 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Fri, 14 Jan 2022 13:20:32 -0800 Subject: [PATCH 15/38] spacing --- src/sourmash/signature.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 2b777c0d79..8ebed3a264 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -169,7 +169,6 @@ def containment_ani(self, other, downsample=False, containment=None): return_identity=True) return c_ani, ani_low, ani_high - def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample) From 64b8b9679a7792bf443d1ea53da5467f99837617 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 17 Jan 2022 11:51:16 -0800 Subject: [PATCH 16/38] fix mc kmers --- src/sourmash/signature.py | 4 +++- tests/test_signature.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 8ebed3a264..76652fdb33 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -177,7 +177,9 @@ def max_containment_ani(self, other, downsample=False, max_containment=None): "Estimate ANI from max containment w/other signature. Note: ignores abundance." if not max_containment: max_containment = self.minhash.max_containment(other.minhash, downsample) - n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate + # max_containment will always use smaller sig as denominator + min_n_kmers = min(len(self.minhash.hashes), len(other.minhash.hashes)) + n_kmers = min_n_kmers * self.minhash.scaled c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, self.minhash.ksize, self.minhash.scaled, return_identity=True) diff --git a/tests/test_signature.py b/tests/test_signature.py index 4992ae43f9..67dcd431c3 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -431,28 +431,46 @@ def test_max_containment_equal(): def test_containment_ANI(): f1 = utils.get_test_data('2.fa.sig') f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') ss1 = load_one_signature(f1, ksize=31) ss2 = load_one_signature(f2) + ss3 = load_one_signature(f3) print("\nss1 contained by ss2", ss1.containment_ani(ss2)) print("ss2 contained by ss1",ss2.containment_ani(ss1)) print("ss1 max containment", ss1.max_containment_ani(ss2)) print("ss2 max containment", ss2.max_containment_ani(ss1)) + print("\nss2 contained by ss3", ss2.containment_ani(ss3)) + print("ss3 contained by ss2",ss3.containment_ani(ss2)) + assert ss1.containment_ani(ss2) == (1.0, 1.0, 1.0) assert ss2.containment_ani(ss1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) assert ss1.max_containment_ani(ss2) == (1.0, 1.0, 1.0) assert ss2.max_containment_ani(ss1) == (1.0, 1.0, 1.0) + # containment 1 is special case.
check max containment for non 0/1 values: + assert ss2.containment_ani(ss3) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) + assert ss3.containment_ani(ss2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss2.max_containment_ani(ss3) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss3.max_containment_ani(ss2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss2.max_containment_ani(ss3)[0] == max(ss2.containment_ani(ss3)[0], ss3.containment_ani(ss2)[0]) + # precalc containments and assert same results s1c = ss1.contained_by(ss2) s2c = ss2.contained_by(ss1) + s3c = ss2.contained_by(ss3) + s4c = ss3.contained_by(ss2) mc = max(s1c, s2c) assert ss1.containment_ani(ss2, containment=s1c) == (1.0, 1.0, 1.0) assert ss2.containment_ani(ss1, containment=s2c) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) assert ss1.max_containment_ani(ss2, max_containment=mc) == (1.0, 1.0, 1.0) assert ss2.max_containment_ani(ss1, max_containment=mc) == (1.0, 1.0, 1.0) + assert ss2.containment_ani(ss3, containment=s3c) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) + assert ss3.containment_ani(ss2, containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss2.max_containment_ani(ss3, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss3.max_containment_ani(ss2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) def test_jaccard_ANI(): From 70eb0ee7557efc834d32d52a0679702f34609571 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 17 Jan 2022 14:32:32 -0800 Subject: [PATCH 17/38] add corresponding mh methods --- src/sourmash/minhash.py | 33 ++++++++++++++++++++++ tests/test_minhash.py | 62 +++++++++++++++++++++++++++++++++++++++++ tests/test_signature.py | 4 +-- 3 files changed, 97 insertions(+), 2 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index bc90f16a0a..3f2342ff78 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -6,6 +6,7 @@ class MinHash - core MinHash class. class FrozenMinHash - read-only MinHash class. """ from __future__ import unicode_literals, division +from .distance_utils import jaccard_to_distance, containment_to_distance __all__ = ['get_minhash_default_seed', 'get_minhash_max_hash', @@ -646,6 +647,17 @@ def jaccard(self, other, downsample=False): raise TypeError(err) return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample) + def jaccard_ani(self, other, downsample=False, jaccard=None): + "Calculate Jaccard --> ANI of two MinHash objects." + if not jaccard: + jaccard = self.jaccard(other, downsample=downsample) + avg_scaled_kmers = (len(self) + len(other))/2 + avg_n_kmers = avg_scaled_kmers * self.scaled # would be better if hll estimate + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, + self.ksize, self.scaled, + return_identity=True) + return j_ani, ani_low, ani_high + def similarity(self, other, ignore_abundance=False, downsample=False): """Calculate similarity of two sketches. @@ -683,6 +695,16 @@ def contained_by(self, other, downsample=False): return self.count_common(other, downsample) / len(self) + def containment_ani(self, other, downsample=False, containment=None): + "Estimate ANI from containment with the other MinHash." 
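+        # note: a precomputed containment of exactly 0.0 is falsy, so `not containment`
+        # recomputes it here; the result is unchanged, just redundant work
+        # (`containment is None` would be the stricter check)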
+ if not containment: + containment = self.contained_by(other, downsample) + n_kmers = len(self) * self.scaled # would be better if hll estimate + c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, + self.ksize, self.scaled, + return_identity=True) + return c_ani, ani_low, ani_high + def max_containment(self, other, downsample=False): """ Calculate maximum containment. @@ -695,6 +717,17 @@ def max_containment(self, other, downsample=False): return self.count_common(other, downsample) / min_denom + def max_containment_ani(self, other, downsample=False, max_containment=None): + "Estimate ANI from containment with the other MinHash." + if not max_containment: + max_containment = self.max_containment(other, downsample) + min_n_kmers = min(len(self), len(other)) + n_kmers = min_n_kmers * self.scaled + c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, + self.ksize, self.scaled, + return_identity=True) + return c_ani, ani_low, ani_high + def __add__(self, other): if not isinstance(other, MinHash): raise TypeError("can only add MinHash objects to MinHash objects!") diff --git a/tests/test_minhash.py b/tests/test_minhash.py index dc39c9065c..fa580659e2 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -2602,3 +2602,65 @@ def test_translate_dayhoff_hashes_2(): assert kmer == k assert _hash_fwd_only(mh_translate, kmer) == h + + +def test_containment_ANI(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + mh2 = sourmash.load_one_signature(f2, ksize=31).minhash + mh3 = sourmash.load_one_signature(f3, ksize=31).minhash + + print("\nmh1 contained by mh2", mh1.containment_ani(mh2)) + print("mh2 contained by mh1",mh2.containment_ani(mh1)) + print("mh1 max containment", mh1.max_containment_ani(mh2)) + print("mh2 max containment", mh2.max_containment_ani(mh1)) + + print("\nmh2 contained by mh3", mh2.containment_ani(mh3)) + print("mh3 contained by mh2",mh3.containment_ani(mh2)) + + assert mh1.containment_ani(mh2) == (1.0, 1.0, 1.0) + assert mh2.containment_ani(mh1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) + assert mh1.max_containment_ani(mh2) == (1.0, 1.0, 1.0) + assert mh2.max_containment_ani(mh1) == (1.0, 1.0, 1.0) + + # containment 1 is special case. 
check max containment for non 0/1 values: + assert mh2.containment_ani(mh3) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) + assert mh3.containment_ani(mh2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert mh2.max_containment_ani(mh3) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert mh3.max_containment_ani(mh2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert mh2.max_containment_ani(mh3)[0] == max(mh2.containment_ani(mh3)[0], mh3.containment_ani(mh2)[0]) + + # precalc containments and assert same results + s1c = mh1.contained_by(mh2) + s2c = mh2.contained_by(mh1) + s3c = mh2.contained_by(mh3) + s4c = mh3.contained_by(mh2) + mc = max(s1c, s2c) + assert mh1.containment_ani(mh2, containment=s1c) == (1.0, 1.0, 1.0) + assert mh2.containment_ani(mh1, containment=s2c) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) + assert mh1.max_containment_ani(mh2, max_containment=mc) == (1.0, 1.0, 1.0) + assert mh2.max_containment_ani(mh1, max_containment=mc) == (1.0, 1.0, 1.0) + + assert mh2.containment_ani(mh3, containment=s3c) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) + assert mh3.containment_ani(mh2, containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert mh2.max_containment_ani(mh3, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) +# assert mh3.max_containment_ani(mh2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + + +def test_jaccard_ANI(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + mh2 = sourmash.load_one_signature(f2).minhash + + print("\nJACCARD_ANI", mh1.jaccard_ani(mh2)) + + assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + + # precalc jaccard and assert same result + jaccard = mh1.jaccard(mh2) + print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) + + assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) diff --git a/tests/test_signature.py b/tests/test_signature.py index 67dcd431c3..f20febbdaf 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -433,8 +433,8 @@ def test_containment_ANI(): f2 = utils.get_test_data('2+63.fa.sig') f3 = utils.get_test_data('47+63.fa.sig') ss1 = load_one_signature(f1, ksize=31) - ss2 = load_one_signature(f2) - ss3 = load_one_signature(f3) + ss2 = load_one_signature(f2, ksize=31) + ss3 = load_one_signature(f3, ksize=31) print("\nss1 contained by ss2", ss1.containment_ani(ss2)) print("ss2 contained by ss1",ss2.containment_ani(ss1)) From 8ec9e10fa2dd39d35e1f1bb353b4cec80f0b0ec4 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 17 Jan 2022 15:25:33 -0800 Subject: [PATCH 18/38] round kmers for jaccard --- src/sourmash/minhash.py | 2 +- src/sourmash/signature.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 3f2342ff78..8b911d7d87 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -651,7 +651,7 @@ def jaccard_ani(self, other, downsample=False, jaccard=None): "Calculate Jaccard --> ANI of two MinHash objects." 
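+        # round() below keeps the averaged sketch size a whole number before it
+        # is multiplied by `scaled` into an approximate unique k-mer count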
if not jaccard: jaccard = self.jaccard(other, downsample=downsample) - avg_scaled_kmers = (len(self) + len(other))/2 + avg_scaled_kmers = round((len(self) + len(other))/2) avg_n_kmers = avg_scaled_kmers * self.scaled # would be better if hll estimate j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, self.ksize, self.scaled, diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 76652fdb33..536d7f1537 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -148,7 +148,7 @@ def jaccard_ani(self, other, downsample=False, jaccard=None): if not jaccard: jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=downsample) - avg_scaled_kmers = (len(self.minhash.hashes) + len(other.minhash.hashes))/2 + avg_scaled_kmers = round((len(self.minhash.hashes) + len(other.minhash.hashes))/2) avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, self.minhash.ksize, self.minhash.scaled, From 357aa581b2d35f26e8e81639feaa8f5d39d93614 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 18 Jan 2022 09:36:08 -0800 Subject: [PATCH 19/38] allow confidence tuning --- src/sourmash/minhash.py | 12 ++++++------ src/sourmash/signature.py | 24 ++++++++++++------------ tests/test_minhash.py | 13 +++++++++++++ tests/test_signature.py | 15 ++++++++++++++- 4 files changed, 45 insertions(+), 19 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 8b911d7d87..5560ce2bca 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -647,14 +647,14 @@ def jaccard(self, other, downsample=False): raise TypeError(err) return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample) - def jaccard_ani(self, other, downsample=False, jaccard=None): + def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): "Calculate Jaccard --> ANI of two MinHash objects." if not jaccard: jaccard = self.jaccard(other, downsample=downsample) avg_scaled_kmers = round((len(self) + len(other))/2) avg_n_kmers = avg_scaled_kmers * self.scaled # would be better if hll estimate j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, - self.ksize, self.scaled, + self.ksize, self.scaled, confidence=confidence, return_identity=True) return j_ani, ani_low, ani_high @@ -695,13 +695,13 @@ def contained_by(self, other, downsample=False): return self.count_common(other, downsample) / len(self) - def containment_ani(self, other, downsample=False, containment=None): + def containment_ani(self, other, downsample=False, containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash." if not containment: containment = self.contained_by(other, downsample) n_kmers = len(self) * self.scaled # would be better if hll estimate c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, - self.ksize, self.scaled, + self.ksize, self.scaled, confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high @@ -717,14 +717,14 @@ def max_containment(self, other, downsample=False): return self.count_common(other, downsample) / min_denom - def max_containment_ani(self, other, downsample=False, max_containment=None): + def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash." 
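+        # `confidence` sets the width of the returned ANI confidence interval
+        # (default 0.95); it is passed straight through to jaccard_to_distance below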
if not max_containment: max_containment = self.max_containment(other, downsample) min_n_kmers = min(len(self), len(other)) n_kmers = min_n_kmers * self.scaled c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, - self.ksize, self.scaled, + self.ksize, self.scaled, confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 536d7f1537..95c5e8b9de 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -143,46 +143,46 @@ def jaccard(self, other): return self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=False) - def jaccard_ani(self, other, downsample=False, jaccard=None): + def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): "Compute Jaccard similarity with the other MinHash signature." if not jaccard: jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, downsample=downsample) avg_scaled_kmers = round((len(self.minhash.hashes) + len(other.minhash.hashes))/2) avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, - self.minhash.ksize, self.minhash.scaled, - return_identity=True) + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, self.minhash.ksize, + self.minhash.scaled, confidence=confidence, + return_identity=True) return j_ani, ani_low, ani_high def contained_by(self, other, downsample=False): "Compute containment by the other signature. Note: ignores abundance." return self.minhash.contained_by(other.minhash, downsample) - def containment_ani(self, other, downsample=False, containment=None): + def containment_ani(self, other, downsample=False, containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash signature." if not containment: containment = self.minhash.contained_by(other.minhash, downsample) n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, - self.minhash.ksize, self.minhash.scaled, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, self.minhash.ksize, + self.minhash.scaled, confidence=confidence, + return_identity=True) return c_ani, ani_low, ani_high def max_containment(self, other, downsample=False): "Compute max containment w/other signature. Note: ignores abundance." return self.minhash.max_containment(other.minhash, downsample) - def max_containment_ani(self, other, downsample=False, max_containment=None): + def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95): "Estimate ANI from max containment w/other signature. Note: ignores abundance." 
if not max_containment: max_containment = self.minhash.max_containment(other.minhash, downsample) # max_containment will always use smaller sig as denominator min_n_kmers = min(len(self.minhash.hashes), len(other.minhash.hashes)) n_kmers = min_n_kmers * self.minhash.scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, - self.minhash.ksize, self.minhash.scaled, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, self.minhash.ksize, + self.minhash.scaled, confidence=confidence, + return_identity=True) return c_ani, ani_low, ani_high def add_sequence(self, sequence, force=False): diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 42fc6f74ca..f00d544150 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -2635,6 +2635,9 @@ def test_containment_ANI(): print("\nmh2 contained by mh3", mh2.containment_ani(mh3)) print("mh3 contained by mh2",mh3.containment_ani(mh2)) + print("\nmh2 contained by mh3, CI 90%", mh2.containment_ani(mh3, confidence=0.9)) + print("mh3 contained by mh2, CI 99%",mh3.containment_ani(mh2, confidence=0.99)) + assert mh1.containment_ani(mh2) == (1.0, 1.0, 1.0) assert mh2.containment_ani(mh1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) assert mh1.max_containment_ani(mh2) == (1.0, 1.0, 1.0) @@ -2647,6 +2650,10 @@ def test_containment_ANI(): assert mh3.max_containment_ani(mh2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) assert mh2.max_containment_ani(mh3)[0] == max(mh2.containment_ani(mh3)[0], mh3.containment_ani(mh2)[0]) + # check confidence + assert mh2.containment_ani(mh3, confidence=0.9) == (0.9866751346467802, 0.986241913154108, 0.9870974232542815) + assert mh3.containment_ani(mh2, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) + # precalc containments and assert same results s1c = mh1.contained_by(mh2) s2c = mh2.contained_by(mh1) @@ -2660,6 +2667,7 @@ def test_containment_ANI(): assert mh2.containment_ani(mh3, containment=s3c) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) assert mh3.containment_ani(mh2, containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert mh3.containment_ani(mh2, containment=s4c, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) assert mh2.max_containment_ani(mh3, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) assert mh3.max_containment_ani(mh2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) @@ -2671,11 +2679,16 @@ def test_jaccard_ANI(): mh2 = sourmash.load_one_signature(f2).minhash print("\nJACCARD_ANI", mh1.jaccard_ani(mh2)) + print("\nJACCARD_ANI 90% CI", mh1.jaccard_ani(mh2, confidence=0.9)) + print("\nJACCARD_ANI 99% CI", mh1.jaccard_ani(mh2, confidence=0.99)) assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + assert mh1.jaccard_ani(mh2, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) + assert mh1.jaccard_ani(mh2, confidence=0.99) == (0.9783711630110239, 0.9774056164150092, 0.9793173653983135) # precalc jaccard and assert same result jaccard = mh1.jaccard(mh2) print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + assert mh1.jaccard_ani(mh2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) diff --git 
a/tests/test_signature.py b/tests/test_signature.py index f20febbdaf..66245c536c 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -444,6 +444,9 @@ def test_containment_ANI(): print("\nss2 contained by ss3", ss2.containment_ani(ss3)) print("ss3 contained by ss2",ss3.containment_ani(ss2)) + print("\nss2 contained by ss3, CI 90%", ss2.containment_ani(ss3, confidence=0.9)) + print("ss3 contained by ss2, CI 99%",ss3.containment_ani(ss2, confidence=0.99)) + assert ss1.containment_ani(ss2) == (1.0, 1.0, 1.0) assert ss2.containment_ani(ss1) == (0.9658183324254062, 0.9648452889933389, 0.966777042966207) assert ss1.max_containment_ani(ss2) == (1.0, 1.0, 1.0) @@ -456,6 +459,10 @@ def test_containment_ANI(): assert ss3.max_containment_ani(ss2) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) assert ss2.max_containment_ani(ss3)[0] == max(ss2.containment_ani(ss3)[0], ss3.containment_ani(ss2)[0]) + # check confidence + assert ss2.containment_ani(ss3, confidence=0.9) == (0.9866751346467802, 0.986241913154108, 0.9870974232542815) + assert ss3.containment_ani(ss2, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) + # precalc containments and assert same results s1c = ss1.contained_by(ss2) s2c = ss2.contained_by(ss1) @@ -469,6 +476,7 @@ def test_containment_ANI(): assert ss2.containment_ani(ss3, containment=s3c) == (0.9866751346467802, 0.9861576758035308, 0.9871770716451368) assert ss3.containment_ani(ss2, containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) + assert ss3.containment_ani(ss2, containment=s4c, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) assert ss2.max_containment_ani(ss3, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) assert ss3.max_containment_ani(ss2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) @@ -480,11 +488,16 @@ def test_jaccard_ANI(): ss2 = load_one_signature(f2) print("\nJACCARD_ANI", ss1.jaccard_ani(ss2)) + print("\nJACCARD_ANI 90% CI", ss1.jaccard_ani(ss2, confidence=0.9)) + print("\nJACCARD_ANI 99% CI", ss1.jaccard_ani(ss2, confidence=0.99)) assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + assert ss1.jaccard_ani(ss2, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) + assert ss1.jaccard_ani(ss2, confidence=0.99) == (0.9783711630110239, 0.9774056164150092, 0.9793173653983135) # precalc jaccard and assert same result jaccard = ss1.jaccard(ss2) print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard)) - assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + assert ss1.jaccard_ani(ss2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) + assert ss1.jaccard_ani(ss2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) From af036b856e0c898c4ce3ccb384bf0e0479a24828 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Tue, 18 Jan 2022 21:53:23 -0800 Subject: [PATCH 20/38] enable bp --> ANI --- src/sourmash/distance_utils.py | 13 +++++- src/sourmash/minhash.py | 18 +++---- src/sourmash/signature.py | 18 +++---- tests/test_distance_utils.py | 85 ++++++++++++++++++++++++---------- 4 files changed, 90 insertions(+), 44 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 7193bf934c..ebc89eac85 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -62,10 +62,17 @@ def probit(p): return scipy_norm.ppf(p) -def containment_to_distance(containment, n_unique_kmers, ksize, scaled, confidence=0.95, return_identity=False): +def sequence_len_to_n_kmers(sequence_len_bp, ksize): + n_kmers = sequence_len_bp - (ksize-1) + return n_kmers + + +def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False): """ Containment --> distance CI (one step) """ + if sequence_len_bp and not n_unique_kmers: + n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) if containment <= 0.0001: # changed from 0.0, to mirror jaccard_to_distance_CI_one_step sol2 = sol1 = point_estimate = 1.0 elif containment >= 0.9999: # changed from 1.0, to mirror jaccard_to_distance_CI_one_step @@ -112,10 +119,12 @@ def variance_scaled_jaccard(L, p, k, s): return term1 + term2 - term3 -def jaccard_to_distance(jaccard, n_unique_kmers, ksize, scaled, confidence=0.95, return_identity=False): +def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False): """ Scaled Jaccard to distance estimate and CI (one step) """ + if sequence_len_bp and not n_unique_kmers: + n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) if jaccard <= 0.0001: sol2 = sol1 = point_estimate = 1.0 elif jaccard >= 0.9999: diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 5560ce2bca..5b186c5663 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -653,9 +653,9 @@ def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): jaccard = self.jaccard(other, downsample=downsample) avg_scaled_kmers = round((len(self) + len(other))/2) avg_n_kmers = avg_scaled_kmers * self.scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, - self.ksize, self.scaled, confidence=confidence, - return_identity=True) + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self.ksize, self.scaled, + n_unique_kmers=avg_n_kmers, confidence=confidence, + return_identity=True) return j_ani, ani_low, ani_high def similarity(self, other, ignore_abundance=False, downsample=False): @@ -700,9 +700,9 @@ def containment_ani(self, other, downsample=False, containment=None, confidence= if not containment: containment = self.contained_by(other, downsample) n_kmers = len(self) * self.scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, - self.ksize, self.scaled, confidence=confidence, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(containment, self.ksize, self.scaled, + n_unique_kmers=n_kmers, confidence=confidence, + return_identity=True) return c_ani, ani_low, ani_high def max_containment(self, other, downsample=False): @@ -723,9 +723,9 @@ def max_containment_ani(self, other, downsample=False, max_containment=None, con max_containment = self.max_containment(other, downsample) 
min_n_kmers = min(len(self), len(other)) n_kmers = min_n_kmers * self.scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, - self.ksize, self.scaled, confidence=confidence, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(max_containment, self.ksize, self.scaled, + n_unique_kmers=n_kmers,confidence=confidence, + return_identity=True) return c_ani, ani_low, ani_high def __add__(self, other): diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index 95c5e8b9de..c3d5d9b214 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -150,9 +150,9 @@ def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): downsample=downsample) avg_scaled_kmers = round((len(self.minhash.hashes) + len(other.minhash.hashes))/2) avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, avg_n_kmers, self.minhash.ksize, - self.minhash.scaled, confidence=confidence, - return_identity=True) + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self.minhash.ksize, + self.minhash.scaled, n_unique_kmers=avg_n_kmers, + confidence=confidence, return_identity=True) return j_ani, ani_low, ani_high def contained_by(self, other, downsample=False): @@ -164,9 +164,9 @@ def containment_ani(self, other, downsample=False, containment=None, confidence= if not containment: containment = self.minhash.contained_by(other.minhash, downsample) n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, n_kmers, self.minhash.ksize, - self.minhash.scaled, confidence=confidence, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(containment, self.minhash.ksize, + self.minhash.scaled, n_unique_kmers=n_kmers, + confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high def max_containment(self, other, downsample=False): @@ -180,9 +180,9 @@ def max_containment_ani(self, other, downsample=False, max_containment=None, con # max_containment will always use smaller sig as denominator min_n_kmers = min(len(self.minhash.hashes), len(other.minhash.hashes)) n_kmers = min_n_kmers * self.minhash.scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, n_kmers, self.minhash.ksize, - self.minhash.scaled, confidence=confidence, - return_identity=True) + c_ani,ani_low,ani_high = containment_to_distance(max_containment, self.minhash.ksize, + self.minhash.scaled, n_unique_kmers=n_kmers, + confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high def add_sequence(self, sequence, force=False): diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index 10d2eac225..fc2f0515ff 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -3,7 +3,7 @@ """ import pytest -from sourmash.distance_utils import containment_to_distance, jaccard_to_distance,distance_to_identity +from sourmash.distance_utils import containment_to_distance, jaccard_to_distance, distance_to_identity, sequence_len_to_n_kmers def test_distance_to_identity(): id,id_low,id_high = distance_to_identity(0.5,0.4,0.6) @@ -26,7 +26,7 @@ def test_containment_to_distance_zero(): scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(contain,ksize,scaled, n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", 
low, " - ", high) # check results @@ -36,7 +36,7 @@ def test_containment_to_distance_zero(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0 - ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) assert ident == exp_id assert low == exp_idlow assert high == exp_idhigh @@ -47,7 +47,7 @@ def test_containment_to_distance_one(): scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) # check results @@ -57,7 +57,7 @@ def test_containment_to_distance_one(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0 - ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) assert ident == exp_id assert low == exp_idlow assert high == exp_idhigh @@ -68,7 +68,7 @@ def test_containment_to_distance_scaled1(): scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -79,7 +79,7 @@ def test_containment_to_distance_scaled1(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 0.9675317785238916,0.9635213980271021,0.9712900870335944 - ident,low,high = containment_to_distance(contain,nkmers,ksize,scaled,return_identity=True) + ident,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) print(f"{ident},{low},{high}") assert ident == exp_id assert low == exp_idlow @@ -91,7 +91,7 @@ def test_containment_to_distance_2(): scaled = 100 nkmers = 10000 ksize=31 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -107,7 +107,7 @@ def test_containment_to_distance_scaled100(): scaled = 100 nkmers = 10000 ksize=21 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -123,7 +123,7 @@ def test_containment_to_distance_k10(): scaled = 100 nkmers = 10000 ksize=10 - dist,low,high = containment_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = containment_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -139,7 +139,7 @@ def test_containment_to_distance_confidence(): nkmers = 10000 ksize=31 confidence=0.99 - dist,low,high = containment_to_distance(contain,nkmers,ksize,scaled,confidence) + dist,low,high = containment_to_distance(contain,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -150,7 +150,7 @@ def test_containment_to_distance_confidence(): assert high == exp_high confidence=0.90 - dist,low,high = 
containment_to_distance(contain,nkmers,ksize,scaled,confidence) + dist,low,high = containment_to_distance(contain,ksize,scaled,n_unique_kmers=nkmers,confidence=confidence) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -161,12 +161,30 @@ def test_containment_to_distance_confidence(): assert high == exp_high +def test_nkmers_to_bp_containment(): + containment = 0.1 + scaled = 100 + bp_len = 10030 + ksize=31 + nkmers = sequence_len_to_n_kmers(bp_len,ksize) + print("nkmers_from_bp:", bp_len) + confidence=0.99 + kmer_dist = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) + bp_dist = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len) + print(f"\nkDIST:", kmer_dist) + print(f"\nbpDIST:",bp_dist) + # check results + assert kmer_dist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341) + assert bp_dist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341) + assert kmer_dist==bp_dist + + def test_jaccard_to_distance_zero(): jaccard = 0 scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -177,7 +195,7 @@ def test_jaccard_to_distance_zero(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 0.0,0.0,0.0 - ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) print(f"{ident},{low},{high}") assert ident == exp_id assert low == exp_idlow @@ -189,7 +207,7 @@ def test_jaccard_to_distance_one(): scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -200,7 +218,7 @@ def test_jaccard_to_distance_one(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 1.0,1.0,1.0 - ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,return_identity=True,n_unique_kmers=nkmers) print(f"{ident},{low},{high}") assert ident == exp_id assert low == exp_idlow @@ -212,7 +230,7 @@ def test_jaccard_to_distance_scaled1(): scaled = 1 nkmers = 10000 ksize=21 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -223,7 +241,7 @@ def test_jaccard_to_distance_scaled1(): assert high == exp_high # return identity instead exp_id, exp_idlow,exp_idhigh = 0.9808773406095179,0.979142479906669,0.9824501832354076 - ident,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,return_identity=True) + ident,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) print(f"{ident},{low},{high}") assert ident == exp_id assert low == exp_idlow @@ -235,7 +253,7 @@ def test_jaccard_to_distance_scaled100(): scaled = 100 nkmers = 10000 ksize=21 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = 
jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -251,7 +269,7 @@ def test_jaccard_to_distance_k31(): scaled = 100 nkmers = 10000 ksize=31 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -267,7 +285,7 @@ def test_jaccard_to_distance_k31_2(): scaled = 100 nkmers = 10000 ksize=31 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -277,13 +295,14 @@ def test_jaccard_to_distance_k31_2(): assert low == exp_low assert high == exp_high + def test_jaccard_to_distance_confidence(): jaccard = 0.1 scaled = 100 nkmers = 10000 ksize=31 confidence=0.99 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,confidence) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -294,7 +313,7 @@ def test_jaccard_to_distance_confidence(): assert high == exp_high confidence=0.90 - dist,low,high = jaccard_to_distance(jaccard,nkmers,ksize,scaled,confidence) + dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) print(f"{dist},{low},{high}") @@ -302,4 +321,22 @@ def test_jaccard_to_distance_confidence(): exp_dist, exp_low,exp_high = 0.0535071608231702,0.042708361726101415,0.0646280650023921 assert dist == exp_dist assert low == exp_low - assert high == exp_high \ No newline at end of file + assert high == exp_high + + +def test_nkmers_to_bp(): + jaccard = 0.1 + scaled = 100 + bp_len = 10030 + ksize=31 + nkmers = sequence_len_to_n_kmers(bp_len,ksize) + print("nkmers_from_bp:", bp_len) + confidence=0.99 + kmer_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) + bp_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len) + print(f"\nkDIST:", kmer_dist) + print(f"\nbpDIST:",bp_dist) + # check results + assert kmer_dist == (0.0535071608231702, 0.03702518586582857, 0.07080999238232429) + assert bp_dist == (0.0535071608231702, 0.03702518586582857, 0.07080999238232429) + assert kmer_dist==bp_dist From 71a859b77e6f331eb8ab21975735eb3e5e1bc67c Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Wed, 19 Jan 2022 08:54:32 -0800 Subject: [PATCH 21/38] fix c to dist in search --- src/sourmash/distance_utils.py | 2 +- src/sourmash/search.py | 20 ++++++++++++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index ebc89eac85..3dbec8f26e 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -67,7 +67,7 @@ def sequence_len_to_n_kmers(sequence_len_bp, ksize): return n_kmers -def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False): +def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False): """ Containment --> distance CI (one step) """ diff --git a/src/sourmash/search.py b/src/sourmash/search.py index f349fc7dd2..3185066284 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -245,8 +245,11 @@ def search_databases_with_abund_query(query, databases, **kwargs): ### gather code ### -GatherResult = namedtuple('GatherResult', - 'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match, f_match_orig, unique_intersect_bp, gather_result_rank, remaining_bp, query_filename, query_name, query_md5, query_bp, match_containment_ani') +GatherResult = namedtuple('GatherResult', ['intersect_bp', 'f_orig_query', 'f_match', 'f_unique_to_query', + 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', + 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', + 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', + 'scaled', 'match_containment_ani'])#,'query_containment_ani']) def _find_best(counters, query, threshold_bp): @@ -408,8 +411,14 @@ def __next__(self): f_match_orig = found_mh.contained_by(orig_query_mh) # calculate ani using match containment by query - match_containment_ani = containment_to_distance(f_match_orig, len(found_mh) * scaled, - found_mh.ksize, scaled, return_identity=True) + match_containment_ani = containment_to_distance(f_match_orig, found_mh.ksize, scaled, + n_unique_kmers=(len(found_mh) * scaled), + return_identity=True)[0] + + # calculate ani using query containment by match -- useful for genome classification + #orig_query_containment_ani = containment_to_distance(f_orig_query, orig_query_mh.ksize, scaled, + # n_unique_kmers=(orig_query_len * scaled), + # return_identity=True)[0] # calculate scores weighted by abundances f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes )) @@ -459,7 +468,10 @@ def __next__(self): query_filename=self.orig_query_filename, query_name=self.orig_query_name, query_md5=self.orig_query_md5, + ksize = self.orig_query_mh.ksize, + scaled = scaled, match_containment_ani=match_containment_ani, + #query_containment_ani=orig_query_containment_ani, ) self.result_n += 1 self.query = new_query From d284a2402aa4b57921713a6e929bdc9d17d5f302 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Wed, 19 Jan 2022 10:42:57 -0800 Subject: [PATCH 22/38] optionally estimate query ani while summarizing gather results --- src/sourmash/cli/utils.py | 10 +++-- src/sourmash/tax/__main__.py | 27 +++++++++---- src/sourmash/tax/tax_utils.py | 72 +++++++++++++++++++++++------------ tests/test_tax_utils.py | 43 +++++++++++++-------- 4 files changed, 100 insertions(+), 52 deletions(-) diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 598536d311..f484bdfeeb 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -67,10 +67,14 @@ def range_limited_float_type(arg): return f -def add_tax_threshold_arg(parser, default=0.1): +def add_tax_threshold_arg(parser, containment_default=0.1, ani_default=None): parser.add_argument( - '--containment-threshold', default=default, type=range_limited_float_type, - help=f'minimum containment threshold for classification; default={default}' + '--containment-threshold', default=containment_default, type=range_limited_float_type, + help=f'minimum containment threshold for classification; default={containment_default}', + ) + parser.add_argument( + '--ani-threshold', '--aai-threshold', default=ani_default, type=range_limited_float_type, + help=f'minimum ANI threshold (nucleotide gather) or AAI threshold (protein gather) for classification; default={ani_default}', ) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 01fc6bbe1c..efc8feaa87 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -184,7 +184,7 @@ def genome(args): best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -194,12 +194,15 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction <= args.containment_threshold: + if args.ani_threshold and sg.query_ani_at_rank < args.ani_threshold: + status="below_threshold" + notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet query ANI/AAI threshold {args.ani_threshold}") + elif sg.fraction <= args.containment_threshold: # should this just be less than? 
status="below_threshold" notify(f"WARNING: classifying query {sg.query_name} at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") else: status="match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank, sg.query_ani_at_rank) classifications[args.rank].append(classif) matched_queries.add(sg.query_name) if "krona" in args.output_format: @@ -214,7 +217,7 @@ def genome(args): best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True, seen_perfect=seen_perfect) + best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -223,18 +226,26 @@ def genome(args): status = 'nomatch' if sg.query_name in matched_queries: continue - if sg.fraction >= args.containment_threshold: + if args.ani_threshold and sg.query_ani_at_rank >= args.ani_threshold: + status="match" + elif sg.fraction >= args.containment_threshold: status = "match" - classif = ClassificationResult(sg.query_name, status, sg.rank, sg.fraction, sg.lineage, sg.query_md5, sg.query_filename, sg.f_weighted_at_rank, sg.bp_match_at_rank) + if status == "match": + classif = ClassificationResult(query_name=sg.query_name, status=status, rank=sg.rank, + fraction=sg.fraction, lineage=sg.lineage, + query_md5=sg.query_md5, query_filename=sg.query_filename, + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank= sg.query_ani_at_rank) classifications[sg.rank].append(classif) matched_queries.add(sg.query_name) continue - if rank == "superkingdom" and status == "nomatch": + elif rank == "superkingdom" and status == "nomatch": status="below_threshold" classif = ClassificationResult(query_name=sg.query_name, status=status, rank="", fraction=0, lineage="", query_md5=sg.query_md5, query_filename=sg.query_filename, - f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank) + f_weighted_at_rank=sg.f_weighted_at_rank, bp_match_at_rank=sg.bp_match_at_rank, + query_ani_at_rank=sg.query_ani_at_rank) classifications[sg.rank].append(classif) if not any([classifications, krona_results]): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bc07476260..0488de86b7 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -5,6 +5,7 @@ import csv from collections import namedtuple, defaultdict from collections import abc +from sourmash.distance_utils import containment_to_distance __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs', @@ -19,8 +20,8 @@ from sourmash.sourmash_args import load_pathlist_from_file QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp") -SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank") -ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, 
bp_match_at_rank") +SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") +ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") # Essential Gather column names that must be in gather_csv to allow `tax` summarization EssentialGatherColnames = ('query_name', 'name', 'f_unique_weighted', 'f_unique_to_query', 'unique_intersect_bp', 'remaining_bp', 'query_md5', 'query_filename') @@ -182,7 +183,7 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], keep_full_identifiers=False, keep_identifier_versions=False, best_only=False, - seen_perfect=set()): + seen_perfect=set(), estimate_query_ani=False): """ Summarize gather results at specified taxonomic rank """ @@ -192,7 +193,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} - + ksize,scaled=None,None for row in gather_results: # get essential gather info query_name = row['query_name'] @@ -201,13 +202,23 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], unique_intersect_bp = int(row['unique_intersect_bp']) query_md5 = row['query_md5'] query_filename = row['query_filename'] - # get query_bp - if query_name not in query_info.keys(): - query_bp = unique_intersect_bp + int(row['remaining_bp']) + if query_name not in query_info.keys(): #THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same? + if "query_bp" in row.keys(): + query_bp = int(row["query_bp"]) + else: + query_bp = unique_intersect_bp + int(row['remaining_bp']) # store query info query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp) - match_ident = row['name'] + if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file? + if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR + ksize = row['ksize'] + scaled = row['scaled'] + else: + estimate_query_ani=False + notify("Error: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...") + + match_ident = row['name'] # 100% match? are we looking at something in the database? 
if f_unique_to_query >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank @@ -219,9 +230,9 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], # get lineage for match lineage = find_match_lineage(match_ident, tax_assign, - skip_idents=skip_idents, - keep_full_identifiers=keep_full_identifiers, - keep_identifier_versions=keep_identifier_versions) + skip_idents=skip_idents, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) # ident was in skip_idents if not lineage: continue @@ -234,46 +245,57 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_weighted[query_name][lineage] += f_uniq_weighted sum_uniq_bp[query_name][lineage] += unique_intersect_bp + # sort and store each as SummarizedGatherResult sum_uniq_to_query_sorted = [] for query_name, lineage_weights in sum_uniq_to_query.items(): qInfo = query_info[query_name] sumgather_items = list(lineage_weights.items()) sumgather_items.sort(key = lambda x: -x[1]) + query_ani = None if best_only: - lineage, fraction = sumgather_items[0] - if fraction > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif fraction == 0: + lineage, f_uniq_to_query_at_rank = sumgather_items[0] + if f_uniq_to_query_at_rank > 1: + raise ValueError(f"The tax summary of query '{query_name}' is {f_uniq_to_query_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") + elif f_uniq_to_query_at_rank == 0: continue f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, + n_unique_kmers= qInfo.query_bp, return_identity=True) + sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, qInfo.query_md5, + qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) else: total_f_weighted= 0.0 total_f_classified = 0.0 total_bp_classified = 0 - for lineage, fraction in sumgather_items: - if fraction > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif fraction == 0: + for lineage, f_uniq_to_query_at_rank in sumgather_items: + if f_uniq_to_query_at_rank > 1: + raise ValueError(f"The tax summary of query '{query_name}' is {f_uniq_to_query_at_rank}, which is > 100% of the query!! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") + elif f_uniq_to_query_at_rank == 0: continue - total_f_classified += fraction + total_f_classified += f_uniq_to_query_at_rank f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] total_f_weighted += f_weighted_at_rank bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage]) total_bp_classified += bp_intersect_at_rank - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + if estimate_query_ani: + query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, + n_unique_kmers= qInfo.query_bp, return_identity=True) + sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) # record unclassified lineage = () - fraction = 1.0 - total_f_classified - if fraction > 0: + f_uniq_to_query_at_rank = 1.0 - total_f_classified + if f_uniq_to_query_at_rank > 0: f_weighted_at_rank = 1.0 - total_f_weighted bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank) + sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, query_md5, + query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) return sum_uniq_to_query_sorted, seen_perfect diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index dce1d6d9c2..48861e15f6 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -597,12 +597,14 @@ def test_write_summary_csv(runtmp): sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} outs= runtmp.output("outsum.csv") with open(outs, 'w') as out_fp: @@ -610,9 +612,9 @@ def test_write_summary_csv(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_summary_results_from_file: \n", sr) - assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] - assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[2] + assert ['query_name', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'superkingdom', '1.0', 'a', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] + assert ['queryA', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100',''] == sr[2] def test_write_classification(runtmp): @@ -620,7 +622,8 @@ def test_write_classification(runtmp): classif = 
ClassificationResult('queryA', 'match', 'phylum', 1.0, (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')), - 'queryA_md5', 'queryA.sig', 1.0, 100) + 'queryA_md5', 'queryA.sig', 1.0, 100, + query_ani_at_rank=None) classification = {'phylum': [classif]} @@ -630,8 +633,8 @@ def test_write_classification(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_classification_results_from_file: \n", sr) - assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank'] == sr[0] - assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100'] == sr[1] + assert ['query_name', 'status', 'rank', 'fraction', 'lineage', 'query_md5', 'query_filename', 'f_weighted_at_rank', 'bp_match_at_rank', 'query_ani_at_rank'] == sr[0] + assert ['queryA', 'match', 'phylum', '1.0', 'a;b', 'queryA_md5', 'queryA.sig', '1.0', '100', ''] == sr[1] def test_make_krona_header_0(): @@ -816,21 +819,25 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=1.0, bp_match_at_rank=100, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -903,21 +910,25 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, query_md5='queryA_md5', query_filename='queryA.sig', f_weighted_at_rank=0.5, bp_match_at_rank=50, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')))]} + LineagePair(rank='phylum', name='b')), + query_ani_at_rank=None)]} sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, 
bp_match_at_rank=70, - lineage=(LineagePair(rank='superkingdom', name='a'),))], + lineage=(LineagePair(rank='superkingdom', name='a'),), + query_ani_at_rank=None)], 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, query_md5='queryB_md5', query_filename='queryB.sig', f_weighted_at_rank=0.7, bp_match_at_rank=70, lineage=(LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')))]} + LineagePair(rank='phylum', name='c')), + query_ani_at_rank=None)]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") From 15db1525145286539ed743611821c1fbe6d0690c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 19 Jan 2022 15:59:52 -0800 Subject: [PATCH 23/38] addl cols in gatherresult --- src/sourmash/search.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 3185066284..cebca9db6f 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -249,7 +249,7 @@ def search_databases_with_abund_query(query, databases, **kwargs): 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', - 'scaled', 'match_containment_ani'])#,'query_containment_ani']) + 'moltype', 'num', 'scaled', 'n_hashes', 'query_abundance', 'match_containment_ani'])#,'query_containment_ani']) def _find_best(counters, query, threshold_bp): @@ -469,7 +469,11 @@ def __next__(self): query_name=self.orig_query_name, query_md5=self.orig_query_md5, ksize = self.orig_query_mh.ksize, + moltype = self.orig_query_mh.moltype, + num = self.orig_query_mh.num, scaled = scaled, + n_hashes=len(self.orig_query_mh), + query_abundance=self.orig_query_mh.track_abundance, match_containment_ani=match_containment_ani, #query_containment_ani=orig_query_containment_ani, ) From ef4dee4d6571a12844a15100eef5252be3e97723 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Wed, 19 Jan 2022 16:35:02 -0800 Subject: [PATCH 24/38] addl prefetch cols; test ani in summarize_gather_at --- src/sourmash/search.py | 26 +++++++++++++++++++------- src/sourmash/tax/tax_utils.py | 6 +++--- tests/test_tax_utils.py | 24 +++++++++++++++++------- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index cebca9db6f..e1554a2c27 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -249,7 +249,8 @@ def search_databases_with_abund_query(query, databases, **kwargs): 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', - 'moltype', 'num', 'scaled', 'n_hashes', 'query_abundance', 'match_containment_ani'])#,'query_containment_ani']) + 'moltype', 'num', 'scaled', 'query_nhashes', 'query_abundance', 'match_containment_ani', + 'query_containment_ani']) def _find_best(counters, query, threshold_bp): @@ -416,9 +417,9 @@ def __next__(self): return_identity=True)[0] # calculate ani using query containment by match -- useful for genome classification - #orig_query_containment_ani = containment_to_distance(f_orig_query, orig_query_mh.ksize, scaled, - # n_unique_kmers=(orig_query_len * scaled), - # return_identity=True)[0] + orig_query_containment_ani = containment_to_distance(f_orig_query, orig_query_mh.ksize, scaled, + n_unique_kmers=(orig_query_len * scaled), + return_identity=True)[0] # calculate scores weighted by abundances f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes )) @@ -472,10 +473,10 @@ def __next__(self): moltype = self.orig_query_mh.moltype, num = self.orig_query_mh.num, scaled = scaled, - n_hashes=len(self.orig_query_mh), + query_nhashes=len(self.orig_query_mh), query_abundance=self.orig_query_mh.track_abundance, match_containment_ani=match_containment_ani, - #query_containment_ani=orig_query_containment_ani, + query_containment_ani=orig_query_containment_ani, ) self.result_n += 1 self.query = new_query @@ -489,7 +490,12 @@ def __next__(self): ### PrefetchResult = namedtuple('PrefetchResult', - 'intersect_bp, jaccard, max_containment, f_query_match, f_match_query, match, match_filename, match_name, match_md5, match_bp, query, query_filename, query_name, query_md5, query_bp, jaccard_ani, max_containment_ani, query_containment_ani, match_containment_ani') + ['intersect_bp', 'jaccard', 'max_containment', 'f_query_match', + 'f_match_query', 'match', 'match_filename', 'match_name', + 'match_md5', 'match_bp', 'query', 'query_filename', 'query_name', + 'query_md5', 'query_bp', 'ksize','moltype', 'num', 'scaled', + 'query_nhashes', 'query_abundance','jaccard_ani', 'max_containment_ani', + 'query_containment_ani', 'match_containment_ani']) def calculate_prefetch_info(query, match, scaled, threshold): @@ -532,6 +538,12 @@ def calculate_prefetch_info(query, match, scaled, threshold): query_filename=query.filename, query_name=query.name, query_md5=query.md5sum()[:8], + ksize = query_mh.ksize, + moltype = query_mh.moltype, + num = query_mh.num, + scaled = scaled, + query_nhashes=len(query_mh), + query_abundance=query_mh.track_abundance, jaccard_ani=jaccard_ani, max_containment_ani=max_containment_ani, query_containment_ani=query_containment_ani, diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0488de86b7..bbb6f9f3da 100644 --- 
a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -202,7 +202,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], unique_intersect_bp = int(row['unique_intersect_bp']) query_md5 = row['query_md5'] query_filename = row['query_filename'] - if query_name not in query_info.keys(): #THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same? + if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? if "query_bp" in row.keys(): query_bp = int(row["query_bp"]) else: @@ -263,7 +263,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] if estimate_query_ani: query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, - n_unique_kmers= qInfo.query_bp, return_identity=True) + n_unique_kmers= qInfo.query_bp, return_identity=True)[0] sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) @@ -283,7 +283,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], total_bp_classified += bp_intersect_at_rank if estimate_query_ani: query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, - n_unique_kmers= qInfo.query_bp, return_identity=True) + n_unique_kmers= qInfo.query_bp, return_identity=True)[0] sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 48861e15f6..73609dbe44 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -22,9 +22,11 @@ from sourmash.lca.lca_utils import LineagePair # utility functions for testing -def make_mini_gather_results(g_infolist): +def make_mini_gather_results(g_infolist, include_ksize_and_scaled=False): # make mini gather_results min_header = ["query_name", "name", "match_ident", "f_unique_to_query", "query_md5", "query_filename", "f_unique_weighted", "unique_intersect_bp", "remaining_bp"] + if include_ksize_and_scaled: + min_header.extend(['ksize', 'scaled']) gather_results = [] for g_info in g_infolist: inf = dict(zip(min_header, g_info)) @@ -532,32 +534,38 @@ def test_summarize_gather_at_missing_fail(): def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize =31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.5', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB],include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
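
The expected query_ani_at_rank values asserted in these tests are easy to sanity-check by hand: the point estimate returned by containment_to_distance(..., return_identity=True) reduces to containment**(1/ksize), so it depends only on the summarized fraction and k (the CI bounds, which the tests do not assert, are what vary with n_unique_kmers and scaled). Reproducing the asserted constants independently:

    # k=31 throughout; the fractions are the summarized f_unique_to_query values
    for frac, expected in [(0.7, 0.9885602934376099),
                           (0.6, 0.9836567776983505),
                           (0.1, 0.9284145445194744)]:
        assert abs(frac ** (1 / 31) - expected) < 1e-12
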
- sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True,estimate_query_ani=True) # superkingdom assert len(sk_sum) == 1 print("superkingdom summarized gather: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superk ANI:",sk_sum[0].query_ani_at_rank) + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True,estimate_query_ani=True) print("phylum summarized gather: ", phy_sum[0]) assert len(phy_sum) == 1 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 assert phy_sum[0].bp_match_at_rank == 70 + print("phy ANI:",phy_sum[0].query_ani_at_rank) + assert phy_sum[0].query_ani_at_rank == 0.9885602934376099 # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True, estimate_query_ani=True) assert len(cl_sum) == 1 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -565,6 +573,8 @@ def test_summarize_gather_at_best_only_0(): LineagePair(rank='class', name='c')) assert cl_sum[0].fraction == 0.6 assert cl_sum[0].bp_match_at_rank == 60 + print("cl ANI:",cl_sum[0].query_ani_at_rank) + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 def test_summarize_gather_at_best_only_equal_choose_first(): From 581b4041c1a83585847fe1532131db597530d7ba Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Wed, 19 Jan 2022 17:37:57 -0800 Subject: [PATCH 25/38] add tests for ani_threshold --- src/sourmash/tax/tax_utils.py | 54 ++++++++++--------- tests/test-data/tax/test1.gather_ani.csv | 5 ++ tests/test_tax.py | 66 ++++++++++++++++++++++++ tests/test_tax_utils.py | 28 +++++++--- 4 files changed, 121 insertions(+), 32 deletions(-) create mode 100644 tests/test-data/tax/test1.gather_ani.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bbb6f9f3da..0d328c8ef9 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -19,7 +19,7 @@ from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file -QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp") +QueryInfo = namedtuple("QueryInfo", "query_md5, query_filename, query_bp, query_hashes") SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") ClassificationResult = namedtuple("ClassificationResult", "query_name, status, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_match_at_rank, query_ani_at_rank") @@ -193,7 +193,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sum_uniq_to_query = defaultdict(lambda: defaultdict(float)) sum_uniq_bp = defaultdict(lambda: defaultdict(float)) query_info = {} - ksize,scaled=None,None + ksize,scaled,query_nhashes=None,None,None for row in gather_results: # get essential gather info query_name = row['query_name'] @@ -203,20 +203,22 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], query_md5 = row['query_md5'] query_filename = row['query_filename'] if query_name not in query_info.keys(): #REMOVING THIS AFFECTS GATHER RESULTS!!! BUT query bp should always be same for same query? bug? + if "query_nhashes" in row.keys(): + query_nhashes = int(row["query_nhashes"]) if "query_bp" in row.keys(): query_bp = int(row["query_bp"]) else: query_bp = unique_intersect_bp + int(row['remaining_bp']) # store query info - query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp) + query_info[query_name] = QueryInfo(query_md5=query_md5, query_filename=query_filename, query_bp=query_bp, query_hashes = query_nhashes) if estimate_query_ani and (not ksize or not scaled): # just need to set these once. BUT, if we have these, should we check for compatibility when loading the gather file? if "ksize" in row.keys(): # ksize and scaled were added to gather results in same PR - ksize = row['ksize'] - scaled = row['scaled'] + ksize = int(row['ksize']) + scaled = int(row['scaled']) else: estimate_query_ani=False - notify("Error: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...") + notify("WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI...") match_ident = row['name'] @@ -254,47 +256,51 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], sumgather_items.sort(key = lambda x: -x[1]) query_ani = None if best_only: - lineage, f_uniq_to_query_at_rank = sumgather_items[0] - if f_uniq_to_query_at_rank > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {f_uniq_to_query_at_rank}, which is > 100% of the query!! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") - elif f_uniq_to_query_at_rank == 0: + lineage, fraction = sumgather_items[0] + if fraction > 1: + raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") + elif fraction == 0: continue f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] bp_intersect_at_rank = sum_uniq_bp[query_name][lineage] if estimate_query_ani: - query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, - n_unique_kmers= qInfo.query_bp, return_identity=True)[0] - sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, qInfo.query_md5, + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers= qInfo.query_hashes, sequence_len_bp= qInfo.query_bp, + return_identity=True)[0] + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, qInfo.query_md5, qInfo.query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) else: total_f_weighted= 0.0 total_f_classified = 0.0 total_bp_classified = 0 - for lineage, f_uniq_to_query_at_rank in sumgather_items: - if f_uniq_to_query_at_rank > 1: - raise ValueError(f"The tax summary of query '{query_name}' is {f_uniq_to_query_at_rank}, which is > 100% of the query!! This should not be possible. Please check that your input files come directly from a single gather run per query.") - elif f_uniq_to_query_at_rank == 0: + for lineage, fraction in sumgather_items: + query_ani=None + if fraction > 1: + raise ValueError(f"The tax summary of query '{query_name}' is {fraction}, which is > 100% of the query!! This should not be possible. 
Please check that your input files come directly from a single gather run per query.") + elif fraction == 0: continue - total_f_classified += f_uniq_to_query_at_rank + total_f_classified += fraction f_weighted_at_rank = sum_uniq_weighted[query_name][lineage] total_f_weighted += f_weighted_at_rank bp_intersect_at_rank = int(sum_uniq_bp[query_name][lineage]) total_bp_classified += bp_intersect_at_rank if estimate_query_ani: - query_ani = containment_to_distance(f_uniq_to_query_at_rank, ksize, scaled, - n_unique_kmers= qInfo.query_bp, return_identity=True)[0] - sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, query_md5, + query_ani = containment_to_distance(fraction, ksize, scaled, + n_unique_kmers=qInfo.query_hashes, sequence_len_bp=qInfo.query_bp, + return_identity=True)[0] + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) # record unclassified lineage = () - f_uniq_to_query_at_rank = 1.0 - total_f_classified - if f_uniq_to_query_at_rank > 0: + query_ani=None + fraction = 1.0 - total_f_classified + if fraction > 0: f_weighted_at_rank = 1.0 - total_f_weighted bp_intersect_at_rank = qInfo.query_bp - total_bp_classified - sres = SummarizedGatherResult(query_name, rank, f_uniq_to_query_at_rank, lineage, query_md5, + sres = SummarizedGatherResult(query_name, rank, fraction, lineage, query_md5, query_filename, f_weighted_at_rank, bp_intersect_at_rank, query_ani) sum_uniq_to_query_sorted.append(sres) diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv new file mode 100644 index 0000000000..14929116af --- /dev/null +++ b/tests/test-data/tax/test1.gather_ani.csv @@ -0,0 +1,5 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000 diff --git a/tests/test_tax.py b/tests/test_tax.py index 
57b851aa21..de44bc48e9 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1363,6 +1363,72 @@ def test_genome_over100percent_error(runtmp): assert "ERROR: The tax summary of query 'test1' is 1.1, which is > 100% of the query!!" in runtmp.last_result.err +def test_genome_ani_threshold_input_errors(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + below_threshold = "-1" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', tax, '--taxonomy-csv', tax, + '--ani-threshold', below_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + above_threshold = "1.1" + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', above_threshold) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Argument must be >0 and <1" in str(exc.value) + + not_a_float = "str" + + with pytest.raises(SourmashCommandFailed) as exc: + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', not_a_float) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "ERROR: Must be a floating point number" in str(exc.value) + + +def test_genome_ani_stdout(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather_ani.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.95") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: Please run gather with sourmash >= 4.3 to estimate query ANI at rank. Continuing without ANI..." not in c.last_result.err + assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out + assert 'test1,match,family,0.116,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae,md5,test1.sig,0.073,582000.0,0.9328896594471843' in c.last_result.out + + # more lax threshold + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "0.9") + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + + def test_annotate_0(runtmp): # test annotate c = runtmp diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 73609dbe44..e8aadaa23f 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1,6 +1,7 @@ """ Tests for functions in taxonomy submodule. 
""" +from types import NoneType import pytest from os.path import basename @@ -353,30 +354,36 @@ def test_summarize_gather_at_0(): def test_summarize_gather_at_1(): """test two matches, diff f_unique_to_query""" # make mini gather_results - gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40'] - gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90'] - g_res = make_mini_gather_results([gA,gB]) + ksize=31 + scaled=10 + gA = ["queryA", "gA","0.5","0.6", "queryA_md5", "queryA.sig", '0.5', '60', '40', ksize, scaled] + gB = ["queryA", "gB","0.3","0.1", "queryA_md5", "queryA.sig", '0.1', '10', '90', ksize, scaled] + g_res = make_mini_gather_results([gA,gB], include_ksize_and_scaled=True) # make mini taxonomy gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, estimate_query_ani=True) # superkingdom assert len(sk_sum) == 2 - print("superkingdom summarized gather: ", sk_sum[0]) + print("\nsuperkingdom summarized gather 0: ", sk_sum[0]) assert sk_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),) assert sk_sum[0].fraction == 0.7 assert sk_sum[0].bp_match_at_rank == 70 + print("superkingdom summarized gather 1: ", sk_sum[1]) assert sk_sum[1].lineage == () assert round(sk_sum[1].fraction, 1) == 0.3 assert sk_sum[1].bp_match_at_rank == 30 + assert sk_sum[0].query_ani_at_rank == 0.9885602934376099 + assert sk_sum[1].query_ani_at_rank == None # phylum - phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) - print("phylum summarized gather: ", phy_sum[0]) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, estimate_query_ani=False) + print("phylum summarized gather 0: ", phy_sum[0]) + print("phylum summarized gather 1: ", phy_sum[1]) assert len(phy_sum) == 2 assert phy_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'),LineagePair(rank='phylum', name='b')) assert phy_sum[0].fraction == 0.7 @@ -385,8 +392,10 @@ def test_summarize_gather_at_1(): assert phy_sum[1].lineage == () assert round(phy_sum[1].fraction, 1) == 0.3 assert phy_sum[1].bp_match_at_rank == 30 + assert phy_sum[0].query_ani_at_rank == None + assert phy_sum[1].query_ani_at_rank == None # class - cl_sum, _ = summarize_gather_at("class", taxD, g_res) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, estimate_query_ani=True) assert len(cl_sum) == 3 print("class summarized gather: ", cl_sum) assert cl_sum[0].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -395,6 +404,7 @@ def test_summarize_gather_at_1(): assert cl_sum[0].fraction == 0.6 assert cl_sum[0].f_weighted_at_rank == 0.5 assert cl_sum[0].bp_match_at_rank == 60 + assert cl_sum[0].query_ani_at_rank == 0.9836567776983505 assert cl_sum[1].rank == 'class' assert cl_sum[1].lineage == (LineagePair(rank='superkingdom', name='a'), @@ -403,8 +413,10 @@ def test_summarize_gather_at_1(): assert cl_sum[1].fraction == 0.1 assert cl_sum[1].f_weighted_at_rank == 0.1 assert cl_sum[1].bp_match_at_rank == 10 + assert cl_sum[1].query_ani_at_rank == 0.9284145445194744 assert cl_sum[2].lineage == () assert round(cl_sum[2].fraction, 1) == 0.3 + assert cl_sum[2].query_ani_at_rank == None def test_summarize_gather_at_perfect_match(): From 0adc5f222e731665fe6e4cac39df4aac8bdc4285 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Wed, 19 Jan 2022 17:45:03 -0800 Subject: [PATCH 26/38] fix typo --- tests/test_tax_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e8aadaa23f..17e457b2b3 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1,7 +1,7 @@ """ Tests for functions in taxonomy submodule. """ -from types import NoneType + import pytest from os.path import basename From 041edf1703cce14092370b2f083303aa285ca0a8 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 20 Jan 2022 10:09:00 -0800 Subject: [PATCH 27/38] addl tests --- src/sourmash/tax/__main__.py | 3 ++- tests/test-data/tax/test1.gather_ani.csv | 10 +++++----- tests/test_tax.py | 23 ++++++++++++++++++++++- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index efc8feaa87..1cee174eb1 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -185,6 +185,7 @@ def genome(args): keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True, seen_perfect=seen_perfect, estimate_query_ani=True) + except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -209,7 +210,7 @@ def genome(args): lin_list = display_lineage(sg.lineage).split(';') krona_results.append((sg.fraction, *lin_list)) else: - # classify to the match that passes the containment threshold. + # classify to the rank/match that passes the containment threshold. # To do - do we want to store anything for this match if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv diff --git a/tests/test-data/tax/test1.gather_ani.csv b/tests/test-data/tax/test1.gather_ani.csv index 14929116af..48a09eb199 100644 --- a/tests/test-data/tax/test1.gather_ani.csv +++ b/tests/test-data/tax/test1.gather_ani.csv @@ -1,5 +1,5 @@ -intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled -442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000 -390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000 -138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000 -338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, 
ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000 +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name,query_md5,query_filename,ksize,scaled,query_nhashes +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1,md5,test1.sig,31,1000,5013970 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1,md5,test1.sig,31,1000,4571970 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1,md5,test1.sig,31,1000,4181970 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1,md5,test1.sig,31,1000,4327970 diff --git a/tests/test_tax.py b/tests/test_tax.py index de44bc48e9..8737cf1c62 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -702,6 +702,18 @@ def test_genome_rank_stdout_0_db(runtmp): assert 'query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank' in c.last_result.out assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + # too stringent of containment threshold: + c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', + tax, '--rank', 'species', '--containment-threshold', '1.0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "WARNING: classifying query test1 at desired rank species does not meet containment threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0," in c.last_result.out + def test_genome_rank_csv_0(runtmp): # test basic genome - output csv @@ -1400,7 +1412,7 @@ def test_genome_ani_threshold_input_errors(runtmp): assert "ERROR: Must be a floating point number" in str(exc.value) -def test_genome_ani_stdout(runtmp): +def test_genome_ani_threshold(runtmp): c = runtmp g_csv = utils.get_test_data('tax/test1.gather_ani.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -1428,6 +1440,15 @@ def test_genome_ani_stdout(runtmp): assert 
c.last_result.status == 0 assert 'test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0' in c.last_result.out + # too stringent of threshold (using rank) + c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', tax, + '--ani-threshold', "1.0", '--rank', 'species') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "WARNING: classifying query test1 at desired rank species does not meet query ANI/AAI threshold 1.0" in c.last_result.err + assert "test1,below_threshold,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000.0,0.9247805047263588" in c.last_result.out + def test_annotate_0(runtmp): # test annotate From 75402f351d6249f5a73f3c9e3e31a943f193f177 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 20 Jan 2022 11:38:02 -0800 Subject: [PATCH 28/38] report CI in prefetch, gather bc I want them for testing --- src/sourmash/search.py | 47 +++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index e1554a2c27..17ec6ecf0c 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -249,8 +249,9 @@ def search_databases_with_abund_query(query, databases, **kwargs): 'f_unique_weighted','average_abund', 'median_abund', 'std_abund', 'filename', 'name', 'md5', 'match', 'f_match_orig', 'unique_intersect_bp', 'gather_result_rank', 'remaining_bp', 'query_filename', 'query_name', 'query_md5', 'query_bp', 'ksize', - 'moltype', 'num', 'scaled', 'query_nhashes', 'query_abundance', 'match_containment_ani', - 'query_containment_ani']) + 'moltype', 'num', 'scaled', 'query_nhashes', 'query_abundance', 'match_ani', + 'match_ani_ci_low', 'match_ani_ci_high', 'query_ani', 'query_ani_ci_low', + 'query_ani_ci_high']) def _find_best(counters, query, threshold_bp): @@ -412,14 +413,14 @@ def __next__(self): f_match_orig = found_mh.contained_by(orig_query_mh) # calculate ani using match containment by query - match_containment_ani = containment_to_distance(f_match_orig, found_mh.ksize, scaled, + match_ani, match_ani_low, match_ani_high = containment_to_distance(f_match_orig, found_mh.ksize, scaled, n_unique_kmers=(len(found_mh) * scaled), - return_identity=True)[0] + return_identity=True) # calculate ani using query containment by match -- useful for genome classification - orig_query_containment_ani = containment_to_distance(f_orig_query, orig_query_mh.ksize, scaled, + orig_query_ani, query_ani_low, query_ani_high = containment_to_distance(f_orig_query, orig_query_mh.ksize, scaled, n_unique_kmers=(orig_query_len * scaled), - return_identity=True)[0] + return_identity=True) # calculate scores weighted by abundances f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mh.hashes )) @@ -475,8 +476,12 @@ def __next__(self): scaled = scaled, query_nhashes=len(self.orig_query_mh), query_abundance=self.orig_query_mh.track_abundance, - match_containment_ani=match_containment_ani, - query_containment_ani=orig_query_containment_ani, + match_ani=match_ani, + match_ani_ci_low=match_ani_low, + match_ani_ci_high=match_ani_high, + query_ani=orig_query_ani, + query_ani_ci_low=query_ani_low, + query_ani_ci_high=query_ani_high, ) self.result_n += 1 self.query = new_query @@ -493,9 +498,11 @@ def __next__(self): ['intersect_bp', 'jaccard',
'max_containment', 'f_query_match', 'f_match_query', 'match', 'match_filename', 'match_name', 'match_md5', 'match_bp', 'query', 'query_filename', 'query_name', - 'query_md5', 'query_bp', 'ksize','moltype', 'num', 'scaled', - 'query_nhashes', 'query_abundance','jaccard_ani', 'max_containment_ani', - 'query_containment_ani', 'match_containment_ani']) + 'query_md5', 'query_bp', 'ksize', 'moltype', 'num', 'scaled', + 'query_nhashes', 'query_abundance','jaccard_ani', 'jaccard_ani_ci_low', + 'jaccard_ani_ci_high', 'max_containment_ani', 'query_ani', + 'query_ani_ci_low', 'query_ani_ci_high', 'match_ani', + 'match_ani_ci_low', "match_ani_ci_high"]) def calculate_prefetch_info(query, match, scaled, threshold): @@ -516,10 +523,10 @@ def calculate_prefetch_info(query, match, scaled, threshold): jaccard=db_mh.jaccard(query_mh) # passing in jaccard/containment avoids recalc (but it better be the right one :) - jaccard_ani = query.jaccard_ani(match, jaccard=jaccard)[0] - query_containment_ani = query.containment_ani(match, containment=f_match_query)[0] - match_containment_ani = match.containment_ani(query, containment=f_query_match)[0] - max_containment_ani = max(query_containment_ani, match_containment_ani) + jaccard_ani, jaccard_ani_low, jaccard_ani_high = query.jaccard_ani(match, jaccard=jaccard) + query_ani, query_ani_low, query_ani_high = query.containment_ani(match, containment=f_match_query) + match_ani, match_ani_low, match_ani_high = match.containment_ani(query, containment=f_query_match) + max_containment_ani = max(query_ani, match_ani) # build a result namedtuple result = PrefetchResult( @@ -545,9 +552,15 @@ def calculate_prefetch_info(query, match, scaled, threshold): query_nhashes=len(query_mh), query_abundance=query_mh.track_abundance, jaccard_ani=jaccard_ani, + jaccard_ani_ci_low=jaccard_ani_low, + jaccard_ani_ci_high=jaccard_ani_high, max_containment_ani=max_containment_ani, - query_containment_ani=query_containment_ani, - match_containment_ani=match_containment_ani, + query_ani=query_ani, + query_ani_ci_low=query_ani_low, + query_ani_ci_high=query_ani_high, + match_ani=match_ani, + match_ani_ci_low=match_ani_low, + match_ani_ci_high=match_ani_high, ) return result
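After this patch, the signature-level `jaccard_ani` and `containment_ani` helpers return a (point estimate, CI low, CI high) triple, which the prefetch/gather code now unpacks instead of indexing `[0]`, and the result tuples carry the CI bounds as separate fields. A minimal sketch of the new calling convention, assuming two scaled signatures built with matching ksize and scaled; the file paths here are hypothetical:

    import sourmash

    ss1 = sourmash.load_one_signature('query.sig')   # hypothetical paths
    ss2 = sourmash.load_one_signature('match.sig')

    # each ANI helper now yields (estimate, ci_low, ci_high),
    # at confidence=0.95 by default
    c_ani, c_low, c_high = ss1.containment_ani(ss2)
    j_ani, j_low, j_high = ss1.jaccard_ani(ss2)
    print(f"containment ANI {c_ani:.4f} (95% CI {c_low:.4f}-{c_high:.4f})")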
Tessa Pierce" Date: Tue, 25 Jan 2022 12:33:39 -0800 Subject: [PATCH 29/38] estimate ANI during search --- src/sourmash/commands.py | 5 +- src/sourmash/distance_utils.py | 57 +++++++++------- src/sourmash/index.py | 2 +- src/sourmash/search.py | 47 +++++++++++-- tests/test_distance_utils.py | 66 ++++++++---------- tests/test_minhash.py | 10 +-- tests/test_signature.py | 10 +-- tests/test_sourmash.py | 121 +++++++++++++++++++++++++++++++++ 8 files changed, 238 insertions(+), 80 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 87d75d1caf..5e022d420d 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -16,7 +16,7 @@ from .logging import notify, error, print_results, set_quiet from .sourmash_args import (FileOutput, FileOutputCSV, SaveSignaturesToLocation) -from .search import prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info +from .search import SearchResult, prefetch_database, PrefetchResult, GatherResult, calculate_prefetch_info from .index import LazyLinearIndex WATERMARK_SIZE = 10000 @@ -534,8 +534,7 @@ def search(args): notify("** reporting only one match because --best-only was set") if args.output: - fieldnames = ['similarity', 'name', 'filename', 'md5', - 'query_filename', 'query_name', 'query_md5'] + fieldnames = SearchResult._fields with FileOutputCSV(args.output) as fp: w = csv.DictWriter(fp, fieldnames=fieldnames) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 3dbec8f26e..ec95ff868e 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -78,28 +78,33 @@ def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, seq elif containment >= 0.9999: # changed from 1.0, to mirror jaccard_to_distance_CI_one_step sol1 = sol2 = point_estimate = 0.0 else: - alpha = 1 - confidence - z_alpha = probit(1-alpha/2) - f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 + try: + alpha = 1 - confidence + z_alpha = probit(1-alpha/2) + f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 - bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers + bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers - term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2) - term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) - term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2) + term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2) + term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest) + term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2) - var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) + var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest) - f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment - f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment + f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment + f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment - sol1 = brentq(f1, 0.0000001, 0.9999999) - sol2 = brentq(f2, 0.0000001, 0.9999999) + sol1 = brentq(f1, 0.0000001, 0.9999999) + sol2 = brentq(f2, 0.0000001, 0.9999999) - point_estimate = 1.0-containment**(1.0/ksize) + point_estimate = 1.0-containment**(1.0/ksize) + + 
except ValueError as exc: + notify("WARNING: Cannot estimate ANI. Are your minhashes big enough?") + notify(str(exc)) + return None, None, None if return_identity: point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) - #distance_using_CImidpoint = (sol2+sol1)/2.0 return point_estimate,sol2,sol1 @@ -130,19 +135,25 @@ def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_le elif jaccard >= 0.9999: sol1 = sol2 = point_estimate = 0.0 else: - alpha = 1 - confidence - z_alpha = probit(1-alpha/2) - f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 + try: + alpha = 1 - confidence + z_alpha = probit(1-alpha/2) + f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1 + + var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled) - var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled) + f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard + f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard - f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard - f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard + sol1 = brentq(f1, 0.0000001, 0.9999999) + sol2 = brentq(f2, 0.0000001, 0.9999999) + point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) - sol1 = brentq(f1, 0.0001, 0.9999) - sol2 = brentq(f2, 0.0001, 0.9999) + except ValueError as exc: + notify("WARNING: Cannot estimate ANI. Are your minhashes big enough?") + notify(str(exc)) + return None, None, None - point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize) if return_identity: point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1) diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 4700efacf3..3fad7d6326 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -201,7 +201,7 @@ def search_abund(self, query, *, threshold=None, **kwargs): raise TypeError("'search_abund' requires subject signatures with abundance information") score = query.similarity(subj) if score >= threshold: - matches.append(IndexSearchResult(score, subj, self.location)) + matches.append(IndexSearchResult(score, subj, loc)) # sort! matches.sort(key=lambda x: -x.score) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 17ec6ecf0c..150e3c3fbf 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -5,7 +5,7 @@ from enum import Enum import numpy as np -from sourmash.distance_utils import containment_to_distance +from sourmash.distance_utils import containment_to_distance, jaccard_to_distance from .signature import SourmashSignature @@ -162,7 +162,9 @@ def collect(self, score, match): # generic SearchResult tuple. 
SearchResult = namedtuple('SearchResult', - 'similarity, match, md5, filename, name, query, query_filename, query_name, query_md5') + ['similarity', 'match', 'md5', 'filename', 'name', + 'query', 'query_filename', 'query_name', 'query_md5', + 'ksize', 'scaled', 'estimated_ani', 'ani_ci_low', 'ani_ci_high']) def format_bp(bp): @@ -195,7 +197,31 @@ def search_databases_with_flat_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] + query_mh = query.minhash + ksize = query_mh.ksize + scaled = query_mh.scaled + search_scaled=0 + ani, ani_low, ani_high = None, None, None for (score, match, filename) in results: + match_mh = match.minhash + if scaled: # if scaled, we can get ANI estimates + search_scaled = min(query_mh.scaled, match_mh.scaled) + if search_scaled > query_mh.scaled: + query_mh = query_mh.downsample(scaled=search_scaled) + if search_scaled > match_mh.scaled: + match_mh = match_mh.downsample(scaled=search_scaled) + if kwargs.get('do_containment'): + ani, ani_low, ani_high = containment_to_distance(score, ksize, search_scaled, + n_unique_kmers=len(query_mh), return_identity=True) + elif kwargs.get('do_max_containment'): + min_n_kmers = min(len(query_mh), len(match_mh)) + ani, ani_low, ani_high = containment_to_distance(score, ksize, search_scaled, + n_unique_kmers=min_n_kmers, return_identity=True) + else: + avg_n_kmers = round(len(query_mh) + len(match_mh)/2) + ani, ani_low, ani_high = jaccard_to_distance(score, ksize, search_scaled, + n_unique_kmers=avg_n_kmers, return_identity=True) + x.append(SearchResult(similarity=score, match=match, md5=match.md5sum(), @@ -204,7 +230,12 @@ def search_databases_with_flat_query(query, databases, **kwargs): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + ksize=ksize, + scaled=search_scaled, + estimated_ani=ani, + ani_ci_low=ani_low, + ani_ci_high=ani_high, )) return x @@ -228,7 +259,10 @@ def search_databases_with_abund_query(query, databases, **kwargs): results.sort(key=lambda x: -x[0]) x = [] + search_scaled=0 for (score, match, filename) in results: + if query.minhash.scaled: + search_scaled = min(query.minhash.scaled, match.minhash.scaled) x.append(SearchResult(similarity=score, match=match, md5=match.md5sum(), @@ -237,7 +271,12 @@ def search_databases_with_abund_query(query, databases, **kwargs): query=query, query_filename=query.filename, query_name=query.name, - query_md5=query.md5sum()[:8] + query_md5=query.md5sum()[:8], + ksize=query.minhash.ksize, + scaled=search_scaled, + estimated_ani=None, + ani_ci_low=None, + ani_ci_high=None, )) return x diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index fc2f0515ff..432554fc10 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -233,19 +233,13 @@ def test_jaccard_to_distance_scaled1(): dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) - print(f"{dist},{low},{high}") + print(f"({dist},{low},{high})") # check results - exp_dist, exp_low,exp_high = 0.019122659390482077,0.017549816764592427,0.02085752009333101 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist,low,high) == (0.019122659390482077,0.017549816764593173,0.020857520093330525) # return identity instead - exp_id, exp_idlow,exp_idhigh = 0.9808773406095179,0.979142479906669,0.9824501832354076 ident,low,high = 
jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers,return_identity=True) - print(f"{ident},{low},{high}") - assert ident == exp_id - assert low == exp_idlow - assert high == exp_idhigh + print(f"({ident},{low},{high})") + assert (ident,low, high) == (0.9808773406095179,0.9791424799066695,0.9824501832354068) def test_jaccard_to_distance_scaled100(): @@ -258,10 +252,7 @@ def test_jaccard_to_distance_scaled100(): print("CI:", low, " - ", high) print(f"{dist},{low},{high}") # check results - exp_dist, exp_low,exp_high = 0.019122659390482077,0.014156518319318126,0.025095471100086125 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist,low,high) == (0.019122659390482077,0.014156518319161603,0.02509547110001303) def test_jaccard_to_distance_k31(): @@ -272,12 +263,9 @@ def test_jaccard_to_distance_k31(): dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) - print(f"{dist},{low},{high}") + print(f"({dist},{low},{high})") # check results - exp_dist, exp_low,exp_high = 0.012994354410710174,0.009559840649526866,0.017172145712325716 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist, low, high) == (0.012994354410710174,0.009559840649526755,0.017172145712325677) def test_jaccard_to_distance_k31_2(): @@ -288,12 +276,9 @@ def test_jaccard_to_distance_k31_2(): dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) - print(f"{dist},{low},{high}") + print(f"({dist},{low},{high})") # check results - exp_dist, exp_low,exp_high = 0.0535071608231702,0.040739632227821516,0.06673746391115623 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist, low, high) == (0.0535071608231702,0.04073963222782166,0.0667374639111566) def test_jaccard_to_distance_confidence(): @@ -305,27 +290,21 @@ def test_jaccard_to_distance_confidence(): dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) - print(f"{dist},{low},{high}") + print(f"({dist},{low},{high})") # check results - exp_dist, exp_low,exp_high = 0.0535071608231702,0.03702518586582857,0.07080999238232429 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist,low,high) == (0.0535071608231702,0.03702518586582811,0.07080999238232542) confidence=0.90 dist,low,high = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) print("\nDIST:", dist) print("CI:", low, " - ", high) - print(f"{dist},{low},{high}") + print(f"({dist},{low},{high})") # check results - exp_dist, exp_low,exp_high = 0.0535071608231702,0.042708361726101415,0.0646280650023921 - assert dist == exp_dist - assert low == exp_low - assert high == exp_high + assert (dist,low,high) == (0.0535071608231702,0.04270836172610141,0.06462806500239282) def test_nkmers_to_bp(): - jaccard = 0.1 + jaccard = containment = 0.1 scaled = 100 bp_len = 10030 ksize=31 @@ -334,9 +313,18 @@ def test_nkmers_to_bp(): confidence=0.99 kmer_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) bp_dist = jaccard_to_distance(jaccard,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len) - print(f"\nkDIST:", kmer_dist) - print(f"\nbpDIST:",bp_dist) + print(f"\nk_jDIST:", kmer_dist) + print(f"\nbp_jDIST:",bp_dist) # check results - assert kmer_dist ==
(0.0535071608231702, 0.03702518586582857, 0.07080999238232429) - assert bp_dist == (0.0535071608231702, 0.03702518586582857, 0.07080999238232429) + assert kmer_dist == (0.0535071608231702, 0.03702518586582811, 0.07080999238232542) + assert bp_dist == (0.0535071608231702, 0.03702518586582811, 0.07080999238232542) assert kmer_dist==bp_dist + + kmer_cdist = containment_to_distance(containment,ksize,scaled,confidence=confidence,n_unique_kmers=nkmers) + bp_cdist = containment_to_distance(containment,ksize,scaled,confidence=confidence,sequence_len_bp=bp_len) + print(f"\nk_cDIST:", kmer_cdist) + print(f"\nbp_cDIST:",bp_cdist) + # check results + assert kmer_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341) + assert bp_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341) + assert kmer_cdist==bp_cdist diff --git a/tests/test_minhash.py b/tests/test_minhash.py index f00d544150..b84da843a0 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -2682,13 +2682,13 @@ def test_jaccard_ANI(): print("\nJACCARD_ANI 90% CI", mh1.jaccard_ani(mh2, confidence=0.9)) print("\nJACCARD_ANI 99% CI", mh1.jaccard_ani(mh2, confidence=0.99)) - assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) - assert mh1.jaccard_ani(mh2, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) - assert mh1.jaccard_ani(mh2, confidence=0.99) == (0.9783711630110239, 0.9774056164150092, 0.9793173653983135) + assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) + assert mh1.jaccard_ani(mh2, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) + assert mh1.jaccard_ani(mh2, confidence=0.99) == (0.9783711630110239, 0.9774056164150094, 0.9793173653983231) # precalc jaccard and assert same result jaccard = mh1.jaccard(mh2) print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) - assert mh1.jaccard_ani(mh2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) - assert mh1.jaccard_ani(mh2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) + assert mh1.jaccard_ani(mh2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) + assert mh1.jaccard_ani(mh2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) diff --git a/tests/test_signature.py b/tests/test_signature.py index 66245c536c..805ad917d9 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -491,13 +491,13 @@ def test_jaccard_ANI(): print("\nJACCARD_ANI 90% CI", ss1.jaccard_ani(ss2, confidence=0.9)) print("\nJACCARD_ANI 99% CI", ss1.jaccard_ani(ss2, confidence=0.99)) - assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132324, 0.9790929734698981) - assert ss1.jaccard_ani(ss2, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) - assert ss1.jaccard_ani(ss2, confidence=0.99) == (0.9783711630110239, 0.9774056164150092, 0.9793173653983135) + assert ss1.jaccard_ani(ss2) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) + assert ss1.jaccard_ani(ss2, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) + assert ss1.jaccard_ani(ss2, confidence=0.99) == (0.9783711630110239, 0.9774056164150094, 0.9793173653983231) # precalc jaccard and assert same result jaccard = ss1.jaccard(ss2) print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard)) - assert ss1.jaccard_ani(ss2, jaccard=jaccard) == 
(0.9783711630110239, 0.9776381521132324, 0.9790929734698981) - assert ss1.jaccard_ani(ss2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812513, 0.97897770829732) + assert ss1.jaccard_ani(ss2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) + assert ss1.jaccard_ani(ss2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 6fa0f73c61..22a3177b66 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1551,11 +1551,18 @@ def test_search_containment_abund_ignore(runtmp): r = csv.DictReader(fp) row = next(r) similarity = row['similarity'] + estimated_ani = row['estimated_ani'] print(f'search output: similarity is {similarity}') + print(f'search output: ani is {estimated_ani}') print(mh1.contained_by(mh2)) assert float(similarity) == mh1.contained_by(mh2) assert float(similarity) == 0.25 + + print(runtmp.last_result.err) + assert "WARNING: Cannot estimate ANI. Are your minhashes big enough?" in runtmp.last_result.err + assert "Error: varN <0.0!" in runtmp.last_result.err + assert estimated_ani == "" def test_search_containment_sbt(runtmp): @@ -5206,3 +5213,117 @@ def test_gather_scaled_1(runtmp, linear_gather, prefetch_gather): assert "1.0 kbp 100.0% 100.0%" in runtmp.last_result.out assert "found 1 matches total;" in runtmp.last_result.out + + +@utils.in_tempdir +def test_search_ani_jaccard(c): + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', testdata1, testdata2) + + c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = c.output('xxx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.9288577154308617 + assert row['filename'].endswith('short2.fa.sig') + assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' + assert row['query_filename'].endswith('short.fa') + assert row['query_name'] == '' + assert row['query_md5'] == '9191284a' + assert row['estimated_ani'] == "0.9987884602947684" + + +@utils.in_tempdir +def test_search_ani_empty_abund(c): + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=10,abund', testdata1, testdata2) + + c.run_sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = c.output('xxx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.8224046424612483 + assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' + assert row['filename'].endswith('short2.fa.sig') + assert row['query_filename'].endswith('short.fa') + assert row['query_name'] == '' + assert row['query_md5'] == 'b5cc464c' + assert row['estimated_ani'] == "" + + +@utils.in_tempdir +def test_search_ani_containment(c): + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', testdata1, testdata2) + + c.run_sourmash('search', '--containment', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = 
c.output('xxx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.9556701030927836 + assert row['filename'].endswith('short2.fa.sig') + assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' + assert row['query_filename'].endswith('short.fa') + assert row['query_name'] == '' + assert row['query_md5'] == '9191284a' + assert row['estimated_ani'] == "0.9985384076863009" + + # search other direction + c.run_sourmash('search', '--containment', 'short2.fa.sig', 'short.fa.sig', '-o', 'xxxx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = c.output('xxxx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.9706806282722513 + assert row['filename'].endswith('short.fa.sig') + assert row['md5'] == '9191284a3a23a913d8d410f3d53ce8f0' + assert row['query_filename'].endswith('short2.fa') + assert row['query_name'] == '' + assert row['query_md5'] == 'bf752903' + assert row['estimated_ani'] == "0.9990405323606487" + + +@utils.in_tempdir +def test_search_ani_max_containment(c): + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', testdata1, testdata2) + + c.run_sourmash('search', '--max-containment', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = c.output('xxx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.9706806282722513 + assert row['filename'].endswith('short2.fa.sig') + assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' + assert row['query_filename'].endswith('short.fa') + assert row['query_name'] == '' + assert row['query_md5'] == '9191284a' + assert row['estimated_ani'] == "0.9990405323606487" \ No newline at end of file
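This patch also guards the CI solvers in `distance_utils` with a try/except: on very small sketches `var_n_mutated` can raise ValueError (`Error: varN <0.0!`), in which case the functions warn and return `(None, None, None)` instead of crashing the search. A small sketch of direct use of the estimators; the parameter values here are illustrative only:

    from sourmash.distance_utils import (containment_to_distance,
                                         jaccard_to_distance)

    # containment of 0.9 at k=31, scaled=1000, ~10000 unique k-mers;
    # return_identity=True converts the distance estimate into ANI
    ani, ci_low, ci_high = containment_to_distance(0.9, 31, 1000,
                                                   n_unique_kmers=10000,
                                                   return_identity=True)

    # same pattern for jaccard; if the variance formula fails, expect the
    # "WARNING: Cannot estimate ANI..." message and (None, None, None)
    j_ani, j_low, j_high = jaccard_to_distance(0.9, 31, 1000,
                                               n_unique_kmers=10000,
                                               return_identity=True)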
Tessa Pierce" Date: Tue, 25 Jan 2022 16:08:37 -0800 Subject: [PATCH 30/38] search_scaled --- src/sourmash/search.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 150e3c3fbf..8ea818d92e 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -164,7 +164,7 @@ def collect(self, score, match): SearchResult = namedtuple('SearchResult', ['similarity', 'match', 'md5', 'filename', 'name', 'query', 'query_filename', 'query_name', 'query_md5', - 'ksize', 'scaled', 'estimated_ani', 'ani_ci_low', 'ani_ci_high']) + 'ksize', 'search_scaled', 'estimated_ani', 'ani_ci_low', 'ani_ci_high']) def format_bp(bp): @@ -205,7 +205,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): for (score, match, filename) in results: match_mh = match.minhash if scaled: # if scaled, we can get ANI estimates - search_scaled = min(query_mh.scaled, match_mh.scaled) + search_scaled = max(query_mh.scaled, match_mh.scaled) if search_scaled > query_mh.scaled: query_mh = query_mh.downsample(scaled=search_scaled) if search_scaled > match_mh.scaled: @@ -218,7 +218,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): ani, ani_low, ani_high = containment_to_distance(score, ksize, search_scaled, n_unique_kmers=min_n_kmers, return_identity=True) else: - avg_n_kmers = round(len(query_mh) + len(match_mh)/2) + avg_n_kmers = round((len(query_mh) + len(match_mh))/2) ani, ani_low, ani_high = jaccard_to_distance(score, ksize, search_scaled, n_unique_kmers=avg_n_kmers, return_identity=True) @@ -232,7 +232,7 @@ def search_databases_with_flat_query(query, databases, **kwargs): query_name=query.name, query_md5=query.md5sum()[:8], ksize=ksize, - scaled=search_scaled, + search_scaled=search_scaled, estimated_ani=ani, ani_ci_low=ani_low, ani_ci_high=ani_high, @@ -273,7 +273,7 @@ def search_databases_with_abund_query(query, databases, **kwargs): query_name=query.name, query_md5=query.md5sum()[:8], ksize=query.minhash.ksize, - scaled=search_scaled, + search_scaled=search_scaled, estimated_ani=None, ani_ci_low=None, ani_ci_high=None, From 3b9bfe71f8e7ce6a85eb2a0f80ca080ee72599af Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 25 Jan 2022 16:10:30 -0800 Subject: [PATCH 31/38] wade into downsampling --- tests/test_sourmash.py | 48 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 22a3177b66..08b55e81f8 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5326,4 +5326,50 @@ def test_search_ani_max_containment(c): assert row['query_filename'].endswith('short.fa') assert row['query_name'] == '' assert row['query_md5'] == '9191284a' - assert row['estimated_ani'] == "0.9990405323606487" \ No newline at end of file + assert row['estimated_ani'] == "0.9990405323606487" + + +@utils.in_tempdir +def test_search_jaccard_ani_downsample(c): + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=2', '--force', testdata1) + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '--force', testdata1) + sig1 = sourmash.load_one_signature("short.fa.sig") + sig2 = sourmash.load_one_signature("short2.fa.sig") + print(f"SCALED: sig1: {sig1.minhash.scaled}, sig2: {sig2.minhash.scaled}") # if don't change name, just reads prior sigfile!!? 
+ + sig1F = c.output('sig1.sig') + sig2F = c.output('sig2.sig') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=2', '--force', testdata1, '-o', sig1F) + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '--force', testdata2, '-o', sig2F) + + sig1 = sourmash.load_one_signature(sig1F) + sig2 = sourmash.load_one_signature(sig2F) + print(f"SCALED: sig1: {sig1.minhash.scaled}, sig2: {sig2.minhash.scaled}") + + c.run_sourmash('search', sig1F, sig2F, '-o', 'xdx.csv') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + csv_file = c.output('xdx.csv') + + with open(csv_file) as fp: + reader = csv.DictReader(fp) + row = next(reader) + print(row) + assert float(row['similarity']) == 0.9296066252587992 + assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' + assert row['filename'].endswith('sig2.sig') + assert row['query_filename'].endswith('short.fa') + assert row['query_name'] == '' + assert row['query_md5'] == '8f74b0b8' + assert row['estimated_ani'] == "0.9988019200011651" + + #downsample manually and assert same ANI + mh1 = sig1.minhash + mh2 = sig2.minhash + mh2_sc2 = mh2.downsample(scaled=mh1.scaled) + print("SCALED:", mh1.scaled, mh2_sc2.scaled) + ani= mh1.jaccard_ani(mh2_sc2) + print(ani) + assert ani == (0.9988019200011651, 0.9980440877843673, 0.9991807844672299) \ No newline at end of file From fdec49b7860f83b4d6b257e66c9453e56a3daf33 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 25 Jan 2022 16:41:14 -0800 Subject: [PATCH 32/38] dist comment --- src/sourmash/distance_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index ec95ff868e..c7230e61a9 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -30,7 +30,7 @@ def var_n_mutated(L,k,r1,q=None): varN = L*(1-q)*(q*(2*k+(2/r1)-1)-2*k) \ + k*(k-1)*(1-q)**2 \ + (2*(1-q)/(r1**2))*((1+(k-1)*(1-q))*r1-q) - if (varN<0.0): + if (varN<0.0): # this seems to happen only with super tiny test data raise ValueError('Error: varN <0.0!') return float(varN) From 8e268f02fce2b892ed5c7a3b41e3db64e7f78deb Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Wed, 2 Feb 2022 11:06:22 -0800 Subject: [PATCH 33/38] handle downsampling in minhash, signature ANI fns --- src/sourmash/minhash.py | 49 +++++++++++++++++++-------- src/sourmash/signature.py | 56 +++++++++++++++++++++---------- tests/test_minhash.py | 68 ++++++++++++++++++++++++++++++++++++++ tests/test_signature.py | 69 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 210 insertions(+), 32 deletions(-) diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 5b186c5663..21b065aeb0 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -649,11 +649,18 @@ def jaccard(self, other, downsample=False): def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): "Calculate Jaccard --> ANI of two MinHash objects." 
- if not jaccard: - jaccard = self.jaccard(other, downsample=downsample) - avg_scaled_kmers = round((len(self) + len(other))/2) - avg_n_kmers = avg_scaled_kmers * self.scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self.ksize, self.scaled, + self_mh = self + other_mh = other + scaled = self.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.downsample(scaled=scaled) + other_mh = other.downsample(scaled=scaled) + if jaccard is None: + jaccard = self_mh.similarity(other_mh, ignore_abundance=True) + avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2) + avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self_mh.ksize, scaled, n_unique_kmers=avg_n_kmers, confidence=confidence, return_identity=True) return j_ani, ani_low, ani_high @@ -697,10 +704,17 @@ def contained_by(self, other, downsample=False): def containment_ani(self, other, downsample=False, containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash." - if not containment: - containment = self.contained_by(other, downsample) - n_kmers = len(self) * self.scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, self.ksize, self.scaled, + self_mh = self + other_mh = other + scaled = self.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.downsample(scaled=scaled) + other_mh = other.downsample(scaled=scaled) + if containment is None: + containment = self_mh.contained_by(other_mh) + n_kmers = len(self_mh) * scaled # would be better if hll estimate + c_ani,ani_low,ani_high = containment_to_distance(containment, self_mh.ksize, self_mh.scaled, n_unique_kmers=n_kmers, confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high @@ -719,11 +733,18 @@ def max_containment(self, other, downsample=False): def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash." - if not max_containment: - max_containment = self.max_containment(other, downsample) - min_n_kmers = min(len(self), len(other)) - n_kmers = min_n_kmers * self.scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, self.ksize, self.scaled, + self_mh = self + other_mh = other + scaled = self.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.downsample(scaled=scaled) + other_mh = other.downsample(scaled=scaled) + if max_containment is None: + max_containment = self_mh.max_containment(other_mh) + min_n_kmers = min(len(self_mh), len(other_mh)) + n_kmers = min_n_kmers * scaled + c_ani,ani_low,ani_high = containment_to_distance(max_containment, self_mh.ksize, scaled, n_unique_kmers=n_kmers,confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py index c3d5d9b214..7d308803de 100644 --- a/src/sourmash/signature.py +++ b/src/sourmash/signature.py @@ -145,13 +145,19 @@ def jaccard(self, other): def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95): "Compute Jaccard similarity with the other MinHash signature." 
- if not jaccard: - jaccard = self.minhash.similarity(other.minhash, ignore_abundance=True, - downsample=downsample) - avg_scaled_kmers = round((len(self.minhash.hashes) + len(other.minhash.hashes))/2) - avg_n_kmers = avg_scaled_kmers * self.minhash.scaled # would be better if hll estimate - j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self.minhash.ksize, - self.minhash.scaled, n_unique_kmers=avg_n_kmers, + self_mh = self.minhash + other_mh = other.minhash + scaled = self_mh.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.minhash.downsample(scaled=scaled) + other_mh = other.minhash.downsample(scaled=scaled) + if jaccard is None: + jaccard = self_mh.similarity(other_mh, ignore_abundance=True) + avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2) + avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate + j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self_mh.ksize, + scaled, n_unique_kmers=avg_n_kmers, confidence=confidence, return_identity=True) return j_ani, ani_low, ani_high @@ -161,11 +167,18 @@ def contained_by(self, other, downsample=False): def containment_ani(self, other, downsample=False, containment=None, confidence=0.95): "Estimate ANI from containment with the other MinHash signature." - if not containment: - containment = self.minhash.contained_by(other.minhash, downsample) - n_kmers = len(self.minhash.hashes) * self.minhash.scaled # would be better if hll estimate - c_ani,ani_low,ani_high = containment_to_distance(containment, self.minhash.ksize, - self.minhash.scaled, n_unique_kmers=n_kmers, + self_mh = self.minhash + other_mh = other.minhash + scaled = self_mh.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.minhash.downsample(scaled=scaled) + other_mh = other.minhash.downsample(scaled=scaled) + if containment is None: + containment = self_mh.contained_by(other_mh) + n_kmers = len(self_mh) * scaled # would be better if hll estimate + c_ani,ani_low,ani_high = containment_to_distance(containment, self_mh.ksize, + scaled, n_unique_kmers=n_kmers, confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high @@ -175,13 +188,20 @@ def max_containment(self, other, downsample=False): def max_containment_ani(self, other, downsample=False, max_containment=None, confidence=0.95): "Estimate ANI from max containment w/other signature. Note: ignores abundance." 
- if not max_containment: - max_containment = self.minhash.max_containment(other.minhash, downsample) + self_mh = self.minhash + other_mh = other.minhash + scaled = self_mh.scaled + if downsample: + scaled = max(self_mh.scaled, other_mh.scaled) + self_mh = self.minhash.downsample(scaled=scaled) + other_mh = other.minhash.downsample(scaled=scaled) + if max_containment is None: + max_containment = self_mh.max_containment(other_mh) # max_containment will always use smaller sig as denominator - min_n_kmers = min(len(self.minhash.hashes), len(other.minhash.hashes)) - n_kmers = min_n_kmers * self.minhash.scaled - c_ani,ani_low,ani_high = containment_to_distance(max_containment, self.minhash.ksize, - self.minhash.scaled, n_unique_kmers=n_kmers, + min_n_hashes = min(len(self_mh), len(other_mh)) + n_kmers = min_n_hashes * scaled + c_ani,ani_low,ani_high = containment_to_distance(max_containment, self_mh.ksize, + scaled, n_unique_kmers=n_kmers, confidence=confidence, return_identity=True) return c_ani, ani_low, ani_high diff --git a/tests/test_minhash.py b/tests/test_minhash.py index b84da843a0..69e6c99edb 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -2654,6 +2654,14 @@ def test_containment_ANI(): assert mh2.containment_ani(mh3, confidence=0.9) == (0.9866751346467802, 0.986241913154108, 0.9870974232542815) assert mh3.containment_ani(mh2, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) + +def test_containment_ANI_precalc_containment(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + mh2 = sourmash.load_one_signature(f2, ksize=31).minhash + mh3 = sourmash.load_one_signature(f3, ksize=31).minhash # precalc containments and assert same results s1c = mh1.contained_by(mh2) s2c = mh2.contained_by(mh1) @@ -2672,6 +2680,38 @@ def test_containment_ANI(): assert mh3.max_containment_ani(mh2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) +def test_containment_ANI_downsample(): + f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') + mh2 = sourmash.load_one_signature(f2, ksize=31).minhash + mh3 = sourmash.load_one_signature(f3, ksize=31).minhash + # check that downsampling works properly + print(mh2.scaled) + mh2 = mh2.downsample(scaled=2000) + assert mh2.scaled != mh3.scaled + ds_s3c = mh2.containment_ani(mh3, downsample=True) + ds_s4c = mh3.containment_ani(mh2, downsample=True) + mc_w_ds_1 = mh2.max_containment_ani(mh3, downsample=True) + mc_w_ds_2 = mh3.max_containment_ani(mh2, downsample=True) + + with pytest.raises(ValueError) as e: + mh2.containment_ani(mh3) + assert "ValueError: mismatch in scaled; comparison fail" in e + + with pytest.raises(ValueError) as e: + mh2.max_containment_ani(mh3) + assert "ValueError: mismatch in scaled; comparison fail" in e + + mh3 = mh3.downsample(scaled=2000) + assert mh2.scaled == mh3.scaled + ds_s3c_manual = mh2.containment_ani(mh3) + ds_s4c_manual = mh3.containment_ani(mh2) + ds_mc_manual = mh2.max_containment_ani(mh3) + assert ds_s3c == ds_s3c_manual + assert ds_s4c == ds_s4c_manual + assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual + + def test_jaccard_ANI(): f1 = utils.get_test_data('2.fa.sig') f2 = utils.get_test_data('2+63.fa.sig') @@ -2686,9 +2726,37 @@ def test_jaccard_ANI(): assert mh1.jaccard_ani(mh2, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) assert mh1.jaccard_ani(mh2, 
confidence=0.99) == (0.9783711630110239, 0.9774056164150094, 0.9793173653983231) + +def test_jaccard_ANI_precalc_jaccard(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + mh2 = sourmash.load_one_signature(f2).minhash # precalc jaccard and assert same result jaccard = mh1.jaccard(mh2) print("\nJACCARD_ANI", mh1.jaccard_ani(mh2,jaccard=jaccard)) assert mh1.jaccard_ani(mh2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) assert mh1.jaccard_ani(mh2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) + + +def test_jaccard_ANI_downsample(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + mh2 = sourmash.load_one_signature(f2).minhash + + print(mh1.scaled) + mh1 = mh1.downsample(scaled=2000) + assert mh1.scaled != mh2.scaled + with pytest.raises(ValueError) as e: + mh1.jaccard_ani(mh2) + assert "ValueError: mismatch in scaled; comparison fail" in e + + ds_s1c = mh1.jaccard_ani(mh2, downsample=True) + ds_s2c = mh2.jaccard_ani(mh1, downsample=True) + + mh2 = mh2.downsample(scaled=2000) + assert mh1.scaled == mh2.scaled + ds_j_manual = mh1.jaccard_ani(mh2) + assert ds_s1c == ds_s2c == ds_j_manual diff --git a/tests/test_signature.py b/tests/test_signature.py index 805ad917d9..41affb9c2f 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -463,6 +463,15 @@ def test_containment_ANI(): assert ss2.containment_ani(ss3, confidence=0.9) == (0.9866751346467802, 0.986241913154108, 0.9870974232542815) assert ss3.containment_ani(ss2, confidence=0.99) == (0.9868883523107224, 0.9862092287269876, 0.987540474733798) + +def test_containment_ANI_precalc_containment(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') + ss1 = load_one_signature(f1, ksize=31) + ss2 = load_one_signature(f2, ksize=31) + ss3 = load_one_signature(f3, ksize=31) + # precalc containments and assert same results s1c = ss1.contained_by(ss2) s2c = ss2.contained_by(ss1) @@ -481,6 +490,38 @@ def test_containment_ANI(): assert ss3.max_containment_ani(ss2, max_containment=s4c) == (0.9868883523107224, 0.986374049720872, 0.9873870188726516) +def test_containment_ANI_downsample(): + f2 = utils.get_test_data('2+63.fa.sig') + f3 = utils.get_test_data('47+63.fa.sig') + ss2 = load_one_signature(f2, ksize=31) + ss3 = load_one_signature(f3, ksize=31) + # check that downsampling works properly + print(ss2.minhash.scaled) + ss2.minhash = ss2.minhash.downsample(scaled=2000) + assert ss2.minhash.scaled != ss3.minhash.scaled + ds_s3c = ss2.containment_ani(ss3, downsample=True) + ds_s4c = ss3.containment_ani(ss2, downsample=True) + mc_w_ds_1 = ss2.max_containment_ani(ss3, downsample=True) + mc_w_ds_2 = ss3.max_containment_ani(ss2, downsample=True) + + with pytest.raises(ValueError) as e: + ss2.containment_ani(ss3) + assert "ValueError: mismatch in scaled; comparison fail" in e + + with pytest.raises(ValueError) as e: + ss2.max_containment_ani(ss3) + assert "ValueError: mismatch in scaled; comparison fail" in e + + ss3.minhash = ss3.minhash.downsample(scaled=2000) + assert ss2.minhash.scaled == ss3.minhash.scaled + ds_s3c_manual = ss2.containment_ani(ss3) + ds_s4c_manual = ss3.containment_ani(ss2) + ds_mc_manual = ss2.max_containment_ani(ss3) + assert ds_s3c == ds_s3c_manual + assert ds_s4c == ds_s4c_manual + 
assert mc_w_ds_1 == mc_w_ds_2 == ds_mc_manual + + def test_jaccard_ANI(): f1 = utils.get_test_data('2.fa.sig') f2 = utils.get_test_data('2+63.fa.sig') @@ -495,9 +536,37 @@ def test_jaccard_ANI(): assert ss1.jaccard_ani(ss2, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) assert ss1.jaccard_ani(ss2, confidence=0.99) == (0.9783711630110239, 0.9774056164150094, 0.9793173653983231) + +def test_jaccard_ANI_precalc_jaccard(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + ss1 = load_one_signature(f1, ksize=31) + ss2 = load_one_signature(f2) # precalc jaccard and assert same result jaccard = ss1.jaccard(ss2) print("\nJACCARD_ANI", ss1.jaccard_ani(ss2,jaccard=jaccard)) assert ss1.jaccard_ani(ss2, jaccard=jaccard) == (0.9783711630110239, 0.9776381521132318, 0.9790929734698974) assert ss1.jaccard_ani(ss2, jaccard=jaccard, confidence=0.9) == (0.9783711630110239, 0.9777567290812516, 0.9789777082973189) + + +def test_jaccard_ANI_downsample(): + f1 = utils.get_test_data('2.fa.sig') + f2 = utils.get_test_data('2+63.fa.sig') + ss1 = load_one_signature(f1, ksize=31) + ss2 = load_one_signature(f2) + # check that downsampling works properly + print(ss2.minhash.scaled) + ss1.minhash = ss1.minhash.downsample(scaled=2000) + assert ss1.minhash.scaled != ss2.minhash.scaled + ds_s1c = ss1.jaccard_ani(ss2, downsample=True) + ds_s2c = ss2.jaccard_ani(ss1, downsample=True) + + with pytest.raises(ValueError) as e: + ss1.jaccard_ani(ss2) + assert "ValueError: mismatch in scaled; comparison fail" in e + + ss2.minhash = ss2.minhash.downsample(scaled=2000) + assert ss1.minhash.scaled == ss2.minhash.scaled + ds_j_manual = ss1.jaccard_ani(ss2) + assert ds_s1c == ds_s2c == ds_j_manual From 727ee7ffacbfb6e35788ee5202d0753b0fb87284 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 8 Feb 2022 16:21:26 -0800 Subject: [PATCH 34/38] whoops, fix rogue test --- tests/test_sourmash.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 403f64c50f..17bd7293ac 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1580,7 +1580,7 @@ def test_search_containment_abund_ignore(runtmp): assert float(similarity) == mh1.contained_by(mh2) assert float(similarity) == 0.25 - + print(runtmp.last_result.err) assert "WARNING: Cannot estimate ANI. Are your minhashes big enough?" in runtmp.last_result.err assert "Error: varN <0.0!" 
in runtmp.last_result.err @@ -5256,7 +5256,7 @@ def test_search_ani_jaccard(c): reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.9288577154308617 + assert float(row['similarity']) == 0.9288577154308617 assert row['filename'].endswith('short2.fa.sig') assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' assert row['query_filename'].endswith('short.fa') @@ -5280,7 +5280,7 @@ def test_search_ani_empty_abund(c): reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.8224046424612483 + assert float(row['similarity']) == 0.8224046424612483 assert row['md5'] == 'c9d5a795eeaaf58e286fb299133e1938' assert row['filename'].endswith('short2.fa.sig') assert row['query_filename'].endswith('short.fa') @@ -5304,7 +5304,7 @@ def test_search_ani_containment(c): reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.9556701030927836 + assert float(row['similarity']) == 0.9556701030927836 assert row['filename'].endswith('short2.fa.sig') assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' assert row['query_filename'].endswith('short.fa') @@ -5315,14 +5315,14 @@ def test_search_ani_containment(c): # search other direction c.run_sourmash('search', '--containment', 'short2.fa.sig', 'short.fa.sig', '-o', 'xxxx.csv') print(c.last_result.status, c.last_result.out, c.last_result.err) - + csv_file = c.output('xxxx.csv') with open(csv_file) as fp: reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.9706806282722513 + assert float(row['similarity']) == 0.9706806282722513 assert row['filename'].endswith('short.fa.sig') assert row['md5'] == '9191284a3a23a913d8d410f3d53ce8f0' assert row['query_filename'].endswith('short2.fa') @@ -5346,7 +5346,7 @@ def test_search_ani_max_containment(c): reader = csv.DictReader(fp) row = next(reader) print(row) - assert float(row['similarity']) == 0.9706806282722513 + assert float(row['similarity']) == 0.9706806282722513 assert row['filename'].endswith('short2.fa.sig') assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db' assert row['query_filename'].endswith('short.fa') @@ -5359,10 +5359,12 @@ def test_search_ani_max_containment(c): def test_search_jaccard_ani_downsample(c): testdata1 = utils.get_test_data('short.fa') testdata2 = utils.get_test_data('short2.fa') - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=2', '--force', testdata1) - c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '--force', testdata1) - sig1 = sourmash.load_one_signature("short.fa.sig") - sig2 = sourmash.load_one_signature("short2.fa.sig") + sig1_out = c.output('short.fa.sig') + sig2_out = c.output('short2.fa.sig') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=2', '--force', testdata1, '-o', sig1_out) + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '--force', testdata1, '-o', sig2_out) + sig1 = sourmash.load_one_signature(sig1_out) + sig2 = sourmash.load_one_signature(sig2_out) print(f"SCALED: sig1: {sig1.minhash.scaled}, sig2: {sig2.minhash.scaled}") # if don't change name, just reads prior sigfile!!? 
sig1F = c.output('sig1.sig') @@ -5398,4 +5400,5 @@ def test_search_jaccard_ani_downsample(c): print("SCALED:", mh1.scaled, mh2_sc2.scaled) ani= mh1.jaccard_ani(mh2_sc2) print(ani) - assert ani == (0.9988019200011651, 0.9980440877843673, 0.9991807844672299) \ No newline at end of file + assert ani == (0.9988019200011651, 0.9980440877843673, 0.9991807844672298) + From e579e3ad5b2173f2ff161c4da2f379aaebaefeab Mon Sep 17 00:00:00 2001 From: Mahmudur Rahman Hera Date: Sat, 12 Mar 2022 17:15:24 -0500 Subject: [PATCH 35/38] Add dist est (#1860) * enabled running main * added point estimate and error bound * added docstring * added usage code * added nothing_common code, usage, docstrings, tested --- src/sourmash/distance_utils.py | 94 ++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index c7230e61a9..c2f2f74784 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -7,8 +7,15 @@ from scipy.stats import norm as scipy_norm from scipy.special import hyp2f1 from numpy import sqrt +from math import log, exp -from .logging import notify +def show_error(msg): + print(msg) + +try: + from .logging import notify +except: + notify = show_error #FROM mrcc.kmer_mutation_formulas_thm5 def r1_to_q(k,r1): @@ -22,7 +29,7 @@ def var_n_mutated(L,k,r1,q=None): # by the use of higher-precision arithmetic; the problem occurs when r is # very small; for example, with L=10,k=2,r1=1e-6 standard precision # gives varN<0 which is nonsense; by using the mpf type, we get the correct - # answer which is about 0.000038. + # answer which is about 0.000038. if (r1 == 0): return 0.0 r1 = float(r1) if (q == None): # we assume that if q is provided, it is correct for r1 @@ -166,11 +173,90 @@ def distance_to_identity(dist,d_low=None,d_high=None): """ for d in [dist,d_low,d_high]: if not 0 <= d <= 1: - raise ValueError(f"Error: distance value {d} is not between 0 and 1!") + raise ValueError("Error: distance value {d} is not between 0 and 1!") id = 1-dist id_low,id_high=None,None if d_low is not None: # need to be explicit so will work on 0 value id_high = 1-d_low if d_high is not None: # need to be explicit so will work on 0 value id_low = 1-d_high - return id,id_low,id_high \ No newline at end of file + return id,id_low,id_high + + +def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers): + """Given parameters, calculate point estimate for mutation rate from jaccard index. + First checks if parameters are valid (checks are not exhaustive). Then uses formulas + derived mathematically to compute the point estimate. The formula uses approximations, + therefore a tiny error is associated with it. A lower bound of that error is also returned. + A high error indicates that the point estimate cannot be trusted. Threshold of the error + is open to interpretation, but suggested that > 10^-4 should be handled with caution. + + Note that the error is NOT a mutation rate, and therefore cannot be considered in + something like mut.rate +/- error. 
+ + Arguments: jaccard, ksize, scaled, n_unique_kmers + Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error) + """ + assert jaccard >= 0.0 and jaccard <= 1.0 and ksize >= 1 + + point_estimate = 1.0 - ( 2.0 * jaccard / float(1+jaccard) ) ** (1.0/float(ksize)) + + exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate) + var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate) + error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3 + + return point_estimate, error_lower_bound + + +def get_expected_log_probability(L, k, p, s): + '''helper function + ''' + exp_nmut = exp_n_mutated(L, k, p) + try: + return (L - exp_nmut) * log(1.0 - s) + except: + return float('-inf') + +def get_exp_probability_nothing_common(n_unique_kmers, ksize, mutation_rate, scaled): + '''Given parameters, calculate the expected probability that nothing will be common + between a fracminhash sketch of an original sequence and a fracminhash sketch of a mutated + sequence. If this is above a threshold, we should suspect that the two sketches may have + nothing in common. The threshold needs to be set with proper insights. + + Arguments: n_unique_kmers, ksize, mutation_rate, scaled + Returns: float - expected likelihood that nothing is common between sketches + ''' + assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1 + L = n_unique_kmers + k = ksize + p = mutation_rate + s = 1.0 / float(scaled) + if p == 1.0: + return 1.0 + return exp( get_expected_log_probability(L, k, p, s) ) + + +if __name__ == '__main__': + jaccard = 0.9 + ksize = 21 + scaled = 10 + n_unique_kmers = 100000 + + # jaccard_to_distance_point_estimate usage + mut_rate, err = jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers) + print('Point estimate is: ' + str(mut_rate)) + if err > 10.0**(-4.0): + print('Cannot trust this point estimate!') + + # get_exp_probability_nothing_common usage + ksize = 21 + scaled = 1000 + n_unique_kmers = 10000000 + mutation_rate = 0.3 + threshold = 10.0**(-3) + + exp_probability_no_common = get_exp_probability_nothing_common(n_unique_kmers, + ksize, mutation_rate, scaled) + print(exp_probability_no_common) + if (exp_probability_no_common >= threshold): + print('There could be cases where nothing common between sketches may happen!')
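The closed form behind `get_exp_probability_nothing_common` is worth spelling out: each of the roughly L - E[N_mut] unmutated k-mers survives FracMinHash sampling with probability s, so under the patch's independence approximation the chance that nothing is shared is (1 - s)^(L - E[N_mut]), which the helper evaluates in log space to avoid underflow. A short sketch of that identity, using the same illustrative values as the `__main__` demo above:

    from math import exp, log

    L, k, p, s = 10000000, 21, 0.3, 1.0 / 1000
    q = 1 - (1 - p) ** k      # probability that a single k-mer is mutated
    exp_n_mut = L * q         # expected number of mutated k-mers
    # probability that no unmutated k-mer survives scaled sampling:
    prob_nothing_common = exp((L - exp_n_mut) * log(1 - s))
    print(prob_nothing_common)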
Tessa Pierce" Date: Sat, 12 Mar 2022 14:36:56 -0800 Subject: [PATCH 36/38] enable just point estimate for distance_to_identity --- src/sourmash/distance_utils.py | 46 ++++++++++++++++++---------------- tests/test_distance_utils.py | 5 ++++ 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index c2f2f74784..34e29954a7 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -9,13 +9,8 @@ from numpy import sqrt from math import log, exp -def show_error(msg): - print(msg) +from .logging import notify -try: - from .logging import notify -except: - notify = show_error #FROM mrcc.kmer_mutation_formulas_thm5 def r1_to_q(k,r1): @@ -171,19 +166,20 @@ def distance_to_identity(dist,d_low=None,d_high=None): """ ANI = 1-distance """ - for d in [dist,d_low,d_high]: - if not 0 <= d <= 1: - raise ValueError("Error: distance value {d} is not between 0 and 1!") - id = 1-dist + if not 0 <= dist <= 1: + raise ValueError(f"Error: distance value {dist} is not between 0 and 1!") + ident = 1-dist + result = ident id_low,id_high=None,None - if d_low is not None: # need to be explicit so will work on 0 value - id_high = 1-d_low - if d_high is not None: # need to be explicit so will work on 0 value - id_low = 1-d_high - return id,id_low,id_high + if d_low is not None and d_high is not None: + if (0<=d_low<=1) and (0<=d_high<=1): + id_high = 1-d_low + id_low = 1-d_high + result = [ident, id_low, id_high] + return result -def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers): +def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False): """Given parameters, calculate point estimate for mutation rate from jaccard index. First checks if parameters are valid (checks are not exhaustive). Then uses formulas derived mathematically to compute the point estimate. 
The formula uses approximations, @@ -197,13 +193,21 @@ def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers): Arguments: jaccard, ksize, scaled, n_unique_kmers Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error) """ - assert jaccard >= 0.0 and jaccard <= 1.0 and ksize >= 1 + if sequence_len_bp and not n_unique_kmers: + n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) + if jaccard <= 0.0001: + point_estimate = 1.0 + elif jaccard >= 0.9999: + point_estimate = 0.0 + else: + point_estimate = 1.0 - ( 2.0 * jaccard / float(1+jaccard) ) ** (1.0/float(ksize)) - point_estimate = 1.0 - ( 2.0 * jaccard / float(1+jaccard) ) ** (1.0/float(ksize)) + exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate) + var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate) + error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3 - exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate) - var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate) - error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3 + if return_identity: + point_estimate = distance_to_identity(point_estimate) return point_estimate, error_lower_bound diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py index 432554fc10..adf1093571 100644 --- a/tests/test_distance_utils.py +++ b/tests/test_distance_utils.py @@ -12,6 +12,11 @@ def test_distance_to_identity(): assert id_high ==0.6 +def test_distance_to_identity_just_point_estimate(): + id = distance_to_identity(0.4) + assert id == 0.6 + + def test_distance_to_identity_fail(): with pytest.raises(Exception) as exc: id,id_low,id_high = distance_to_identity(1.1,0.4,0.6) From 58f2a07c72cfe7e046e891338580ce0f3a760753 Mon Sep 17 00:00:00 2001 From: "N. 
Tessa Pierce" Date: Sat, 12 Mar 2022 16:04:00 -0800 Subject: [PATCH 37/38] save in progress changes --- src/sourmash/distance_utils.py | 149 +++++++++++++++++---------------- tests/test_distance_utils.py | 49 +++++++++-- 2 files changed, 118 insertions(+), 80 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 34e29954a7..29afb61f7b 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -69,7 +69,54 @@ def sequence_len_to_n_kmers(sequence_len_bp, ksize): return n_kmers -def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False): +def distance_to_identity(dist,d_low=None,d_high=None): + """ + ANI = 1-distance + """ + if not 0 <= dist <= 1: + raise ValueError(f"Error: distance value {dist} is not between 0 and 1!") + ident = 1-dist + result = ident + id_low,id_high=None,None + if d_low is not None and d_high is not None: + if (0<=d_low<=1) and (0<=d_high<=1): + id_high = 1-d_low + id_low = 1-d_high + result = [ident, id_low, id_high] + return result + + +def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction): + '''helper function + ''' + exp_nmut = exp_n_mutated(n_unique_kmers, ksize, mutation_rate) + try: + return (n_unique_kmers - exp_nmut) * log(1.0 - scaled_fraction) + except: + return float('-inf') + + +def get_exp_probability_nothing_common(mutation_rate, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None): + """ + Given parameters, calculate the expected probability that nothing will be common + between a fracminhash sketch of a original sequence and a fracminhash sketch of a mutated + sequence. If this is above a threshold, we should suspect that the two sketches may have + nothing in common. The threshold needs to be set with proper insights. + + Arguments: n_unique_kmers, ksize, mutation_rate, scaled + Returns: float - expected likelihood that nothing is common between sketches + """ + # NTP note: do we have any checks for ksize >=1 in sourmash_args? The rest should be taken care of. + # assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1 + if sequence_len_bp and not n_unique_kmers: + n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize) + sc = 1.0 / float(scaled) + if mutation_rate == 1.0: + return 1.0 + return exp( get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, sc) ) + + +def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, prob_threshold = 10.0**(-3)): """ Containment --> distance CI (one step) """ @@ -105,8 +152,16 @@ def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, seq notify("WARNING: Cannot estimate ANI. Are your minhashes big enough?") notify(str(exc)) return None, None, None + + # get probability nothing in common + prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers) + if prob_nothing_in_common >= prob_threshold: + # how to store /report this information? Do we want a column for search/prefetch/gather? or throw ValueError? 
+        notify('WARNING: These sketches may have no hashes in common based on chance alone.')
+
     if return_identity:
         point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1)
+
     return point_estimate,sol2,sol1
 
@@ -162,25 +217,9 @@ def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False):
     return point_estimate,sol2,sol1
 
 
-def distance_to_identity(dist,d_low=None,d_high=None):
+def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)):
     """
-    ANI = 1-distance
-    """
-    if not 0 <= dist <= 1:
-        raise ValueError(f"Error: distance value {dist} is not between 0 and 1!")
-    ident = 1-dist
-    result = ident
-    id_low,id_high=None,None
-    if d_low is not None and d_high is not None:
-        if (0<=d_low<=1) and (0<=d_high<=1):
-            id_high = 1-d_low
-            id_low = 1-d_high
-        result = [ident, id_low, id_high]
-    return result
-
-
-def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False):
-    """Given parameters, calculate point estimate for mutation rate from jaccard index.
+    Given parameters, calculate point estimate for mutation rate from jaccard index.
     First checks if parameters are valid (checks are not exhaustive). Then uses formulas
     derived mathematically to compute the point estimate. The formula uses approximations,
     therefore a tiny error is associated with it. A lower bound of that error is also returned.
@@ -191,8 +230,13 @@ def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False):
     something like mut.rate +/- error.
 
     Arguments: jaccard, ksize, scaled, n_unique_kmers
-    Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error)
+    # Returns: tuple (point_estimate_of_mutation_rate, lower_bound_of_error)
+
+    # Returns: point_estimate_of_mutation_rate
     """
+    # NTP question: does this equation consider variance of jaccard due to scaled?
+    #Or was that only used for estimating the CI around the pt estimate?
+    error_lower_bound = None
     if sequence_len_bp and not n_unique_kmers:
         n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
     if jaccard <= 0.0001:
@@ -205,62 +249,21 @@ def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False):
         exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate)
         var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate)
         error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3
+
+    if error_lower_bound is not None and error_lower_bound > 10.0**(-4.0):
+        print(f"err: ({error_lower_bound})")
+        notify(f"WARNING: Error on Jaccard distance point estimate is too high ({error_lower_bound}).")
+        notify(f"Returning 'NA' for this comparison and continuing.")
+        point_estimate = "NA"
+
+    # get probability nothing in common
+    prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers)
+    if prob_nothing_in_common >= prob_threshold:
+        # how to store this information? Do we want a col in search/prefetch/gather?
+        notify('WARNING: These sketches may have no hashes in common based on chance alone.')
 
     if return_identity:
         point_estimate = distance_to_identity(point_estimate)
 
     return point_estimate, error_lower_bound
-
-
-def get_expected_log_probability(L, k, p, s):
-    '''helper function
-    '''
-    exp_nmut = exp_n_mutated(L, k, p)
-    try:
-        return (L - exp_nmut) * log(1.0 - s)
-    except:
-        return float('-inf')
-
-def get_exp_probability_nothing_common(n_unique_kmers, ksize, mutation_rate, scaled):
-    '''Given parameters, calculate the expected probability that nothing will be common
-    between a fracminhash sketch of an original sequence and a fracminhash sketch of a mutated
-    sequence. If this is above a threshold, we should suspect that the two sketches may have
-    nothing in common. The threshold needs to be set with proper insights.
-
-    Arguments: n_unique_kmers, ksize, mutation_rate, scaled
-    Returns: float - expected likelihood that nothing is common between sketches
-    '''
-    assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1
-    L = n_unique_kmers
-    k = ksize
-    p = mutation_rate
-    s = 1.0 / float(scaled)
-    if p == 1.0:
-        return 1.0
-    return exp( get_expected_log_probability(L, k, p, s) )
-
-
-if __name__ == '__main__':
-    jaccard = 0.9
-    ksize = 21
-    scaled = 10
-    n_unique_kmers = 100000
-
-    # jaccard_to_distance_point_estimate usage
-    mut_rate, err = jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers)
-    print('Point estimate is: ' + str(mut_rate))
-    if err > 10.0**(-4.0):
-        print('Cannot trust this point estimate!')
-
-    # get_exp_probability_nothing_common usage
-    ksize = 21
-    scaled = 1000
-    n_unique_kmers = 10000000
-    mutation_rate = 0.3
-    threshold = 10.0**(-3)
-
-    exp_probability_no_common = get_exp_probability_nothing_common(n_unique_kmers,
-                                ksize, mutation_rate, scaled)
-    print(exp_probability_no_common)
-    if (exp_probability_no_common >= threshold):
-        print('There could be cases where nothing common between sketches may happen!')
diff --git a/tests/test_distance_utils.py b/tests/test_distance_utils.py
index adf1093571..c2ed4613c1 100644
--- a/tests/test_distance_utils.py
+++ b/tests/test_distance_utils.py
@@ -3,26 +3,34 @@
 """
 import pytest
 
-from sourmash.distance_utils import containment_to_distance, jaccard_to_distance, distance_to_identity, sequence_len_to_n_kmers
+from sourmash.distance_utils import containment_to_distance, jaccard_to_distance, distance_to_identity, sequence_len_to_n_kmers, jaccard_to_distance_point_estimate
 
 def test_distance_to_identity():
-    id,id_low,id_high = distance_to_identity(0.5,0.4,0.6)
-    assert id == 0.5
+    ident,id_low,id_high = distance_to_identity(0.5,0.4,0.6)
+    assert ident == 0.5
     assert id_low == 0.4
     assert id_high ==0.6
 
 
 def test_distance_to_identity_just_point_estimate():
-    id = distance_to_identity(0.4)
-    assert id == 0.6
+    ident = distance_to_identity(0.4)
+    assert ident == 0.6
+
+
+def test_distance_to_identity_just_point_estimate_extra_input():
+    """
+    Ignore the 2nd input unless both dist_low and dist_high are provided.
+    """
+    ident = distance_to_identity(0.4,0.5)
+    assert ident == 0.6
 
 
 def test_distance_to_identity_fail():
     with pytest.raises(Exception) as exc:
-        id,id_low,id_high = distance_to_identity(1.1,0.4,0.6)
+        ident,id_low,id_high = distance_to_identity(1.1,0.4,0.6)
     assert "distance value 1.1 is not between 0 and 1!" in str(exc.value)
     with pytest.raises(Exception) as exc:
-        id,id_low,id_high = distance_to_identity(-0.1,0.4,0.6)
+        ident,id_low,id_high = distance_to_identity(-0.1,0.4,0.6)
     assert "distance value -0.1 is not between 0 and 1!" in str(exc.value)
 
@@ -333,3 +341,30 @@ def test_nkmers_to_bp():
     assert kmer_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
     assert bp_cdist == (0.07158545548052564, 0.04802880300938562, 0.09619930040790341)
     assert kmer_cdist==bp_cdist
+
+
+def test_jaccard_to_distance_point_estimate():
+    jaccard = 0.9
+    ksize = 21
+    scaled = 10
+    n_unique_kmers = 100000
+    # jaccard_to_distance_point_estimate usage
+    print(n_unique_kmers)
+    mut_rate, err = jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers)
+    print('Point estimate is: ' + str(mut_rate))
+    print('Error is: ' + str(err))
+#    if err > 10.0**(-4.0):
+#        print('Cannot trust this point estimate!')
+
+#    # get_exp_probability_nothing_common usage
+#    ksize = 21
+#    scaled = 1000
+#    n_unique_kmers = 10000000
+#    mutation_rate = 0.3
+#    threshold = 10.0**(-3)
+
+#    exp_probability_no_common = get_exp_probability_nothing_common(n_unique_kmers,
+#                                ksize, mutation_rate, scaled)
+#    print(exp_probability_no_common)
+#    if (exp_probability_no_common >= threshold):
+#        print('There could be cases where nothing common between sketches may happen!')
\ No newline at end of file

From 2f4dc3704b3e02b19d961649812adf48008ecb4f Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce"
Date: Fri, 1 Apr 2022 11:38:11 -0700
Subject: [PATCH 38/38] upd

---
 src/sourmash/distance_utils.py | 111 +++++++++++++++++++--------------
 src/sourmash/signature.py      |  13 ++--
 2 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
index 29afb61f7b..a80952d343 100644
--- a/src/sourmash/distance_utils.py
+++ b/src/sourmash/distance_utils.py
@@ -88,6 +88,8 @@ def distance_to_identity(dist,d_low=None,d_high=None):
 
 def get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, scaled_fraction):
     '''helper function
+    Note that scaled here needs to be between 0 and 1
+    (e.g. scaled 1000 --> scaled_fraction 0.001)
     '''
     exp_nmut = exp_n_mutated(n_unique_kmers, ksize, mutation_rate)
     try:
@@ -110,16 +112,17 @@ def get_exp_probability_nothing_common(mutation_rate, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None):
     # assert 0.0 <= mutation_rate <= 1.0 and ksize >= 1 and scaled >= 1
     if sequence_len_bp and not n_unique_kmers:
         n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
-    sc = 1.0 / float(scaled)
+    f_scaled = 1.0 / float(scaled)
     if mutation_rate == 1.0:
         return 1.0
-    return exp( get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, sc) )
+    return exp( get_expected_log_probability(n_unique_kmers, ksize, mutation_rate, f_scaled) )
 
 
-def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, prob_threshold = 10.0**(-3)):
+def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, return_ci = False, prob_threshold = 10.0**(-3)):
     """
     Containment --> distance CI (one step)
     """
+    sol1,sol2,point_estimate=None,None,None
     if sequence_len_bp and not n_unique_kmers:
         n_unique_kmers = sequence_len_to_n_kmers(sequence_len_bp, ksize)
     if containment <= 0.0001: # changed from 0.0, to mirror jaccard_to_distance_CI_one_step
@@ -127,42 +130,46 @@ def containment_to_distance(containment, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, prob_threshold = 10.0**(-3)):
     elif containment >= 0.9999: # changed from 1.0, to mirror jaccard_to_distance_CI_one_step
         sol1 = sol2 = point_estimate = 0.0
     else:
-        try:
-            alpha = 1 - confidence
-            z_alpha = probit(1-alpha/2)
-            f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1
+        point_estimate = 1.0-containment**(1.0/ksize)
+        if return_ci:
+            try:
+                alpha = 1 - confidence
+                z_alpha = probit(1-alpha/2)
+                f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1
 
-            bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers
+                bias_factor = 1 - (1 - f_scaled) ** n_unique_kmers
 
-            term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2)
-            term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest)
-            term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2)
+                term_1 = (1.0-f_scaled) / (f_scaled * n_unique_kmers**3 * bias_factor**2)
+                term_2 = lambda pest: n_unique_kmers * exp_n_mutated(n_unique_kmers, ksize, pest) - exp_n_mutated_squared(n_unique_kmers, ksize, pest)
+                term_3 = lambda pest: var_n_mutated(n_unique_kmers, ksize, pest) / (n_unique_kmers**2)
 
-            var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest)
+                var_direct = lambda pest: term_1 * term_2(pest) + term_3(pest)
 
-            f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment
-            f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment
+                f1 = lambda pest: (1-pest)**ksize + z_alpha * sqrt(var_direct(pest)) - containment
+                f2 = lambda pest: (1-pest)**ksize - z_alpha * sqrt(var_direct(pest)) - containment
 
-            sol1 = brentq(f1, 0.0000001, 0.9999999)
-            sol2 = brentq(f2, 0.0000001, 0.9999999)
+                sol1 = brentq(f1, 0.0000001, 0.9999999)
+                sol2 = brentq(f2, 0.0000001, 0.9999999)
 
-            point_estimate = 1.0-containment**(1.0/ksize)
+            except ValueError as exc:
+                # afaict, this only happens with extremely small test data
+                notify("WARNING: Cannot estimate ANI from containment. Do your sketches contain enough hashes?")
+                notify(str(exc))
+                return None, None, None
 
-        except ValueError as exc:
-            notify("WARNING: Cannot estimate ANI. Are your minhashes big enough?")
-            notify(str(exc))
-            return None, None, None
-
-    # get probability nothing in common
+    # Do this here, so that we don't need to reconvert distance <--> identity later.
     prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers)
     if prob_nothing_in_common >= prob_threshold:
-        # how to store/report this information? Do we want a column for search/prefetch/gather? or throw ValueError?
+        # keep count, suggest mod decrease scaled, maybe decrease ksize
        notify('WARNING: These sketches may have no hashes in common based on chance alone.')
 
     if return_identity:
         point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1)
 
-    return point_estimate,sol2,sol1
+    if return_ci:
+        return point_estimate, sol2, sol1, prob_nothing_in_common
+
+    return point_estimate, prob_nothing_in_common
 
 
 #from from mrcc.p_from_scaled_jaccard
@@ -181,7 +188,7 @@ def variance_scaled_jaccard(L, p, k, s):
     return term1 + term2 - term3
 
 
-def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False):
+def jaccard_to_distance_orig(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False, return_ci=False, prob_threshold = 10.0**(-3)):
     """
     Scaled Jaccard to distance estimate and CI (one step)
     """
@@ -192,32 +199,42 @@ def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, confidence=0.95, return_identity=False):
     elif jaccard >= 0.9999:
         sol1 = sol2 = point_estimate = 0.0
     else:
-        try:
-            alpha = 1 - confidence
-            z_alpha = probit(1-alpha/2)
-            f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1
+        point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize)
+        if return_ci:
+            try:
+                alpha = 1 - confidence
+                z_alpha = probit(1-alpha/2)
+                f_scaled = 1.0/scaled # these use scaled as a fraction between 0 and 1
 
-            var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled)
+                var_direct = lambda pest: variance_scaled_jaccard(n_unique_kmers, pest, ksize, f_scaled)
 
-            f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard
-            f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard
+                f1 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 + z_alpha * sqrt(var_direct(pest)) - jaccard
+                f2 = lambda pest: 2.0/(2- (1-pest)**ksize ) - 1 - z_alpha * sqrt(var_direct(pest)) - jaccard
 
-            sol1 = brentq(f1, 0.0000001, 0.9999999)
-            sol2 = brentq(f2, 0.0000001, 0.9999999)
-            point_estimate = 1.0 - (2.0 - 2.0/(jaccard + 1))**(1.0/ksize)
+                sol1 = brentq(f1, 0.0000001, 0.9999999)
+                sol2 = brentq(f2, 0.0000001, 0.9999999)
 
-        except ValueError as exc:
-            notify("WARNING: Cannot estimate ANI. Are your minhashes big enough?")
-            notify(str(exc))
-            return None, None, None
+            except ValueError as exc:
+                # afaict, this only happens with extremely small test data
+                notify("WARNING: Cannot estimate ANI from jaccard. Do your sketches contain enough hashes?")
+                notify(str(exc))
+                return None, None, None
+
+    # Do this here, so that we don't need to reconvert distance <--> identity later.
+    prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers)
+    if prob_nothing_in_common >= prob_threshold:
+        notify('WARNING: These sketches may have no hashes in common based on chance alone.')
 
     if return_identity:
         point_estimate,sol2,sol1 = distance_to_identity(point_estimate,sol2,sol1)
 
-    return point_estimate,sol2,sol1
+    if return_ci:
+        return point_estimate, sol2, sol1, prob_nothing_in_common
+
+    return point_estimate, prob_nothing_in_common
 
 
-def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)):
+def jaccard_to_distance(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)):
     """
     Given parameters, calculate point estimate for mutation rate from jaccard index.
     First checks if parameters are valid (checks are not exhaustive). Then uses formulas
@@ -235,7 +252,7 @@ def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)):
     # NTP question: does this equation consider variance of jaccard due to scaled?
-    #Or was that only used for estimating the CI around the pt estimate?
+    # Or was that only used for estimating the CI around the pt estimate?
     error_lower_bound = None
     if sequence_len_bp and not n_unique_kmers:
@@ -250,20 +267,20 @@ def jaccard_to_distance_point_estimate(jaccard, ksize, scaled, n_unique_kmers=None, sequence_len_bp=None, return_identity=False, prob_threshold = 10.0**(-3)):
         exp_n_mut = exp_n_mutated(n_unique_kmers, ksize, point_estimate)
         var_n_mut = var_n_mutated(n_unique_kmers, ksize, point_estimate)
         error_lower_bound = 1.0 * n_unique_kmers * var_n_mut / (n_unique_kmers + exp_n_mut)**3
 
-    if error_lower_bound is not None and error_lower_bound > 10.0**(-4.0):
-        print(f"err: ({error_lower_bound})")
+    if error_lower_bound is not None and error_lower_bound > 10.0**(-4.0): # does this need to be modifiable?
+        #print(f"err: ({error_lower_bound})")
         notify(f"WARNING: Error on Jaccard distance point estimate is too high ({error_lower_bound}).")
+        # @NTP: return ANI estimate anyway, NA later instead? Probably want this value for `sig overlap`, at least?
         notify(f"Returning 'NA' for this comparison and continuing.")
         point_estimate = "NA"
 
-    # get probability nothing in common
     prob_nothing_in_common = get_exp_probability_nothing_common(point_estimate, ksize, scaled, n_unique_kmers=n_unique_kmers)
     if prob_nothing_in_common >= prob_threshold:
-        # how to store this information? Do we want a col in search/prefetch/gather?
         notify('WARNING: These sketches may have no hashes in common based on chance alone.')
 
     if return_identity:
         point_estimate = distance_to_identity(point_estimate)
 
-    return point_estimate, error_lower_bound
+    return point_estimate, prob_nothing_in_common, error_lower_bound
 
diff --git a/src/sourmash/signature.py b/src/sourmash/signature.py
index 7d308803de..c8adffb145 100644
--- a/src/sourmash/signature.py
+++ b/src/sourmash/signature.py
@@ -143,7 +143,7 @@ def jaccard(self, other):
         return self.minhash.similarity(other.minhash, ignore_abundance=True,
                                        downsample=False)
 
-    def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95):
+    def jaccard_ani(self, other, downsample=False, jaccard=None, return_err_and_p_nothing_in_common=False):  #, return_ci=False, confidence=0.95
         "Compute Jaccard similarity with the other MinHash signature."
         self_mh = self.minhash
         other_mh = other.minhash
@@ -156,10 +156,15 @@ def jaccard_ani(self, other, downsample=False, jaccard=None, confidence=0.95):
             jaccard = self_mh.similarity(other_mh, ignore_abundance=True)
         avg_scaled_kmers = round((len(self_mh) + len(other_mh))/2)
         avg_n_kmers = avg_scaled_kmers * scaled # would be better if hll estimate
-        j_ani,ani_low,ani_high = jaccard_to_distance(jaccard, self_mh.ksize,
+        #j_ani,ani_low,ani_high = jaccard_to_distance_orig(jaccard, self_mh.ksize,
+        #                                scaled, n_unique_kmers=avg_n_kmers,
+        #                                confidence=confidence, return_identity=True)
+        j_ani,prob_nothing_in_common,err = jaccard_to_distance(jaccard, self_mh.ksize,
-                                        scaled, n_unique_kmers=avg_n_kmers,
-                                        confidence=confidence, return_identity=True)
-        return j_ani, ani_low, ani_high
+                                        scaled, n_unique_kmers=avg_n_kmers,
+                                        return_identity=True) #confidence=confidence,
+        if return_err_and_p_nothing_in_common:
+            return j_ani,err,prob_nothing_in_common
+        return j_ani
 
     def contained_by(self, other, downsample=False):
         "Compute containment by the other signature. Note: ignores abundance."
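
For readers following the API churn across patches 35-38, here is a short usage sketch of the interface as it stands after PATCH 38. This is an illustrative example, not part of the patch series: the function names, keyword arguments, and return shapes are taken from the final version of src/sourmash/distance_utils.py above, and the parameter values are arbitrary.

# Illustrative usage of the post-PATCH-38 distance_utils API (values are arbitrary).
from sourmash.distance_utils import (containment_to_distance,
                                     get_exp_probability_nothing_common,
                                     jaccard_to_distance)

ksize = 21
scaled = 1000
n_unique_kmers = 10000000

# jaccard_to_distance now returns (point_estimate, prob_nothing_in_common, error_lower_bound).
dist, p_nothing, err = jaccard_to_distance(0.9, ksize, scaled, n_unique_kmers=n_unique_kmers)
if err is not None and err > 10.0**(-4.0):
    print('point estimate may be untrustworthy; error lower bound:', err)

# containment_to_distance returns (point_estimate, prob_nothing_in_common) by default,
# or (point_estimate, dist_low, dist_high, prob_nothing_in_common) with return_ci=True.
dist, dist_low, dist_high, p_nothing = containment_to_distance(
    0.1, ksize, scaled, n_unique_kmers=n_unique_kmers, return_ci=True)

# Expected probability that two sketches share no hashes by chance alone.
p_no_overlap = get_exp_probability_nothing_common(0.3, ksize, scaled,
                                                  n_unique_kmers=n_unique_kmers)
print(p_no_overlap)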