From 73d752080c345392768a4eb54a41dc3d72de4f2f Mon Sep 17 00:00:00 2001 From: Brian J Arnold Date: Tue, 24 May 2022 15:10:06 -0400 Subject: [PATCH] fixed error in CI calculation, bumped version --- README.md | 2 +- setup.py | 2 +- src/decifer/__init__.py | 2 +- src/decifer/__main__.py | 35 +++++++++++++++++++---------------- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index ccd0bc4..909e81b 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,7 @@ DeCiFer's main output file (ending with `_output.tsv`) corresponds to a single T For the column containing the `true_cluster_DCF`, the CIs correspond to the 95% credible interval of the posterior distribution of the DCF cluster center (Eqn 8 in manuscript and S23 in supplement) . These CIs have been corrected for multiple tests. Specifically, for each cluster, we find the lower CI by finding the X=[0.025/(number of hypothesis tests)] quantile, where the number of tests corresponds to (number of clusters)\*(number of samples for patient). The same procedure is used for the upper CI, by finding the quantile that corresponds to 1-X. -These cluster CIs may also be found in the output file ending in `_cluster.CIs.tsv`. This file contains this information in a more condensed format, reporting only the upper and lower CIs for each cluster for each sample (in column `f_lb` and `f_ub` respectively). +These cluster CIs may also be found in the output file ending in `_cluster.CIs.tsv`. This file contains this information in a more condensed format, reporting only the upper and lower CIs for each cluster for each sample (in column `f_lb` and `f_ub` respectively). These numbers may contain "NaN" if no mutations were assigned to that particular cluster. The file ending in `_model_selection.tsv` shows how decifer selected the best value of K clusters. diff --git a/setup.py b/setup.py index e30491d..d673759 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ setuptools.setup( name='decifer', - version='v2.1.1', + version='v2.1.2', python_requires='>=3.7.*', packages=['decifer'], package_dir={'': 'src'}, diff --git a/src/decifer/__init__.py b/src/decifer/__init__.py index 73f37a5..472bc37 100644 --- a/src/decifer/__init__.py +++ b/src/decifer/__init__.py @@ -1 +1 @@ -__version__ = 'v2.1.1' +__version__ = 'v2.1.2' diff --git a/src/decifer/__main__.py b/src/decifer/__main__.py index 31c1be6..09dde5c 100644 --- a/src/decifer/__main__.py +++ b/src/decifer/__main__.py @@ -280,22 +280,25 @@ def CI(job): mut = list(filter(lambda m : m.assigned_cluster == c, muts)) num_pts = 10000 - grid = [objective(j, mut, s, bb) for j in np.linspace(0, PURITY[s], num_pts)] - min_log = min(grid) - delta = (-1*min_log)-2 # constant to make -log(pdf) values less negative - prob = (lambda x: math.exp(-1*(x+delta))) # convert -log(pdf) to unnormalized probability - total = sum([prob(x) for x in grid]) # unnormalized probabilities across support - pdf = [prob(x)/total for x in grid] # normalized probabilities across support - cdf = np.cumsum(pdf) - - low_ci = 0.025/num_tests # divide the desired CI quantile by the number of tests, bonferonni correction - high_ci = 1 - low_ci - - low_index = take_closest(cdf, low_ci) - high_index = take_closest(cdf, high_ci) - - l = float(low_index)/num_pts - u = float(high_index)/num_pts + if len(mut) == 0: + l, u, pdf = "NaN", "Nan", ["Nan"]*num_pts + else: + grid = [objective(j, mut, s, bb) for j in np.linspace(0, PURITY[s], num_pts)] + min_log = min(grid) + delta = (-1*min_log)-2 # constant to make -log(pdf) values less negative + prob = (lambda x: math.exp(-1*(x+delta))) # convert -log(pdf) to unnormalized probability + total = sum([prob(x) for x in grid]) # unnormalized probabilities across support + pdf = [prob(x)/total for x in grid] # normalized probabilities across support + cdf = np.cumsum(pdf) + + low_ci = 0.025/num_tests # divide the desired CI quantile by the number of tests, bonferonni correction + high_ci = 1 - low_ci + + low_index = take_closest(cdf, low_ci) + high_index = take_closest(cdf, high_ci) + + l = float(low_index)/num_pts + u = float(high_index)/num_pts return (c, s, l, u, pdf)