rename cli flag to match terminology in the paper: confidence_level -> tolerance
tschuelia · Apr 19, 2024 · c0f5d5e · c0f5d5e
1 parent 6224a66
commit c0f5d5e
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 37 deletions.
diff --git a/docs/cli_config.rst b/docs/cli_config.rst
@@ -12,7 +12,7 @@ Configuration options:
 - ``file_format``, default = ``EIGENSTRAT``, Name of the file format your dataset is in. Supported formats are ``ANCESTRYMAP``, ``EIGENSTRAT``, ``PED``, ``PACKEDPED``, ``PACKEDANCESTRYMAP``. For more information see Section `Input data`_ below.
 - ``convertf``, default = ``convertf``, File path pointing to an executable of Eigensoft's ``convertf`` tool. ``convertf`` is used if the provided dataset is not in ``EIGENSTRAT`` format. Default is ``convertf``. This will only work if ``convertf`` is installed systemwide.
 - ``bootstrap_convergence_check``, default = ``True``, If true, instead of computing ``n_replicates`` bootstraps and embeddings, Pandora will check for convergence once every ``max(10, threads)`` bootstrap embeddings are computed. If according to our heuristic (see TODO for more details) the bootstrap procedure converged, all remaining tasks are cancelled and the stability is determined using only the number of replicates computed when convergence is determined. Due to the runtime overhead of the convergence check compared to the runtime of MDS computations, we only advise using this convergence check for PCA analyses. Note that this parameter is only relevant if ``analysis_mode`` is ``AnalysisMode.BOOTSTRAP``.
-- ``bootstrap_convergence_confidence_level``, default=0.05, Determines the level of confidence when checking for bootstrap convergence. A value of :math:`X` means that we allow deviations of up to :math:`X * 100\%` between pairwise bootstrap comparisons and still assume convergence.
+- ``bootstrap_convergence_tolerance``, default=0.05, Determines the level of deviation tolerance when checking for bootstrap convergence. A value of :math:`X` means that we allow deviations of up to :math:`X * 100\%` between pairwise bootstrap comparisons and still assume convergence.
 - ``n_replicates``, default = 100, Number of bootstrap replicates or sliding windows to compute
 - ``keep_replicates``, default = ``false``, Whether to store all intermediate datasets files (``.geno``, ``.snp``, ``.ind``). Note that this will result in a substantial storage consumption. Note that in case of bootstrapping, the bootstrapped indices are stored as checkpoints for full reproducibility in any case.
 - ``n_components``, default = 10, Number of components to compute and compare for PCA or MDS analyses. We recommend 10 for PCA analyses and 2 for MDS analyses. The default is 10 since the default for ``embedding_algorithm`` is ``PCA``.

diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -46,12 +46,12 @@ have to add the path to ``smartpca`` to the ``config-example.yaml``.
 
 You should then see an output similar to this::
 
-    Pandora version 1.0.3 released by The Exelixis Lab
+    Pandora version 2.0.0 released by The Exelixis Lab
     Developed by: Julia Haag
     Latest version: https://github.com/tschuelia/Pandora
     Questions/problems/suggestions? Please open an issue on GitHub.
 
-    Pandora was called at 06-Nov-2023 16:26:50 as follows:
+    Pandora was called at 18-Apr-2024 16:26:50 as follows:
 
     /Users/julia/micromamba/envs/pandora/bin/pandora -c config_example.yaml
 
@@ -63,7 +63,7 @@ You should then see an output similar to this:::
     n_replicates: 10
     keep_replicates: False
     bootstrap_convergence_check: True
-    bootstrap_convergence_confidence_level: 0.05
+    bootstrap_convergence_tolerance: 0.05
     n_components: 10
     embedding_algorithm: PCA
     smartpca: smartpca
@@ -85,7 +85,7 @@ You should then see an output similar to this:::
     [00:00:02] Running SmartPCA on the input dataset.
     [00:00:02] Plotting embedding results for the input dataset.
     [00:00:18] Drawing 10 bootstrapped datasets and running PCA.
-    [00:00:18] NOTE: Bootstrap convergence check is enabled. Will terminate bootstrap computation once convergence is determined. Convergence confidence level: 0.05
+    [00:00:18] NOTE: Bootstrap convergence check is enabled. Will terminate bootstrap computation once convergence is determined. Convergence tolerance: 0.05
     [00:00:27] Bootstrapping done. Number of replicates computed: 10
     [00:00:27] Comparing bootstrapping embedding results.
     [00:00:34] Plotting bootstrapping embedding results.

diff --git a/pandora/bootstrap.py b/pandora/bootstrap.py
@@ -29,7 +29,7 @@
 def _bootstrap_convergence_check(
     bootstraps: List[Union[NumpyDataset, EigenDataset]],
     embedding: EmbeddingAlgorithm,
-    bootstrap_convergence_confidence_level: float,
+    bootstrap_convergence_tolerance: float,
     threads: int,
     logger: Optional[loguru.Logger] = None,
 ):
@@ -44,14 +44,12 @@ def _bootstrap_convergence_check(
             f"Unrecognized embedding option {embedding}. Supported are 'pca' and 'mds'."
         )
 
-    return _bootstrap_converged(
-        embeddings, bootstrap_convergence_confidence_level, threads
-    )
+    return _bootstrap_converged(embeddings, bootstrap_convergence_tolerance, threads)
 
 
 def _bootstrap_converged(
     bootstraps: List[Embedding],
-    bootstrap_convergence_confidence_level: float,
+    bootstrap_convergence_tolerance: float,
     threads: int,
 ):
     """Checks for convergence by comparing the Pandora Stabilities for 10 subsets of the given list of bootstraps."""
@@ -76,7 +74,7 @@ def _bootstrap_converged(
             stabilities[j] = stability_s2
 
         relative_difference = abs(stability_s2 - stability_s1) / (stability_s1 + 1e-6)
-        if round(relative_difference, 3) > bootstrap_convergence_confidence_level:
+        if round(relative_difference, 3) > bootstrap_convergence_tolerance:
             return False
     return True
 
@@ -252,7 +250,7 @@ def run(
         self,
         threads: int,
         bootstrap_convergence_check: bool,
-        bootstrap_convergence_confidence_level: float,
+        bootstrap_convergence_tolerance: float,
         embedding: EmbeddingAlgorithm,
         logger: Optional[loguru.Logger] = None,
     ):
@@ -296,7 +294,7 @@ def run(
                         converged = _bootstrap_convergence_check(
                             bootstraps,
                             embedding,
-                            bootstrap_convergence_confidence_level,
+                            bootstrap_convergence_tolerance,
                             threads,
                             logger,
                         )
@@ -390,7 +388,7 @@ def bootstrap_and_embed_multiple(
     redo: bool = False,
     keep_bootstraps: bool = False,
     bootstrap_convergence_check: bool = True,
-    bootstrap_convergence_confidence_level: float = 0.05,
+    bootstrap_convergence_tolerance: float = 0.05,
     smartpca_optional_settings: Optional[Dict] = None,
     logger: Optional[loguru.Logger] = None,
 ) -> List[EigenDataset]:
@@ -434,8 +432,8 @@ def bootstrap_and_embed_multiple(
     bootstrap_convergence_check : bool, default=True
         Whether to automatically determine bootstrap convergence. If ``True``, will only compute as many replicates as
         required for convergence according to our heuristic (see Notes below).
-    bootstrap_convergence_confidence_level : float, default=0.05
-        Determines the level of confidence when checking for bootstrap convergence. A value of X means that we allow
+    bootstrap_convergence_tolerance : float, default=0.05
+        Determines the deviation tolerance when checking for bootstrap convergence. A value of X means that we allow
         deviations of up to :math:`X * 100\\%` between pairwise bootstrap comparisons and still assume convergence.
     smartpca_optional_settings : Dict, default=None
         Additional smartpca settings.
@@ -462,7 +460,7 @@ def bootstrap_and_embed_multiple(
     We first create 10 random subsets of size :math:`int(N/2)` by sampling from all :math:`N` replicates.
     We then compute the Pandora Stability (PS) for each of the 10 subsets and compute the relative difference of PS
     values between all possible pairs of subsets :math:`(PS_1, PS_2)` by computing :math:`\\frac{\\left|PS_1 - PS_2\\right|}{PS_2}`.
-    We assume convergence if all pairwise relative differences are below X * 100% were X is the set confidence level.
+    We assume convergence if all pairwise relative differences are below X * 100% where X is the set tolerance.
     If we determine that the bootstrap has converged, all remaining bootstrap computations are cancelled.
 
     (*) The reasoning for checking every ``max(10, threads)`` is the following: if Pandora runs on a machine with e.g. 48
@@ -517,7 +515,7 @@ def bootstrap_and_embed_multiple(
     bootstraps, finished_indices = parallel_bootstrap_process_manager.run(
         threads=threads,
         bootstrap_convergence_check=bootstrap_convergence_check,
-        bootstrap_convergence_confidence_level=bootstrap_convergence_confidence_level,
+        bootstrap_convergence_tolerance=bootstrap_convergence_tolerance,
         embedding=embedding,
         logger=logger,
     )
@@ -568,7 +566,7 @@ def bootstrap_and_embed_multiple_numpy(
     ] = euclidean_sample_distance,
     imputation: Optional[str] = "mean",
     bootstrap_convergence_check: bool = True,
-    bootstrap_convergence_confidence_level: float = 0.05,
+    bootstrap_convergence_tolerance: float = 0.05,
 ) -> List[NumpyDataset]:
     """Draws ``n_replicates`` bootstrap datasets of the provided NumpyDataset and performs PCA/MDS analysis (as
     specified by ``embedding``) for each bootstrap.
@@ -611,8 +609,8 @@ def bootstrap_and_embed_multiple_numpy(
     bootstrap_convergence_check : bool, default=True
         Whether to automatically determine bootstrap convergence. If ``True``, will only compute as many replicates as
         required for convergence according to our heuristic (see Notes below).
-    bootstrap_convergence_confidence_level : float, default=0.05
-        Determines the level of confidence when checking for bootstrap convergence. A value of X means that we allow
+    bootstrap_convergence_tolerance : float, default=0.05
+        Determines the level of deviation tolerance when checking for bootstrap convergence. A value of X means that we allow
         deviations of up to :math:`X * 100\\%` between pairwise bootstrap comparisons and still assume convergence.
 
     Returns
@@ -633,7 +631,7 @@ def bootstrap_and_embed_multiple_numpy(
     We first create 10 random subsets of size :math:`int(N/2)` by sampling from all :math:`N` replicates.
     We then compute the Pandora Stability (PS) for each of the 10 subsets and compute the relative difference of PS
     values between all possible pairs of subsets :math:`(PS_1, PS_2)` by computing :math:`\\frac{\\left|PS_1 - PS_2\\right|}{PS_2}`.
-    We assume convergence if all pairwise relative differences are below X * 100% were X is the set confidence level.
+    We assume convergence if all pairwise relative differences are below X * 100% where X is the set tolerance.
     If we determine that the bootstrap has converged, all remaining bootstrap computations are cancelled.
 
     (*) The reasoning for checking every ``max(10, threads)`` is the following: if Pandora runs on a machine with e.g. 48
@@ -666,7 +664,7 @@ def bootstrap_and_embed_multiple_numpy(
     bootstraps, _ = parallel_bootstrap_process_manager.run(
         threads=threads,
         bootstrap_convergence_check=bootstrap_convergence_check,
-        bootstrap_convergence_confidence_level=bootstrap_convergence_confidence_level,
+        bootstrap_convergence_tolerance=bootstrap_convergence_tolerance,
         embedding=embedding,
     )
 

diff --git a/pandora/pandora.py b/pandora/pandora.py
@@ -74,8 +74,8 @@ class PandoraConfig(BaseModel):
         the bootstrap procedure converged, all remaining tasks are cancelled and the stability is determined using
         only the number of replicates computed when convergence is determined.
         Note that this parameter is only relevant if ``analysis_mode`` is ``AnalysisMode.BOOTSTRAP``.
-    bootstrap_convergence_confidence_level : NonNegativeFloat, default=0.05
-        Determines the level of confidence when checking for bootstrap convergence. A value of :math:`X` means that we
+    bootstrap_convergence_tolerance : NonNegativeFloat, default=0.05
+        Determines the level of deviation tolerance when checking for bootstrap convergence. A value of :math:`X` means that we
         allow deviations of up to :math:`X * 100\\%` between pairwise bootstrap comparisons and still assume convergence.
     n_components : PositiveInt, default=10
         Number of dimensions to output and compare for PCA and MDS analyses.
@@ -145,7 +145,7 @@ class PandoraConfig(BaseModel):
 
     # Bootstrap specific setting (convergence check)
     bootstrap_convergence_check: bool = True
-    bootstrap_convergence_confidence_level: NonNegativeFloat = 0.05
+    bootstrap_convergence_tolerance: NonNegativeFloat = 0.05
 
     # Embedding related
     n_components: NonNegativeInt = 10
@@ -561,7 +561,7 @@ def bootstrap_embeddings(self) -> None:
                 fmt_message(
                     "NOTE: Bootstrap convergence check is enabled. "
                     "Will terminate bootstrap computation once convergence is determined. "
-                    f"Convergence confidence level: {self.pandora_config.bootstrap_convergence_confidence_level}"
+                    f"Convergence tolerance: {self.pandora_config.bootstrap_convergence_tolerance}"
                 )
             )
         try:
@@ -577,7 +577,7 @@ def bootstrap_embeddings(self) -> None:
                 self.pandora_config.redo,
                 self.pandora_config.keep_replicates,
                 self.pandora_config.bootstrap_convergence_check,
-                self.pandora_config.bootstrap_convergence_confidence_level,
+                self.pandora_config.bootstrap_convergence_tolerance,
                 self.pandora_config.smartpca_optional_settings,
                 logger,
             )

diff --git a/tests/test_bootstrap.py b/tests/test_bootstrap.py
@@ -223,7 +223,7 @@ def test_bootstrap_converged_for_identical_embeddings(
         assert _bootstrap_convergence_check(
             [test_numpy_dataset] * n_replicates,
             embedding_algorithm,
-            bootstrap_convergence_confidence_level=0.05,
+            bootstrap_convergence_tolerance=0.05,
             threads=2,
         )
 
@@ -244,7 +244,7 @@ def _random_embedding(embedding):
     random_embeddings = [_random_embedding(test_numpy_dataset.pca) for _ in range(5)]
     bootstraps = random_embeddings + [test_numpy_dataset.pca]
     assert not _bootstrap_converged(
-        bootstraps, bootstrap_convergence_confidence_level=0.01, threads=2
+        bootstraps, bootstrap_convergence_tolerance=0.01, threads=2
     )
 
 
@@ -324,7 +324,7 @@ def test_bootstrap_and_embed_multiple_with_convergence_check_pca(
 ):
     # example_dataset actually does not require all 100 bootstraps to converge
     # so if we run bootstrap_and_embed_multiple with the convergence check enabled
-    # and the convergence confidence level to 0.05
+    # and the convergence tolerance to 0.05
     # we should get less than 100 bootstraps as result
 
     n_bootstraps = 100
@@ -345,12 +345,12 @@ def test_bootstrap_and_embed_multiple_with_convergence_check_pca(
         assert len(bootstraps) < n_bootstraps
 
 
-def test_bootstrap_and_embed_multiple_with_convergence_check_pca_no_convergence_with_high_confidence_level(
+def test_bootstrap_and_embed_multiple_with_convergence_check_pca_no_convergence_with_high_tolerance(
     example_dataset, smartpca
 ):
-    # when setting the confidence level to 0.01, we should see no convergence
+    # when setting the convergence tolerance to 0.01, we should see no convergence
     # so if we run bootstrap_and_embed_multiple with the convergence check enabled
-    # and the convergence confidence limit to 0.01
+    # and the convergence tolerance to 0.01
     # we should get exactly 100 bootstraps as result
 
     n_bootstraps = 100
@@ -366,7 +366,7 @@ def test_bootstrap_and_embed_multiple_with_convergence_check_pca_no_convergence_
             seed=0,
             keep_bootstraps=False,
             bootstrap_convergence_check=True,
-            bootstrap_convergence_confidence_level=0.01,
+            bootstrap_convergence_tolerance=0.01,
         )
 
         assert len(bootstraps) == n_bootstraps
@@ -405,7 +405,7 @@ def test_bootstrap_and_embed_multiple_numpy_with_convergence_check_pca(
 ):
     # test_numpy_dataset actually does not require all 100 bootstraps to converge (neither for PCA nor for MDS)
     # so if we run bootstrap_and_embed_multiple_numpy with the convergence check enabled
-    # and the confidence level to a very liberal setting (0.8)
+    # and the convergence tolerance to a very liberal setting (0.8)
     # we should get less than 100 bootstraps as result
 
     n_bootstraps = 100
@@ -417,7 +417,7 @@ def test_bootstrap_and_embed_multiple_numpy_with_convergence_check_pca(
         n_components=2,
         seed=0,
         bootstrap_convergence_check=True,
-        bootstrap_convergence_confidence_level=0.8,
+        bootstrap_convergence_tolerance=0.8,
     )
 
     assert len(bootstraps) < n_bootstraps