Optimise PC regression (#408)

* add numpy linreg function and use in pcr
* allow configuring linreg method for pcr methods
* simplify numpy linreg function
* add tests for different linreg implementations
* set PCA recomputation to False by default
* use multithreading for linear regression and add tqdm
* add durations to test command
* add durations flag to pytest command
* add min duration flag
* add comments for ThreadPoolExecutor run
* allow to specify n_threads and linreg algorithm
* add keywords to cell cycle scoring
* add multiple threading in pcr test
* use nanmean
* test cell cycle score with numpy linreg
* use more memory-efficient implementation of ThreadPool
* fix cases where there are no residuals
* allow to configure svd_solver
* use multiple linear regression and remove multithreading approach
* use pcr score from numpy exact OLS implementation
* remove testing code
* include old pcr implementation
* Update scib/metrics/cell_cycle.py
* change default to numpy
theislab · Dec 20, 2024 · e614d35 · e614d35
1 parent d357273
commit e614d35
Show file tree

Hide file tree

Showing 8 changed files with 247 additions and 60 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -60,7 +60,7 @@ jobs:
         -   name: Test with pytest
             if: ${{ matrix.os != 'macos-latest'}}
             run: |
-                pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv
+                pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv --durations 0 --durations-min=1.0
                 mv coverage.xml "$(echo 'coverage_metrics_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
 
         -   name: Upload coverage to GitHub Actions
@@ -98,7 +98,7 @@ jobs:
 
         -   name: Test with pytest
             run: |
-                pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2
+                pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 --durations 0 --durations-min=1.0
                 mv coverage.xml "$(echo 'coverage_rpy2_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
 
         -   name: Upload coverage to GitHub Actions
@@ -129,7 +129,7 @@ jobs:
 
         -   name: Test with pytest
             run: |
-                pytest --cov=scib --cov-report=xml -vv --tb=native -k integration
+                pytest --cov=scib --cov-report=xml -vv --tb=native -k integration --durations 0 --durations-min=1.0
                 mv coverage.xml "$(echo 'coverage_integration_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
 
         -   name: Upload coverage to GitHub Actions

diff --git a/pyproject.toml b/pyproject.toml
@@ -9,3 +9,4 @@ build-backend = "setuptools.build_meta"
 log_cli = 'True'
 log_cli_level = 'INFO'
 addopts = '-p no:warnings'
+durations = 0
diff --git a/scib/metrics/cell_cycle.py b/scib/metrics/cell_cycle.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+from tqdm import tqdm
 
 from ..preprocessing import score_cell_cycle
 from ..utils import check_adata
@@ -11,12 +12,14 @@ def cell_cycle(
     adata_post,
     batch_key,
     embed=None,
-    agg_func=np.mean,
+    agg_func=np.nanmean,
     organism="mouse",
     n_comps=50,
     recompute_cc=True,
     precompute_pcr_key=None,
     verbose=False,
+    linreg_method="numpy",
+    n_threads=1,
 ):
     """Cell cycle conservation score
 
@@ -44,6 +47,7 @@ def cell_cycle(
         precomputed scores if available as 'S_score' and 'G2M_score' in ``adata_post.obs``
     :param precompute_pcr_key: Key in adata_pre for precomputed PCR values for cell
         cycle scores. Ignores cell cycle scores in adata_pre if present.
+    :param n_threads: Number of threads for linear regressions per principal component
 
     :return:
         A score between 1 and 0. The larger the score, the stronger the cell cycle
@@ -70,11 +74,6 @@ def cell_cycle(
     if embed == "X_pca":
         embed = None
 
-    batches = adata_pre.obs[batch_key].unique()
-    scores_final = []
-    scores_before = []
-    scores_after = []
-
     recompute_cc = (
         recompute_cc
         or "S_score" not in adata_pre.obs_keys()
@@ -84,19 +83,26 @@ def cell_cycle(
         precompute_pcr_key is None or precompute_pcr_key not in adata_pre.uns_keys()
     )
 
-    for batch in batches:
+    batches = adata_pre.obs[batch_key].unique()
+    scores_before = []
+    scores_after = []
+    scores_final = []
+
+    for batch in tqdm(batches):
         before, after = get_pcr_before_after(
             adata_pre,
             adata_post,
             batch_key=batch_key,
             batch=batch,
             embed=embed,
             organism=organism,
+            pcr_key=precompute_pcr_key,
             recompute_cc=recompute_cc,
             recompute_pcr=recompute_pcr,
-            pcr_key=precompute_pcr_key,
             n_comps=n_comps,
             verbose=verbose,
+            n_threads=n_threads,
+            linreg_method=linreg_method,
         )
 
         # scale result
@@ -140,11 +146,13 @@ def get_pcr_before_after(
     batch,
     embed,
     organism,
-    recompute_cc,
-    recompute_pcr,
     pcr_key,
-    n_comps,
-    verbose,
+    recompute_cc=False,
+    recompute_pcr=False,
+    n_comps=50,
+    verbose=True,
+    n_threads=1,
+    linreg_method="numpy",
 ):
     """
     Principle component regression value on cell cycle scores for one batch
@@ -192,14 +200,26 @@ def get_pcr_before_after(
     # PCR on adata before integration
     if recompute_pcr:
         before = pc_regression(
-            raw_sub.X, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
+            raw_sub.X,
+            covariate,
+            pca_var=None,
+            n_comps=n_comps,
+            verbose=verbose,
+            n_threads=n_threads,
+            linreg_method=linreg_method,
         )
     else:
         before = pd.Series(raw_sub.uns[pcr_key])
 
     # PCR on adata after integration
     after = pc_regression(
-        int_sub, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
+        int_sub,
+        covariate,
+        pca_var=None,
+        n_comps=n_comps,
+        verbose=verbose,
+        n_threads=n_threads,
+        linreg_method=linreg_method,
     )
 
     return before, after