Skip to content

Commit

Permalink
Optimise PC regression (#408)
Browse files Browse the repository at this point in the history
* add numpy linreg function and use in pcr

* allow configuring linreg method for pcr methods

* simplify numpy linreg function

* add tests for different linreg implementations

* set PCA recomputation to False by default

* use multithreading for linear regression and add tqdm

* add durations to test command

* add durations flag to pytest command

* add min duration flag

* add comments for ThreadPoolExecutor run

* allow to specify n_threads and linreg algorithm

* add keywords to cell cycle scoring

* add multiple threading in pcr test

* use nanmean

* test cell cycle score with numpy linreg

* use more memory-efficient implementation of ThreadPool

* fix cases where there are no residuals

* allow to configure svd_solver

* use multiple linear regression and remove multithreading approach

* use pcr score from numpy exact OLS implementation

* remove testing code

* include old pcr implementation

* Update scib/metrics/cell_cycle.py

* change default to numpy
  • Loading branch information
mumichae authored Dec 20, 2024
1 parent d357273 commit e614d35
Show file tree
Hide file tree
Showing 8 changed files with 247 additions and 60 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ jobs:
- name: Test with pytest
if: ${{ matrix.os != 'macos-latest'}}
run: |
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv
pytest --cov=scib --cov-report=xml -vv --ignore=tests/integration/ --ignore=tests/metrics/rpy2 -vv --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_metrics_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -98,7 +98,7 @@ jobs:
- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2
pytest --cov=scib --cov-report=xml -vv --tb=native -k rpy2 --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_rpy2_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
- name: Upload coverage to GitHub Actions
Expand Down Expand Up @@ -129,7 +129,7 @@ jobs:
- name: Test with pytest
run: |
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration
pytest --cov=scib --cov-report=xml -vv --tb=native -k integration --durations 0 --durations-min=1.0
mv coverage.xml "$(echo 'coverage_integration_${{ matrix.os }}_${{ matrix.python }}.xml' | sed 's/[^a-z0-9\.\/]/_/g')"
- name: Upload coverage to GitHub Actions
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ build-backend = "setuptools.build_meta"
log_cli = 'True'
log_cli_level = 'INFO'
addopts = '-p no:warnings'
durations = 0
48 changes: 34 additions & 14 deletions scib/metrics/cell_cycle.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
import pandas as pd
from tqdm import tqdm

from ..preprocessing import score_cell_cycle
from ..utils import check_adata
Expand All @@ -11,12 +12,14 @@ def cell_cycle(
adata_post,
batch_key,
embed=None,
agg_func=np.mean,
agg_func=np.nanmean,
organism="mouse",
n_comps=50,
recompute_cc=True,
precompute_pcr_key=None,
verbose=False,
linreg_method="numpy",
n_threads=1,
):
"""Cell cycle conservation score
Expand Down Expand Up @@ -44,6 +47,7 @@ def cell_cycle(
precomputed scores if available as 'S_score' and 'G2M_score' in ``adata_post.obs``
:param precompute_pcr_key: Key in adata_pre for precomputed PCR values for cell
cycle scores. Ignores cell cycle scores in adata_pre if present.
:param n_threads: Number of threads for linear regressions per principal component
:return:
A score between 1 and 0. The larger the score, the stronger the cell cycle
Expand All @@ -70,11 +74,6 @@ def cell_cycle(
if embed == "X_pca":
embed = None

batches = adata_pre.obs[batch_key].unique()
scores_final = []
scores_before = []
scores_after = []

recompute_cc = (
recompute_cc
or "S_score" not in adata_pre.obs_keys()
Expand All @@ -84,19 +83,26 @@ def cell_cycle(
precompute_pcr_key is None or precompute_pcr_key not in adata_pre.uns_keys()
)

for batch in batches:
batches = adata_pre.obs[batch_key].unique()
scores_before = []
scores_after = []
scores_final = []

for batch in tqdm(batches):
before, after = get_pcr_before_after(
adata_pre,
adata_post,
batch_key=batch_key,
batch=batch,
embed=embed,
organism=organism,
pcr_key=precompute_pcr_key,
recompute_cc=recompute_cc,
recompute_pcr=recompute_pcr,
pcr_key=precompute_pcr_key,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

# scale result
Expand Down Expand Up @@ -140,11 +146,13 @@ def get_pcr_before_after(
batch,
embed,
organism,
recompute_cc,
recompute_pcr,
pcr_key,
n_comps,
verbose,
recompute_cc=False,
recompute_pcr=False,
n_comps=50,
verbose=True,
n_threads=1,
linreg_method="numpy",
):
"""
Principal component regression value on cell cycle scores for one batch
Expand Down Expand Up @@ -192,14 +200,26 @@ def get_pcr_before_after(
# PCR on adata before integration
if recompute_pcr:
before = pc_regression(
raw_sub.X, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
raw_sub.X,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)
else:
before = pd.Series(raw_sub.uns[pcr_key])

# PCR on adata after integration
after = pc_regression(
int_sub, covariate, pca_var=None, n_comps=n_comps, verbose=verbose
int_sub,
covariate,
pca_var=None,
n_comps=n_comps,
verbose=verbose,
n_threads=n_threads,
linreg_method=linreg_method,
)

return before, after
Loading

0 comments on commit e614d35

Please sign in to comment.