Commit

Merge branch 'main' into template-update-YosefLab-scib-metrics-v0.3.0
adamgayoso authored Jan 4, 2024
2 parents 6cae23b + 2b48a60 commit aeed425
Showing 23 changed files with 589 additions and 565 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.1
+current_version = 0.5.0
tag = True
commit = True

2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
@@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
python: ["3.9", "3.10"]
python: ["3.9", "3.10", "3.11"]
os: [ubuntu-latest]

env:
4 changes: 4 additions & 0 deletions .pre-commit-config.yaml
@@ -73,3 +73,7 @@ repos:
mdformat-myst,
]
args: [--nbqa-md]
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.6.1
+    hooks:
+      - id: nbstripout
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,10 @@ and this project adheres to [Semantic Versioning][].
[keep a changelog]: https://keepachangelog.com/en/1.0.0/
[semantic versioning]: https://semver.org/spec/v2.0.0.html

+## 0.5.0 (2024-MM-DD)
+
+- Refactor all relevant metrics to use `NeighborsResults` as input instead of sparse distance/connectivity matrices.

## 0.4.1 (2023-10-08)

- Fix KMeans. All previous versions had a bug with KMeans, and ARI/NMI metrics are not reliable with this clustering. ([#115][])
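To make the 0.5.0 refactor concrete, here is a minimal sketch of the new call pattern. The `pynndescent` helper and `ilisi_knn` are listed in `docs/api.md`; the exact signatures, the k value, and the random data below are illustrative assumptions, not confirmed by this commit.

```python
import numpy as np

import scib_metrics
from scib_metrics.nearest_neighbors import pynndescent

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 30))          # hypothetical latent embedding
batches = rng.integers(0, 2, size=500)  # hypothetical batch labels

# Compute neighbors once, then pass the NeighborsResults object to
# metrics directly instead of sparse distance/connectivity matrices.
neighbors = pynndescent(X, n_neighbors=50)
ilisi = scib_metrics.ilisi_knn(neighbors, batches)
```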
2 changes: 1 addition & 1 deletion docs/api.md
@@ -82,7 +82,7 @@ scib_metrics.ilisi_knn(...)
nearest_neighbors.pynndescent
nearest_neighbors.jax_approx_min_k
-nearest_neighbors.NeighborsOutput
+nearest_neighbors.NeighborsResults
```

## Settings
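For reference, a sketch of the renamed container. The attribute names (`indices`, `knn_graph_distances`, `knn_graph_connectivities`) come from the code changes further down this page; treating the class as a dataclass constructed from per-cell neighbor indices and distances is an assumption.

```python
import numpy as np

from scib_metrics.nearest_neighbors import NeighborsResults

# Toy KNN result for 3 cells with k=2, self included in column 0
# (assumed constructor; field names inferred from this diff).
indices = np.array([[0, 1], [1, 2], [2, 1]])
distances = np.array([[0.0, 0.4], [0.0, 0.3], [0.0, 0.3]])
res = NeighborsResults(indices=indices, distances=distances)

# Sparse graphs derived from the result, as used by the metrics below.
dist_graph = res.knn_graph_distances       # distances, CSR format
conn_graph = res.knn_graph_connectivities  # connectivities, CSR format
```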
179 changes: 105 additions & 74 deletions docs/notebooks/large_scale.ipynb

Large diffs are not rendered by default.

518 changes: 210 additions & 308 deletions docs/notebooks/lung_example.ipynb

Large diffs are not rendered by default.

60 changes: 26 additions & 34 deletions pyproject.toml
@@ -5,17 +5,13 @@ requires = ["hatchling"]

[project]
name = "scib-metrics"
version = "0.4.1"
version = "0.5.0"
description = "Accelerated and Python-only scIB metrics"
readme = "README.md"
requires-python = ">=3.9"
-license = {file = "LICENSE"}
-authors = [
-    {name = "Adam Gayoso"},
-]
-maintainers = [
-    {name = "Adam Gayoso", email = "adamgayoso@berkeley.edu"},
-]
+license = { file = "LICENSE" }
+authors = [{ name = "Adam Gayoso" }]
+maintainers = [{ name = "Adam Gayoso", email = "adamgayoso@berkeley.edu" }]
urls.Documentation = "https://scib-metrics.readthedocs.io/"
urls.Source = "https://github.com/yoseflab/scib-metrics"
urls.Home-page = "https://github.com/yoseflab/scib-metrics"
@@ -35,13 +31,14 @@ dependencies = [
"matplotlib",
"plottable",
"tqdm",
"umap-learn>=0.5.0",
]

[project.optional-dependencies]
dev = [
# CLI for bumping the version number
"bump2version",
"pre-commit"
"pre-commit",
]
doc = [
"sphinx>=4",
@@ -66,30 +63,26 @@ test = [
"black",
"numba>=0.57.1",
]
-parallel = [
-    "joblib"
-]
+parallel = ["joblib"]
tutorial = [
"rich",
"scanorama",
"harmony-pytorch",
"scvi-tools",
"pyliger",
"numexpr", # missing liger dependency
"plotnine", # missing liger dependency
"mygene", # missing liger dependency
"goatools", # missing liger dependency
"adjustText", # missing liger dependency
"numexpr", # missing liger dependency
"plotnine", # missing liger dependency
"mygene", # missing liger dependency
"goatools", # missing liger dependency
"adjustText", # missing liger dependency
]

[tool.hatch.build.targets.wheel]
packages = ['src/scib_metrics']

[tool.coverage.run]
source = ["scib_metrics"]
-omit = [
-    "**/test_*.py",
-]
+omit = ["**/test_*.py"]

[tool.pytest.ini_options]
testpaths = ["tests"]
@@ -99,17 +92,17 @@ xfail_strict = true
src = ["src"]
line-length = 120
select = [
"F", # Errors detected by Pyflakes
"E", # Error detected by Pycodestyle
"W", # Warning detected by Pycodestyle
"I", # isort
"D", # pydocstyle
"B", # flake8-bugbear
"TID", # flake8-tidy-imports
"C4", # flake8-comprehensions
"BLE", # flake8-blind-except
"UP", # pyupgrade
"RUF100", # Report unused noqa directives
"F", # Errors detected by Pyflakes
"E", # Error detected by Pycodestyle
"W", # Warning detected by Pycodestyle
"I", # isort
"D", # pydocstyle
"B", # flake8-bugbear
"TID", # flake8-tidy-imports
"C4", # flake8-comprehensions
"BLE", # flake8-blind-except
"UP", # pyupgrade
"RUF100", # Report unused noqa directives
]
ignore = [
# line too long -> we accept long comment lines; black gets rid of long code lines
@@ -128,8 +121,7 @@ ignore = [
"B008",
# __magic__ methods are are often self-explanatory, allow missing docstrings
"D105",
-# first line should end with a period [Bug: doesn't work with single-line docstrings]
-"D400",
+# first line should end with a period [Bug: doesn't work with single-line docstrings] "D400",
# First line should be in imperative mood; try rephrasing
"D401",
## Disable one in each pair of mutually incompatible rules
@@ -185,5 +177,5 @@ skip = [
"docs/changelog.md",
"docs/references.bib",
"docs/references.md",
"docs/notebooks/example.ipynb"
"docs/notebooks/example.ipynb",
]
9 changes: 6 additions & 3 deletions src/scib_metrics/_graph_connectivity.py
@@ -1,10 +1,11 @@
import numpy as np
import pandas as pd
-from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

+from scib_metrics.nearest_neighbors import NeighborsResults
+

-def graph_connectivity(X: csr_matrix, labels: np.ndarray) -> float:
+def graph_connectivity(X: NeighborsResults, labels: np.ndarray) -> float:
"""Quantify the connectivity of the subgraph per cell type label.
Parameters
@@ -19,9 +20,11 @@ def graph_connectivity(X: csr_matrix, labels: np.ndarray) -> float:
# TODO(adamgayoso): Utils for validating inputs
clust_res = []

+graph = X.knn_graph_distances
+
for label in np.unique(labels):
mask = labels == label
-graph_sub = X[mask]
+graph_sub = graph[mask]
graph_sub = graph_sub[:, mask]
_, comps = connected_components(graph_sub, connection="strong")
tab = pd.value_counts(comps)
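A usage sketch for the updated signature. Only the `NeighborsResults` input is confirmed by this diff; the `pynndescent` helper, the k value, and the random data are illustrative assumptions.

```python
import numpy as np

from scib_metrics import graph_connectivity
from scib_metrics.nearest_neighbors import pynndescent

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 20))             # hypothetical embedding
labels = np.repeat(["B", "T", "NK"], 100)  # hypothetical cell-type labels

# The metric now pulls the distance graph from NeighborsResults
# (X.knn_graph_distances) instead of taking a csr_matrix directly.
neighbors = pynndescent(X, n_neighbors=15)
score = graph_connectivity(neighbors, labels)  # 1.0 = each label's subgraph fully connected
```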
50 changes: 22 additions & 28 deletions src/scib_metrics/_kbet.py
@@ -8,9 +8,9 @@
import numpy as np
import pandas as pd
import scipy
-from scipy.sparse import csr_matrix

-from scib_metrics.utils import convert_knn_graph_to_idx, diffusion_nn, get_ndarray
+from scib_metrics.nearest_neighbors import NeighborsResults
+from scib_metrics.utils import diffusion_nn, get_ndarray

from ._types import NdArray

@@ -40,7 +40,7 @@ def _kbet(neigh_batch_ids: jnp.ndarray, batches: jnp.ndarray, n_batches: int) ->
return test_statistics, p_values


-def kbet(X: csr_matrix, batches: np.ndarray, alpha: float = 0.05) -> float:
+def kbet(X: NeighborsResults, batches: np.ndarray, alpha: float = 0.05) -> float:
"""Compute kbet :cite:p:`buttner2018`.
This implementation is inspired by the implementation in Pegasus:
@@ -57,8 +57,7 @@ Parameters
Parameters
----------
X
-    Array of shape (n_cells, n_cells) with non-zero values
-    representing distances to exactly each cell's k nearest neighbors.
+    A :class:`~scib_metrics.utils.nearest_neighbors.NeighborsResults` object.
batches
Array of shape (n_cells,) representing batch values
for each cell.
@@ -73,16 +72,10 @@
Mean Kbet chi-square statistic over all cells.
pvalue_mean
Mean Kbet p-value over all cells.
-Notes
------
-This function requires X to be cell-cell distances, not connectivies.
"""
-if len(batches) != X.shape[0]:
+if len(batches) != len(X.indices):
raise ValueError("Length of batches does not match number of cells.")
-_, knn_idx = convert_knn_graph_to_idx(X)
-# Make sure self is included
-knn_idx = np.concatenate([np.arange(knn_idx.shape[0])[:, None], knn_idx], axis=1)
+knn_idx = X.indices
batches = np.asarray(pd.Categorical(batches).codes)
neigh_batch_ids = batches[knn_idx]
chex.assert_equal_shape([neigh_batch_ids, knn_idx])
Expand All @@ -96,7 +89,7 @@ def kbet(X: csr_matrix, batches: np.ndarray, alpha: float = 0.05) -> float:


def kbet_per_label(
-    X: csr_matrix,
+    X: NeighborsResults,
batches: np.ndarray,
labels: np.ndarray,
alpha: float = 0.05,
@@ -113,8 +106,7 @@ def kbet_per_label(
Parameters
----------
X
-    Array of shape (n_cells, n_cells) with non-zero values
-    representing connectivies to exactly each cell's k nearest neighbors.
+    A :class:`~scib_metrics.utils.nearest_neighbors.NeighborsResults` object.
batches
Array of shape (n_cells,) representing batch values
for each cell.
@@ -136,23 +128,25 @@
-----
This function requires X to be cell-cell connectivities, not distances.
"""
-if len(batches) != X.shape[0]:
+if len(batches) != len(X.indices):
raise ValueError("Length of batches does not match number of cells.")
-if len(labels) != X.shape[0]:
+if len(labels) != len(X.indices):
raise ValueError("Length of labels does not match number of cells.")
# set upper bound for k0
size_max = 2**31 - 1
batches = np.asarray(pd.Categorical(batches).codes)
labels = np.asarray(labels)

+conn_graph = X.knn_graph_connectivities
+
# prepare call of kBET per cluster
kbet_scores = {"cluster": [], "kBET": []}
for clus in np.unique(labels):
# subset by label
mask = labels == clus
-X_sub = X[mask, :][:, mask]
-X_sub.sort_indices()
-n_obs = X_sub.shape[0]
+conn_graph_sub = conn_graph[mask, :][:, mask]
+conn_graph_sub.sort_indices()
+n_obs = conn_graph_sub.shape[0]
batches_sub = batches[mask]

# check if neighborhood size too small or only one batch in subset
@@ -166,12 +160,12 @@
if k0 * n_obs >= size_max:
k0 = np.floor(size_max / n_obs).astype("int")

-n_comp, labs = scipy.sparse.csgraph.connected_components(X_sub, connection="strong")
+n_comp, labs = scipy.sparse.csgraph.connected_components(conn_graph_sub, connection="strong")

if n_comp == 1: # a single component to compute kBET on
try:
diffusion_n_comps = np.min([diffusion_n_comps, n_obs - 1])
-nn_graph_sub = diffusion_nn(X_sub, k=k0, n_comps=diffusion_n_comps).astype("float")
+nn_graph_sub = diffusion_nn(conn_graph_sub, k=k0, n_comps=diffusion_n_comps)
# call kBET
score, _, _ = kbet(
nn_graph_sub,
@@ -192,15 +186,15 @@
# check if 75% of all cells can be used for kBET run
if len(idx_nonan) / len(labs) >= 0.75:
# create another subset of components, assume they are not visited in a diffusion process
-X_sub_sub = X_sub[idx_nonan, :][:, idx_nonan]
-X_sub_sub.sort_indices()
+conn_graph_sub_sub = conn_graph_sub[idx_nonan, :][:, idx_nonan]
+conn_graph_sub_sub.sort_indices()

try:
-diffusion_n_comps = np.min([diffusion_n_comps, X_sub_sub.shape[0] - 1])
-nn_graph_sub_sub = diffusion_nn(X_sub_sub, k=k0, n_comps=diffusion_n_comps).astype("float")
+diffusion_n_comps = np.min([diffusion_n_comps, conn_graph_sub_sub.shape[0] - 1])
+nn_results_sub_sub = diffusion_nn(conn_graph_sub_sub, k=k0, n_comps=diffusion_n_comps)
# call kBET
score, _, _ = kbet(
-nn_graph_sub_sub,
+nn_results_sub_sub,
batches=batches_sub[idx_nonan],
alpha=alpha,
)
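Finally, a sketch of the refactored kBET entry points under the same assumptions (random data, `pynndescent` helper, default alpha). The three return values of `kbet` follow the Returns section above; `kbet_per_label` returning a single score is an assumption.

```python
import numpy as np

from scib_metrics import kbet, kbet_per_label
from scib_metrics.nearest_neighbors import pynndescent

rng = np.random.default_rng(0)
X = rng.normal(size=(400, 25))          # hypothetical embedding
batches = rng.integers(0, 2, size=400)  # two batches
labels = rng.integers(0, 4, size=400)   # four clusters

neighbors = pynndescent(X, n_neighbors=50)

# kbet returns three values (see the Returns section above): a score
# plus the mean chi-square statistic and mean p-value over all cells.
score, stat_mean, pvalue_mean = kbet(neighbors, batches, alpha=0.05)

# Label-stratified variant that recomputes neighbors per cluster via
# diffusion_nn, as in the code above.
per_label_score = kbet_per_label(neighbors, batches, labels)
```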
