Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport PR #3048: (feat): raising errors where backed is not supported #3072

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes/1.10.2.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
```

* Compatibility with `matplotlib` 3.9 {pr}`2999` {smaller}`I Virshup`
* Add clear errors where `backed` mode-like matrices (i.e., from `sparse_dataset`) are not supported {pr}`3048` {smaller}`I gold`
* Fix deprecated use of `.A` with sparse matrices {pr}`3084` {smaller}`P Angerer`

```{rubric} Performance
Expand Down
19 changes: 19 additions & 0 deletions scanpy/_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from typing import TYPE_CHECKING, overload
from weakref import WeakSet

import h5py
import numpy as np
from anndata import __version__ as anndata_version
from packaging.version import Version
Expand All @@ -33,6 +34,13 @@
from .._settings import settings
from .compute.is_constant import is_constant # noqa: F401

if Version(anndata_version) >= Version("0.10.0"):
from anndata._core.sparse_dataset import (
BaseCompressedSparseDataset as SparseDataset,
)
else:
from anndata._core.sparse_dataset import SparseDataset

if TYPE_CHECKING:
from collections.abc import Mapping
from pathlib import Path
Expand Down Expand Up @@ -1084,3 +1092,14 @@ def _resolve_axis(
if axis in {1, "var"}:
return (1, "var")
raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}")


def is_backed_type(X: object) -> bool:
return isinstance(X, (SparseDataset, h5py.File, h5py.Dataset))


def raise_not_implemented_error_if_backed_type(X: object, method_name: str) -> None:
if is_backed_type(X):
raise NotImplementedError(
f"{method_name} is not implemented for matrices of type {type(X)}"
)
11 changes: 9 additions & 2 deletions scanpy/preprocessing/_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from .. import logging as logg
from .._compat import DaskArray, pkg_version
from .._settings import settings
from .._utils import _doc_params, _empty
from .._utils import _doc_params, _empty, is_backed_type
from ..get import _check_mask, _get_obs_rep
from ._docs import doc_mask_var_hvg
from ._utils import _get_mean_var
Expand Down Expand Up @@ -173,6 +173,10 @@ def pca(
)
data_is_AnnData = isinstance(data, AnnData)
if data_is_AnnData:
if layer is None and not chunked and is_backed_type(data.X):
raise NotImplementedError(
f"PCA is not implemented for matrices of type {type(data.X)} with chunked as False"
)
adata = data.copy() if copy else data
else:
if pkg_version("anndata") < Version("0.8.0rc1"):
Expand All @@ -195,7 +199,10 @@ def pca(
logg.info(f" with n_comps={n_comps}")

X = _get_obs_rep(adata_comp, layer=layer)

if is_backed_type(X) and layer is not None:
raise NotImplementedError(
f"PCA is not implemented for matrices of type {type(X)} from layers"
)
# See: https://github.com/scverse/scanpy/pull/2816#issuecomment-1932650529
if (
Version(ad.__version__) < Version("0.9")
Expand Down
2 changes: 2 additions & 0 deletions scanpy/preprocessing/_scale.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from .._utils import (
_check_array_function_arguments,
axis_mul_or_truediv,
raise_not_implemented_error_if_backed_type,
renamed_arg,
view_to_actual,
)
Expand Down Expand Up @@ -298,6 +299,7 @@ def scale_anndata(
mask_obs = _check_mask(adata, mask_obs, "obs")
view_to_actual(adata)
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
raise_not_implemented_error_if_backed_type(X, "scale")
X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale(
X,
zero_center=zero_center,
Expand Down
15 changes: 15 additions & 0 deletions scanpy/preprocessing/_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from .._utils import (
_check_array_function_arguments,
axis_sum,
is_backed_type,
raise_not_implemented_error_if_backed_type,
renamed_arg,
sanitize_anndata,
view_to_actual,
Expand Down Expand Up @@ -145,6 +147,7 @@ def filter_cells(
"`min_genes`, `max_counts`, `max_genes` per call."
)
if isinstance(data, AnnData):
raise_not_implemented_error_if_backed_type(data.X, "filter_cells")
adata = data.copy() if copy else data
cell_subset, number = materialize_as_ndarray(
filter_cells(
Expand Down Expand Up @@ -260,6 +263,7 @@ def filter_genes(
)

if isinstance(data, AnnData):
raise_not_implemented_error_if_backed_type(data.X, "filter_genes")
adata = data.copy() if copy else data
gene_subset, number = materialize_as_ndarray(
filter_genes(
Expand Down Expand Up @@ -405,10 +409,19 @@ def log1p_anndata(
raise NotImplementedError(
"Currently cannot perform chunked operations on arrays not stored in X."
)
if adata.isbacked and adata.file._filemode != "r+":
raise NotImplementedError(
"log1p is not implemented for backed AnnData with backed mode not r+"
)
for chunk, start, end in adata.chunked_X(chunk_size):
adata.X[start:end] = log1p(chunk, base=base, copy=False)
else:
X = _get_obs_rep(adata, layer=layer, obsm=obsm)
if is_backed_type(X):
msg = f"log1p is not implemented for matrices of type {type(X)}"
if layer is not None:
raise NotImplementedError(f"{msg} from layers")
raise NotImplementedError(f"{msg} without `chunked=True`")
X = log1p(X, copy=False, base=base)
_set_obs_rep(adata, X, layer=layer, obsm=obsm)

Expand Down Expand Up @@ -647,6 +660,7 @@ def regress_out(
keys = [keys]

X = _get_obs_rep(adata, layer=layer)
raise_not_implemented_error_if_backed_type(X, "regress_out")

if issparse(X):
logg.info(" sparse input is densified and may " "lead to high memory use")
Expand Down Expand Up @@ -855,6 +869,7 @@ def downsample_counts(
`adata.X` : :class:`numpy.ndarray` | :class:`scipy.sparse.spmatrix` (dtype `float`)
Downsampled counts matrix.
"""
raise_not_implemented_error_if_backed_type(adata.X, "downsample_counts")
# This logic is all dispatch
total_counts_call = total_counts is not None
counts_per_cell_call = counts_per_cell is not None
Expand Down
98 changes: 98 additions & 0 deletions scanpy/tests/test_backed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from __future__ import annotations

from functools import partial

import pytest
from anndata import read_h5ad

import scanpy as sc


@pytest.mark.parametrize(
("name", "func", "msg"),
[
pytest.param("PCA", sc.pp.pca, " with chunked as False", id="pca"),
pytest.param(
"PCA", partial(sc.pp.pca, layer="X_copy"), " from layers", id="pca_layer"
),
pytest.param(
"regress_out",
partial(sc.pp.regress_out, keys=["n_counts", "percent_mito"]),
"",
id="regress_out",
),
pytest.param(
"dendrogram", partial(sc.tl.dendrogram, groupby="cat"), "", id="dendrogram"
),
pytest.param("tsne", sc.tl.tsne, "", id="tsne"),
pytest.param("scale", sc.pp.scale, "", id="scale"),
pytest.param(
"downsample_counts",
partial(sc.pp.downsample_counts, counts_per_cell=1000),
"",
id="downsample_counts",
),
pytest.param(
"filter_genes",
partial(sc.pp.filter_genes, max_cells=1000),
"",
id="filter_genes",
),
pytest.param(
"filter_cells",
partial(sc.pp.filter_cells, max_genes=1000),
"",
id="filter_cells",
),
pytest.param(
"rank_genes_groups",
partial(sc.tl.rank_genes_groups, groupby="cat"),
"",
id="rank_genes_groups",
),
pytest.param(
"score_genes",
partial(sc.tl.score_genes, gene_list=map(str, range(100))),
"",
id="score_genes",
),
],
)
def test_backed_error(backed_adata, name, func, msg):
with pytest.raises(
NotImplementedError,
match=f"{name} is not implemented for matrices of type {type(backed_adata.X)}{msg}",
):
func(backed_adata)


def test_log1p_backed_errors(backed_adata):
with pytest.raises(
NotImplementedError,
match="log1p is not implemented for backed AnnData with backed mode not r+",
):
sc.pp.log1p(backed_adata, chunked=True)
backed_adata.file.close()
backed_adata = read_h5ad(backed_adata.filename, backed="r+")
with pytest.raises(
NotImplementedError,
match=f"log1p is not implemented for matrices of type {type(backed_adata.X)} without `chunked=True`",
):
sc.pp.log1p(backed_adata)
backed_adata.layers["X_copy"] = backed_adata.X
layer_type = type(backed_adata.layers["X_copy"])
with pytest.raises(
NotImplementedError,
match=f"log1p is not implemented for matrices of type {layer_type} from layers",
):
sc.pp.log1p(backed_adata, layer="X_copy")
backed_adata.file.close()


def test_scatter_backed(backed_adata):
sc.pp.pca(backed_adata, chunked=True)
sc.pl.scatter(backed_adata, color="0", basis="pca")


def test_dotplot_backed(backed_adata):
sc.pl.dotplot(backed_adata, ["0", "1", "2", "3"], groupby="cat")
17 changes: 17 additions & 0 deletions scanpy/tests/test_ingest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import anndata
import numpy as np
import pytest
from sklearn.neighbors import KDTree
Expand Down Expand Up @@ -153,3 +154,19 @@ def test_ingest_map_embedding_umap():
umap_transformed_t = reducer.transform(T)

assert np.allclose(ing._obsm["X_umap"], umap_transformed_t)


def test_ingest_backed(adatas, tmp_path):
adata_ref = adatas[0].copy()
adata_new = adatas[1].copy()

adata_new.write_h5ad(f"{tmp_path}/new.h5ad")

adata_new = anndata.read_h5ad(f"{tmp_path}/new.h5ad", backed="r")

ing = sc.tl.Ingest(adata_ref)
with pytest.raises(
NotImplementedError,
match=f"Ingest.fit is not implemented for matrices of type {type(adata_new.X)}",
):
ing.fit(adata_new)
4 changes: 3 additions & 1 deletion scanpy/tools/_dendrogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .. import logging as logg
from .._compat import old_positionals
from .._utils import _doc_params
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
from ..neighbors._doc import doc_n_pcs, doc_use_rep
from ._utils import _choose_representation

Expand Down Expand Up @@ -117,6 +117,8 @@ def dendrogram(
>>> markers = ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ']
>>> sc.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True)
"""

raise_not_implemented_error_if_backed_type(adata.X, "dendrogram")
if isinstance(groupby, str):
# if not a list, turn into a list
groupby = [groupby]
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .. import logging as logg
from .._compat import old_positionals, pkg_version
from .._settings import settings
from .._utils import NeighborsView
from .._utils import NeighborsView, raise_not_implemented_error_if_backed_type
from .._utils._doctests import doctest_skip
from ..neighbors import FlatTree

Expand Down Expand Up @@ -392,6 +392,7 @@ def fit(self, adata_new):
`adata` refers to the :class:`~anndata.AnnData` object
that is passed during the initialization of an Ingest instance.
"""
raise_not_implemented_error_if_backed_type(adata_new.X, "Ingest.fit")
ref_var_names = self._adata_ref.var_names.str.upper()
new_var_names = adata_new.var_names.str.upper()

Expand Down
7 changes: 5 additions & 2 deletions scanpy/tools/_rank_genes_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from .. import _utils
from .. import logging as logg
from .._compat import old_positionals
from .._utils import check_nonnegative_integers
from .._utils import (
check_nonnegative_integers,
raise_not_implemented_error_if_backed_type,
)
from ..get import _check_mask
from ..preprocessing._utils import _get_mean_var

Expand Down Expand Up @@ -134,6 +137,7 @@ def __init__(
if use_raw and adata.raw is not None:
adata_comp = adata.raw
X = adata_comp.X
raise_not_implemented_error_if_backed_type(X, "rank_genes_groups")

# for correct getnnz calculation
if issparse(X):
Expand Down Expand Up @@ -594,7 +598,6 @@ def rank_genes_groups(
>>> # to visualize the results
>>> sc.pl.rank_genes_groups(adata)
"""

if mask_var is not None:
mask_var = _check_mask(adata, mask_var, "var")

Expand Down
6 changes: 5 additions & 1 deletion scanpy/tools/_score_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from scipy.sparse import issparse

from scanpy._utils import _check_use_raw
from scanpy._utils import _check_use_raw, is_backed_type

from .. import logging as logg
from .._compat import old_positionals
Expand Down Expand Up @@ -115,6 +115,10 @@ def score_genes(
start = logg.info(f"computing score {score_name!r}")
adata = adata.copy() if copy else adata
use_raw = _check_use_raw(adata, use_raw)
if is_backed_type(adata.X) and not use_raw:
raise NotImplementedError(
f"score_genes is not implemented for matrices of type {type(adata.X)}"
)

if random_state is not None:
np.random.seed(random_state)
Expand Down
3 changes: 2 additions & 1 deletion scanpy/tools/_tsne.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .. import logging as logg
from .._compat import old_positionals
from .._settings import settings
from .._utils import _doc_params
from .._utils import _doc_params, raise_not_implemented_error_if_backed_type
from ..neighbors._doc import doc_n_pcs, doc_use_rep
from ._utils import _choose_representation

Expand Down Expand Up @@ -106,6 +106,7 @@ def tsne(
start = logg.info("computing tSNE")
adata = adata.copy() if copy else adata
X = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs)
raise_not_implemented_error_if_backed_type(X, "tsne")
# params for sklearn
n_jobs = settings.n_jobs if n_jobs is None else n_jobs
params_sklearn = dict(
Expand Down
2 changes: 2 additions & 0 deletions src/testing/scanpy/_pytest/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from .data import (
_pbmc3ks_parametrized_session,
backed_adata,
pbmc3k_parametrized,
pbmc3k_parametrized_small,
)
Expand All @@ -27,6 +28,7 @@
"_pbmc3ks_parametrized_session",
"pbmc3k_parametrized",
"pbmc3k_parametrized_small",
"backed_adata",
]


Expand Down
Loading
Loading