From d111f046717f85a58544e3181d53f69ab2545796 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 11 Apr 2024 17:22:43 +0200
Subject: [PATCH 001/138] (feat): `read_elem_lazy` method

---
 src/anndata/_io/specs/__init__.py        |   6 +-
 src/anndata/_io/specs/lazy_methods.py    | 105 +++++++++++++++++++++++
 src/anndata/_io/specs/registry.py        |  24 ++++--
 src/anndata/tests/test_io_elementwise.py |  97 ++++++++++++++++-----
 4 files changed, 206 insertions(+), 26 deletions(-)
 create mode 100644 src/anndata/_io/specs/lazy_methods.py

diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py
index ceff8b3d6..8fd9898a3 100644
--- a/src/anndata/_io/specs/__init__.py
+++ b/src/anndata/_io/specs/__init__.py
@@ -1,21 +1,25 @@
 from __future__ import annotations
 
-from . import methods
+from . import lazy_methods, methods
 from .registry import (
+    _LAZY_REGISTRY,  # noqa: F401
     _REGISTRY,  # noqa: F401
     IOSpec,
     Reader,
     Writer,
     get_spec,
     read_elem,
+    read_elem_lazy,
     write_elem,
 )
 
 __all__ = [
     "methods",
+    "lazy_methods",
     "write_elem",
     "get_spec",
     "read_elem",
+    "read_elem_lazy",
     "Reader",
     "Writer",
     "IOSpec",
diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
new file mode 100644
index 000000000..16c5c93f6
--- /dev/null
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -0,0 +1,105 @@
+from __future__ import annotations
+
+import dask.array as da
+import h5py
+import numpy as np
+from scipy import sparse
+
+import anndata as ad
+from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup
+
+from .registry import _LAZY_REGISTRY, IOSpec
+
+# TODO: settings
+stride = 100
+h5_chunks = 1000
+
+
+def make_dask_array(is_csc, shape, make_dask_chunk, dtype):
+    chunks = [None, None]
+    major_index = int(is_csc)
+    minor_index = (is_csc + 1) % 2
+    chunks[minor_index] = (shape[minor_index],)
+    chunks[major_index] = (stride,) * (shape[major_index] // stride) + (
+        shape[major_index] % stride,
+    )
+    memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index]
+    da_mtx = da.map_blocks(
+        make_dask_chunk,
+        dtype=dtype,
+        chunks=chunks,
+        meta=memory_format((0, 0), dtype=np.float32),
+    )
+    return da_mtx
+
+
+def make_index(is_csc, stride, shape, block_id):
+    index = (
+        slice(
+            block_id[is_csc] * stride,
+            min((block_id[is_csc] * stride) + stride, shape[0]),
+        ),
+    )
+    if is_csc:
+        return (slice(None, None, None),) + index
+    return index
+
+
+@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
+@_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
+def read_sparse_as_dask_h5(elem, _reader):
+    filename = elem.file.filename
+    elem_name = elem.name
+    with h5py.File(filename, "r") as f:
+        e = f[elem_name]
+        shape = e.attrs["shape"]
+        encoding_type = e.attrs["encoding-type"]
+        dtype = e["data"].dtype
+        is_csc = encoding_type == "csc_matrix"
+
+    def make_dask_chunk(block_id=None):
+        # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`
+        # https://github.com/scverse/anndata/issues/1105
+        with h5py.File(filename, "r") as f:
+            mtx = ad.experimental.sparse_dataset(f[elem_name])
+            index = make_index(is_csc, stride, shape, block_id)
+            chunk = mtx[*index]
+        return chunk
+
+    return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
+
+
+@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
+@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
+def read_sparse_as_dask_zarr(elem, _reader):
+    shape = elem.attrs["shape"]
+    dtype = elem["data"].dtype
+    is_csc = elem.attrs["encoding-type"] == "csc_matrix"
+
+    def make_dask_chunk(block_id=None):
+        mtx = ad.experimental.sparse_dataset(elem)
+        index = make_index(is_csc, stride, shape, block_id)
+        return mtx[*index]
+
+    return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
+
+
+@_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
+def read_h5_array(elem, _reader):
+    if not hasattr(elem, "chunks") or elem.chunks is None:
+        return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape))
+    return da.from_array(elem)
+
+
+@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
+def read_h5_string_array(elem, _reader):
+    from anndata._io.h5ad import read_dataset
+
+    elem = read_dataset(elem)
+    return read_h5_array(elem, _reader)
+
+
+@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
+@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
+def read_zarr_array(elem, _reader):
+    return da.from_zarr(elem)
diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index a8357295d..b422ff223 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -44,7 +44,7 @@ def _from_read_parts(
     ) -> IORegistryError:
         # TODO: Improve error message if type exists, but version does not
         msg = (
-            f"No {method} method registered for {spec} from {src_typ}. "
+            f"No {method} method registered for {spec} from {src_typ} in registry {registry}. "
             "You may need to update your installation of anndata."
         )
         return cls(msg)
@@ -145,9 +145,7 @@ def get_reader(
         if (src_type, spec, modifiers) in self.read:
             return self.read[(src_type, spec, modifiers)]
         else:
-            raise IORegistryError._from_read_parts(
-                "read", _REGISTRY.read, src_type, spec
-            )
+            raise IORegistryError._from_read_parts("read", self.read, src_type, spec)
 
     def has_reader(
         self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset()
@@ -176,7 +174,7 @@ def get_partial_reader(
             return self.read_partial[(src_type, spec, modifiers)]
         else:
             raise IORegistryError._from_read_parts(
-                "read_partial", _REGISTRY.read_partial, src_type, spec
+                "read_partial", self.read_partial, src_type, spec
             )
 
     def get_spec(self, elem: Any) -> IOSpec:
@@ -188,6 +186,7 @@ def get_spec(self, elem: Any) -> IOSpec:
 
 
 _REGISTRY = IORegistry()
+_LAZY_REGISTRY = IORegistry()
 
 
 @singledispatch
@@ -332,6 +331,21 @@ def read_elem(elem: StorageType) -> Any:
     return Reader(_REGISTRY).read_elem(elem)
 
 
+def read_elem_lazy(elem: StorageType) -> Any:
+    """
+    Read an element from a store lazily.
+
+    Assumes that the element is encoded using the anndata encoding. This function will
+    determine the encoded type using the encoding metadata stored in elem's attributes.
+
+    Params
+    ------
+    elem
+        The stored element.
+    """
+    return Reader(_LAZY_REGISTRY).read_elem(elem)
+
+
 def write_elem(
     store: GroupStorageType,
     k: str,
diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py
index cd43d57ca..aae470c54 100644
--- a/src/anndata/tests/test_io_elementwise.py
+++ b/src/anndata/tests/test_io_elementwise.py
@@ -15,7 +15,14 @@
 from scipy import sparse
 
 import anndata as ad
-from anndata._io.specs import _REGISTRY, IOSpec, get_spec, read_elem, write_elem
+from anndata._io.specs import (
+    _REGISTRY,
+    IOSpec,
+    get_spec,
+    read_elem,
+    read_elem_lazy,
+    write_elem,
+)
 from anndata._io.specs.registry import IORegistryError
 from anndata.compat import H5Group, ZarrGroup, _read_attr
 from anndata.tests.helpers import (
@@ -47,6 +54,46 @@ def store(request, tmp_path) -> H5Group | ZarrGroup:
     file.close()
 
 
+sparse_formats = ["csr", "csc"]
+SIZE = 1000
+
+
+@pytest.fixture(scope="function", params=sparse_formats)
+def sparse_format(request):
+    return request.param
+
+
+def create_dense_store(store):
+    X = np.random.randn(SIZE, SIZE)
+
+    write_elem(store, "X", X)
+    return store
+
+
+def create_string_store(store):
+    X = np.arange(0, SIZE * SIZE).reshape((SIZE, SIZE)).astype(str)
+
+    write_elem(store, "X", X)
+    return store
+
+
+def create_sparse_store(sparse_format, store):
+    import dask.array as da
+
+    X = sparse.random(
+        SIZE,
+        SIZE,
+        format=sparse_format,
+        density=0.01,
+        random_state=np.random.default_rng(),
+    )
+    X_dask = da.from_array(X, chunks=(100, 100))
+
+    write_elem(store, "X", X)
+    write_elem(store, "X_dask", X_dask)
+    return store
+
+
 @pytest.mark.parametrize(
     "value,encoding_type",
     [
@@ -126,30 +173,40 @@ def test_io_spec_cupy(store, value, encoding_type):
     assert get_spec(store[key]) == _REGISTRY.get_spec(value)
 
 
-@pytest.mark.parametrize("sparse_format", ["csr", "csc"])
-def test_dask_write_sparse(store, sparse_format):
-    import dask.array as da
+def test_dask_write_sparse(sparse_format, store):
+    x_sparse_store = create_sparse_store(sparse_format, store)
+    X_from_disk = read_elem(x_sparse_store["X"])
+    X_dask_from_disk = read_elem(x_sparse_store["X_dask"])
 
-    X = sparse.random(
-        1000,
-        1000,
-        format=sparse_format,
-        density=0.01,
-        random_state=np.random.default_rng(),
-    )
-    X_dask = da.from_array(X, chunks=(100, 100))
+    assert_equal(X_from_disk, X_dask_from_disk)
+    assert_equal(dict(x_sparse_store["X"].attrs), dict(x_sparse_store["X_dask"].attrs))
 
-    write_elem(store, "X", X)
-    write_elem(store, "X_dask", X_dask)
+    assert x_sparse_store["X_dask/indptr"].dtype == np.int64
+    assert x_sparse_store["X_dask/indices"].dtype == np.int64
 
-    X_from_disk = read_elem(store["X"])
-    X_dask_from_disk = read_elem(store["X_dask"])
+
+@pytest.mark.parametrize("arr_type", ["dense", "string", *sparse_formats])
+def test_read_lazy_2d_dask(arr_type, store):
+    if arr_type == "dense":
+        arr_store = create_dense_store(store)
+    elif arr_type == "string":
+        arr_store = create_string_store(store)
+    else:
+        arr_store = create_sparse_store(arr_type, store)
+    X_dask_from_disk = read_elem_lazy(arr_store["X"])
+    X_from_disk = read_elem(arr_store["X"])
 
     assert_equal(X_from_disk, X_dask_from_disk)
-    assert_equal(dict(store["X"].attrs), dict(store["X_dask"].attrs))
+    random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,))
+    random_bool_mask = np.random.randn(SIZE) > 0
+    index_slice = slice(0, SIZE // 10)
+    for index in [random_int_indices, index_slice, random_bool_mask]:
+        assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :])
+        assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index])
 
-    assert store["X_dask/indptr"].dtype == np.int64
-    assert store["X_dask/indices"].dtype == np.int64
+    if arr_type in {"csr", "csc"}:
+        assert arr_store["X_dask/indptr"].dtype == np.int64
+        assert arr_store["X_dask/indices"].dtype == np.int64
 
 
 def test_io_spec_raw(store):
@@ -178,7 +235,7 @@ def test_write_anndata_to_root(store):
    ["attribute", "value"],
    [
        ("encoding-type", "floob"),
-        ("encoding-version", "10000.0"),
+        ("encoding-version", "SIZE0.0"),
    ],
 )
 def test_read_iospec_not_found(store, attribute, value):
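For orientation before the follow-up commits: a minimal usage sketch of the reader this first patch introduces. The file name is hypothetical (any store written with anndata's encoding works), and the function is renamed to `read_elem_as_dask` later in this series.

```python
import h5py

from anndata._io.specs import read_elem_lazy

# "adata.h5ad" is a placeholder path; compute before the file is closed.
with h5py.File("adata.h5ad", "r") as f:
    X = read_elem_lazy(f["X"])  # a dask array; no data is read yet
    print(X.chunks)             # chunked along the major axis
    X_mem = X.compute()         # I/O happens here, chunk by chunk
```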
From 00be7f02cb93a2affb65ccd195a219aae8328f4e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 11 Apr 2024 17:26:34 +0200
Subject: [PATCH 002/138] (revert): error message

---
 src/anndata/_io/specs/registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index b422ff223..a2e62db7a 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -44,7 +44,7 @@ def _from_read_parts(
     ) -> IORegistryError:
         # TODO: Improve error message if type exists, but version does not
         msg = (
-            f"No {method} method registered for {spec} from {src_typ} in registry {registry}. "
+            f"No {method} method registered for {spec} from {src_typ}. "
             "You may need to update your installation of anndata."
         )
         return cls(msg)

From fd635d771aa65987ede03042ffe3c29548a6b6bc Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 11 Apr 2024 17:29:24 +0200
Subject: [PATCH 003/138] (refactor): declare `is_csc` reading elem directly in h5

---
 src/anndata/_io/specs/lazy_methods.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 16c5c93f6..dcdc5160a 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -53,9 +53,8 @@ def read_sparse_as_dask_h5(elem, _reader):
     with h5py.File(filename, "r") as f:
         e = f[elem_name]
         shape = e.attrs["shape"]
-        encoding_type = e.attrs["encoding-type"]
         dtype = e["data"].dtype
-        is_csc = encoding_type == "csc_matrix"
+        is_csc = e.attrs["encoding-type"] == "csc_matrix"
 
     def make_dask_chunk(block_id=None):

From f5e7fda7049fb33a13dea821cbe2d8f1eb513988 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 12 Apr 2024 10:25:05 +0200
Subject: [PATCH 004/138] (chore): `read_elem_lazy` -> `read_elem_as_dask`

---
 src/anndata/_io/specs/__init__.py        | 4 ++--
 src/anndata/_io/specs/registry.py        | 2 +-
 src/anndata/tests/test_io_elementwise.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/anndata/_io/specs/__init__.py b/src/anndata/_io/specs/__init__.py
index 8fd9898a3..5eadfdb50 100644
--- a/src/anndata/_io/specs/__init__.py
+++ b/src/anndata/_io/specs/__init__.py
@@ -9,7 +9,7 @@
     Writer,
     get_spec,
     read_elem,
-    read_elem_lazy,
+    read_elem_as_dask,
     write_elem,
 )
 
@@ -19,7 +19,7 @@
     "write_elem",
     "get_spec",
     "read_elem",
-    "read_elem_lazy",
+    "read_elem_as_dask",
     "Reader",
     "Writer",
     "IOSpec",
diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index a2e62db7a..7460d7f70 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -331,7 +331,7 @@ def read_elem(elem: StorageType) -> Any:
     return Reader(_REGISTRY).read_elem(elem)
 
 
-def read_elem_lazy(elem: StorageType) -> Any:
+def read_elem_as_dask(elem: StorageType) -> Any:
     """
     Read an element from a store lazily.
 
diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py
index aae470c54..07484e422 100644
--- a/src/anndata/tests/test_io_elementwise.py
+++ b/src/anndata/tests/test_io_elementwise.py
@@ -20,7 +20,7 @@
     IOSpec,
     get_spec,
     read_elem,
-    read_elem_lazy,
+    read_elem_as_dask,
     write_elem,
 )
@@ -193,7 +193,7 @@ def test_read_lazy_2d_dask(arr_type, store):
     else:
         arr_store = create_sparse_store(arr_type, store)
-    X_dask_from_disk = read_elem_lazy(arr_store["X"])
+    X_dask_from_disk = read_elem_as_dask(arr_store["X"])
     X_from_disk = read_elem(arr_store["X"])

From ae5396cfc0fa6ce47463453320dc6cbe45a520e2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 12 Apr 2024 10:28:57 +0200
Subject: [PATCH 005/138] (chore): remove string handling

---
 src/anndata/_io/specs/lazy_methods.py    |  9 ---------
 src/anndata/tests/test_io_elementwise.py | 11 +----------
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index dcdc5160a..d967d7591 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -90,15 +90,6 @@ def read_h5_array(elem, _reader):
     return da.from_array(elem)
 
 
-@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0"))
-def read_h5_string_array(elem, _reader):
-    from anndata._io.h5ad import read_dataset
-
-    elem = read_dataset(elem)
-    return read_h5_array(elem, _reader)
-
-
 @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
-@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0"))
 def read_zarr_array(elem, _reader):
     return da.from_zarr(elem)
diff --git a/src/anndata/tests/test_io_elementwise.py b/src/anndata/tests/test_io_elementwise.py
index 07484e422..2fb49162b 100644
--- a/src/anndata/tests/test_io_elementwise.py
+++ b/src/anndata/tests/test_io_elementwise.py
@@ -70,13 +70,6 @@ def create_dense_store(store):
     return store
 
 
-def create_string_store(store):
-    X = np.arange(0, SIZE * SIZE).reshape((SIZE, SIZE)).astype(str)
-
-    write_elem(store, "X", X)
-    return store
-
-
 def create_sparse_store(sparse_format, store):
     import dask.array as da
 
@@ -185,12 +178,10 @@ def test_dask_write_sparse(sparse_format, store):
     assert x_sparse_store["X_dask/indices"].dtype == np.int64
 
 
-@pytest.mark.parametrize("arr_type", ["dense", "string", *sparse_formats])
+@pytest.mark.parametrize("arr_type", ["dense", *sparse_formats])
 def test_read_lazy_2d_dask(arr_type, store):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
-    elif arr_type == "string":
-        arr_store = create_string_store(store)
     else:
         arr_store = create_sparse_store(arr_type, store)
     X_dask_from_disk = read_elem_as_dask(arr_store["X"])

From 664336aa30511f1b28ba55d3d8b3028dbcd74eda Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 12 Apr 2024 10:36:24 +0200
Subject: [PATCH 006/138] (refactor): use `elem` for h5 where possible

---
 src/anndata/_io/specs/lazy_methods.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index d967d7591..2f392db00 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -50,11 +50,9 @@ def read_sparse_as_dask_h5(elem, _reader):
     filename = elem.file.filename
     elem_name = elem.name
-    with h5py.File(filename, "r") as f:
-        e = f[elem_name]
-        shape = e.attrs["shape"]
-        dtype = e["data"].dtype
-        is_csc = e.attrs["encoding-type"] == "csc_matrix"
+    shape = elem.attrs["shape"]
+    dtype = elem["data"].dtype
+    is_csc = elem.attrs["encoding-type"] == "csc_matrix"
 
     def make_dask_chunk(block_id=None):

From 52002b6eeee9aa60dc7ccac3956e8c32e497a78a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 17 Apr 2024 12:37:34 +0200
Subject: [PATCH 007/138] (chore): remove invalid syntax

---
 src/anndata/_io/specs/lazy_methods.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 2f392db00..567561d03 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -60,7 +60,7 @@ def make_dask_chunk(block_id=None):
         with h5py.File(filename, "r") as f:
             mtx = ad.experimental.sparse_dataset(f[elem_name])
             index = make_index(is_csc, stride, shape, block_id)
-            chunk = mtx[*index]
+            chunk = mtx[index]
         return chunk
 
@@ -76,7 +76,7 @@ def make_dask_chunk(block_id=None):
         mtx = ad.experimental.sparse_dataset(elem)
         index = make_index(is_csc, stride, shape, block_id)
-        return mtx[*index]
+        return mtx[index]

From aa1006ea0813a0f6e48d378927a303e5184b5c9b Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 17 Apr 2024 13:44:03 +0200
Subject: [PATCH 008/138] (fix): put dask import inside function

---
 src/anndata/_io/specs/lazy_methods.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 567561d03..a4445c77d 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import dask.array as da
 import h5py
 import numpy as np
 from scipy import sparse
@@ -16,6 +15,8 @@
 
 def make_dask_array(is_csc, shape, make_dask_chunk, dtype):
+    import dask.array as da
+
     chunks = [None, None]
     major_index = int(is_csc)
     minor_index = (is_csc + 1) % 2
@@ -83,6 +84,8 @@ def make_dask_chunk(block_id=None):
 
 @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
 def read_h5_array(elem, _reader):
+    import dask.array as da
+
     if not hasattr(elem, "chunks") or elem.chunks is None:
         return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape))
     return da.from_array(elem)
@@ -90,4 +93,6 @@ def read_h5_array(elem, _reader):
 
 @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
 def read_zarr_array(elem, _reader):
+    import dask.array as da
+
     return da.from_zarr(elem)
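Before the next refactor lands, it may help to pin down what `make_index` computes: dask hands the chunk factory a `block_id`, and the helper turns it into a slice along the major axis. A condensed restatement with concrete, illustrative numbers (`stride=100`; shapes are made up for the example):

```python
def make_index(is_csc, stride, shape, block_id):
    # same logic as the patched code: pick the block coordinate on the major axis
    index1d = slice(
        block_id[is_csc] * stride,
        min(block_id[is_csc] * stride + stride, shape[0]),
    )
    return (slice(None), index1d) if is_csc else (index1d,)

# CSR: block (3, 0) of a 1000 x 500 matrix covers rows 300:400
assert make_index(False, 100, (1000, 500), (3, 0)) == (slice(300, 400),)
# CSC: block (0, 3) selects all rows and columns 300:400
assert make_index(True, 100, (500, 1000), (0, 3)) == (slice(None), slice(300, 400))
```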
---
 src/anndata/_io/specs/lazy_methods.py | 43 +++++++++++++++------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index a4445c77d..6931fae0f 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
+from pathlib import Path
+
 import h5py
 import numpy as np
 from scipy import sparse
@@ -46,11 +49,28 @@ def make_index(is_csc, stride, shape, block_id):
     return index
 
 
+@contextmanager
+def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str):
+    if isinstance(filename_or_elem, str):
+        file = h5py.File(filename_or_elem, "r")
+        try:
+            yield file[elem_name]
+        finally:
+            file.close()
+    else:
+        try:
+            yield filename_or_elem
+        finally:
+            pass
+
+
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
+@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
+@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
 def read_sparse_as_dask_h5(elem, _reader):
-    filename = elem.file.filename
-    elem_name = elem.name
+    filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem
+    elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name
     shape = elem.attrs["shape"]
     dtype = elem["data"].dtype
     is_csc = elem.attrs["encoding-type"] == "csc_matrix"
 
     def make_dask_chunk(block_id=None):
         # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`
         # https://github.com/scverse/anndata/issues/1105
-        with h5py.File(filename, "r") as f:
-            mtx = ad.experimental.sparse_dataset(f[elem_name])
+        with maybe_open_h5(filename_or_elem, elem_name) as f:
+            mtx = ad.experimental.sparse_dataset(f)
             index = make_index(is_csc, stride, shape, block_id)
             chunk = mtx[index]
         return chunk
@@ -67,21 +87,6 @@ def make_dask_chunk(block_id=None):
     return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
 
 
-@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
-@_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
-def read_sparse_as_dask_zarr(elem, _reader):
-    shape = elem.attrs["shape"]
-    dtype = elem["data"].dtype
-    is_csc = elem.attrs["encoding-type"] == "csc_matrix"
-
-    def make_dask_chunk(block_id=None):
-        mtx = ad.experimental.sparse_dataset(elem)
-        index = make_index(is_csc, stride, shape, block_id)
-        return mtx[index]
-
-    return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
-
-
 @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
 def read_h5_array(elem, _reader):
     import dask.array as da

From 1fc4cc354bab82d075d6f23a85f40831b9bf6e99 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:00:15 +0200
Subject: [PATCH 010/138] (fix): revert `encoding-version`

---
 tests/test_io_elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 09872994c..d3ddcc5a9 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -243,7 +243,7 @@ def test_write_anndata_to_root(store):
    ["attribute", "value"],
    [
        ("encoding-type", "floob"),
-        ("encoding-version", "SIZE0.0"),
+        ("encoding-version", "10000.0"),
    ],
 )
 def test_read_iospec_not_found(store, attribute, value):

From 5ca71eaaa62a4eb75780981c93fb2643ae2bf416 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:00:56 +0200
Subject: [PATCH 011/138] (chore): document `create_sparse_store` test function

---
 tests/test_io_elementwise.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index d3ddcc5a9..12d5be98f 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -5,6 +5,7 @@
 from __future__ import annotations
 
 import re
+from typing import Literal, TypeVar
 
 import h5py
 import numpy as np
@@ -70,7 +71,21 @@ def create_dense_store(store):
     return store
 
 
-def create_sparse_store(sparse_format, store):
+G = TypeVar("G", bound=H5Group | ZarrGroup)
+
+
+def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G:
+    """Returns a store with sparse matrices written to it.
+
+    Parameters
+    ----------
+    sparse_format
+    store
+
+    Returns
+    -------
+    A store with a key `X` containing a sparse matrix, and `X_dask` containing the same array wrapped by dask
+    """
     import dask.array as da
 
     X = sparse.random(
@@ -80,7 +95,9 @@ def create_sparse_store(sparse_format, store):
         density=0.01,
         random_state=np.random.default_rng(),
     )
-    X_dask = da.from_array(X, chunks=(100, 100))
+    X_dask = da.from_array(
+        X,
+        chunks=(100 if sparse_format == "csr" else SIZE, SIZE if sparse_format == "csr" else 100),
+    )
 
     write_elem(store, "X", X)
     write_elem(store, "X_dask", X_dask)
     return store

From 3672c187a539af72bed3bab1b6a1858ee5ef787e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:11:30 +0200
Subject: [PATCH 012/138] (chore): sort indices to prevent warning

---
 tests/test_io_elementwise.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 12d5be98f..da34f621c 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -206,6 +206,7 @@ def test_read_lazy_2d_dask(arr_type, store):
     assert_equal(X_from_disk, X_dask_from_disk)
     random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,))
+    random_int_indices.sort()
     random_bool_mask = np.random.randn(SIZE) > 0
     index_slice = slice(0, SIZE // 10)

From 33c35998e1fc9cb61992c707942b503b30a3d8da Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:12:09 +0200
Subject: [PATCH 013/138] (fix): remove utility function `make_dask_array`

---
 src/anndata/_io/specs/lazy_methods.py | 38 ++++++++++++---------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 6931fae0f..42d01def1 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -17,26 +17,6 @@
 h5_chunks = 1000
 
 
-def make_dask_array(is_csc, shape, make_dask_chunk, dtype):
-    import dask.array as da
-
-    chunks = [None, None]
-    major_index = int(is_csc)
-    minor_index = (is_csc + 1) % 2
-    chunks[minor_index] = (shape[minor_index],)
-    chunks[major_index] = (stride,) * (shape[major_index] // stride) + (
-        shape[major_index] % stride,
-    )
-    memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index]
-    da_mtx = da.map_blocks(
-        make_dask_chunk,
-        dtype=dtype,
-        chunks=chunks,
-        meta=memory_format((0, 0), dtype=np.float32),
-    )
-    return da_mtx
-
-
 def make_index(is_csc, stride, shape, block_id):
     index = (
         slice(
@@ -69,6 +49,8 @@ def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str):
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
 def read_sparse_as_dask_h5(elem, _reader):
+    import dask.array as da
+
     filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem
     elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name
     shape = elem.attrs["shape"]
     dtype = elem["data"].dtype
     is_csc = elem.attrs["encoding-type"] == "csc_matrix"
 
     def make_dask_chunk(block_id=None):
         # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed`
         # https://github.com/scverse/anndata/issues/1105
         with maybe_open_h5(filename_or_elem, elem_name) as f:
             mtx = ad.experimental.sparse_dataset(f)
             index = make_index(is_csc, stride, shape, block_id)
             chunk = mtx[index]
         return chunk
 
-    return make_dask_array(is_csc, shape, make_dask_chunk, dtype)
+    chunks = [None, None]
+    major_index = int(is_csc)
+    minor_index = (is_csc + 1) % 2
+    chunks[minor_index] = (shape[minor_index],)
+    chunks[major_index] = (stride,) * (shape[major_index] // stride) + (
+        shape[major_index] % stride,
+    )
+    memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index]
+    da_mtx = da.map_blocks(
+        make_dask_chunk,
+        dtype=dtype,
+        chunks=chunks,
+        meta=memory_format((0, 0), dtype=np.float32),
+    )
+    return da_mtx

From 157e7103e4c23304d22da78b595814a91419af57 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:13:01 +0200
Subject: [PATCH 014/138] (chore): `read_sparse_as_dask_h5` -> `read_sparse_as_dask`

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 42d01def1..376257759 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -48,7 +48,7 @@
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
-def read_sparse_as_dask_h5(elem, _reader):
+def read_sparse_as_dask(elem, _reader):

From 375000d2cb6ad2cafcddd1938ad56a50202a432c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:28:36 +0200
Subject: [PATCH 015/138] (feat): make params of `h5_chunks` and `stride`

---
 src/anndata/_io/specs/lazy_methods.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 376257759..579b9f741 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -12,10 +12,6 @@
 
 from .registry import _LAZY_REGISTRY, IOSpec
 
-# TODO: settings
-stride = 100
-h5_chunks = 1000
-
 
 def make_index(is_csc, stride, shape, block_id):
     index = (
         slice(
@@ -48,7 +44,7 @@
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
-def read_sparse_as_dask(elem, _reader):
+def read_sparse_as_dask(elem, _reader, stride: int = 100):
@@ -84,11 +80,11 @@
 
 @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0"))
-def read_h5_array(elem, _reader):
+def read_h5_array(elem, _reader, chunk_size: int = 1000):
     import dask.array as da
 
     if not hasattr(elem, "chunks") or elem.chunks is None:
-        return da.from_array(elem, chunks=(h5_chunks,) * len(elem.shape))
+        return da.from_array(elem, chunks=(chunk_size,) * len(elem.shape))
     return da.from_array(elem)

From 241904a6860b311f64c69321137cda43e76c76f0 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:49:45 +0200
Subject: [PATCH 016/138] (chore): add distributed test

---
 tests/test_io_elementwise.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index da34f621c..fc773866e 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -218,6 +218,20 @@
     assert arr_store["X_dask/indices"].dtype == np.int64
 
 
+def test_read_lazy_h5_cluster(sparse_format, tmp_path):
+    import dask.distributed as dd
+
+    file = h5py.File(tmp_path / "test.h5", "w")
+    store = file["/"]
+    arr_store = create_sparse_store(sparse_format, store)
+    X_dask_from_disk = read_elem_as_dask(arr_store["X"])
+    X_from_disk = read_elem(arr_store["X"])
+    file.close()
+    with dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster:
+        with dd.Client(cluster) as client:  # noqa: F841
+            assert_equal(X_from_disk, X_dask_from_disk)
+
+
 @pytest.mark.parametrize("sparse_format", ["csr", "csc"])
 def test_write_indptr_dtype_override(store, sparse_format):
     X = sparse.random(

From 42d0d2212c77b6b88c787b4d2b18db382a3a9eb0 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 3 Jun 2024 14:51:04 +0200
Subject: [PATCH 017/138] (fix): `TypeVar` bind

---
 tests/test_io_elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index fc773866e..f71d32117 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -71,7 +71,7 @@ def create_dense_store(store):
     return store
 
 
-G = TypeVar("G", bound=H5Group | ZarrGroup)
+G = TypeVar("G", H5Group, ZarrGroup)
 
 
 def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G:

From 0bba2c062c653bcd2a565c379e8ba8af44f98096 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 4 Jun 2024 10:28:43 +0200
Subject: [PATCH 018/138] (chore): release note

---
 docs/release-notes/0.10.8.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md
index 52b743866..dbcd646a0 100644
--- a/docs/release-notes/0.10.8.md
+++ b/docs/release-notes/0.10.8.md
@@ -13,3 +13,5 @@
 
 ```{rubric} Performance
 ```
+
+* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold`

From 0d0b43a3617af616a67a4d716b492055daa15de5 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 5 Jun 2024 13:26:13 +0200
Subject: [PATCH 019/138] (chore): `0.10.8` -> `0.11.0`

---
 docs/release-notes/0.10.8.md | 2 --
 docs/release-notes/0.11.0.md | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md
index dbcd646a0..52b743866 100644
--- a/docs/release-notes/0.10.8.md
+++ b/docs/release-notes/0.10.8.md
@@ -13,5 +13,3 @@
 
 ```{rubric} Performance
 ```
-
-* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold`
diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md
index f19202871..3a883f415 100644
--- a/docs/release-notes/0.11.0.md
+++ b/docs/release-notes/0.11.0.md
@@ -7,6 +7,7 @@
 * Add `should_remove_unused_categories` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1340` {user}`ilan-gold`
 * `scipy.sparse.csr_array` and `scipy.sparse.csc_array` are now supported when constructing `AnnData` objects {pr}`1028` {user}`ilan-gold` {user}`isaac-virshup`
 * Add `should_check_uniqueness` option to `anndata.settings` to override current behavior. Default is `True` (i.e., previous behavior). Please refer to the [documentation](https://anndata.readthedocs.io/en/latest/generated/anndata.settings.html) for usage. {pr}`1507` {user}`ilan-gold`
+* Add `~anndata.experimental.read_elem_as_dask` function to handle i/o with sparse and dense arrays {pr}`1469` {user}`ilan-gold`
 
 ```{rubric} Bugfix
 ```

From c935fe02dcd938a17af166e23d19cf04b6389963 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 26 Jun 2024 10:54:30 +0200
Subject: [PATCH 020/138] (fix): `ruff` for default `pytest.fixture` `scope`

---
 tests/test_io_elementwise.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index f447e1fb4..30d728a29 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -59,7 +59,7 @@ def store(request, tmp_path) -> H5Group | ZarrGroup:
 SIZE = 1000
 
 
-@pytest.fixture(scope="function", params=sparse_formats)
+@pytest.fixture(params=sparse_formats)
 def sparse_format(request):
     return request.param
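The review suggestions in the next patch replace the hand-rolled chunk arithmetic with `np.divmod`. A small sketch of the layout that computes (numbers are illustrative; the guard against a zero-sized trailing chunk only lands in patch 027):

```python
import numpy as np

shape_major, stride = 1000, 300
n_strides, rest = np.divmod(shape_major, stride)
chunks_major = (stride,) * n_strides + ((rest,) if rest > 0 else ())
assert chunks_major == (300, 300, 300, 100)
assert sum(chunks_major) == shape_major  # dask requires chunks to sum to the shape
```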
From 23e0ea2f082225051cc1b0c7588b72b4179ab4a5 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Mon, 1 Jul 2024 17:17:30 +0200
Subject: [PATCH 021/138] Apply suggestions from code review

Co-authored-by: Philipp A.
---
 src/anndata/_io/specs/lazy_methods.py | 46 +++++++++++----------------
 tests/test_io_elementwise.py          |  6 ++--
 2 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 579b9f741..995099c79 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -14,30 +14,25 @@
 
 def make_index(is_csc, stride, shape, block_id):
-    index = (
-        slice(
-            block_id[is_csc] * stride,
-            min((block_id[is_csc] * stride) + stride, shape[0]),
-        ),
+    index1d = slice(
+        block_id[is_csc] * stride,
+        min((block_id[is_csc] * stride) + stride, shape[0]),
     )
     if is_csc:
-        return (slice(None, None, None),) + index
-    return index
+        return (slice(None, None, None), index1d)
+    return (index1d,)
 
 
 @contextmanager
-def maybe_open_h5(filename_or_elem: str | ZarrGroup, elem_name: str):
-    if isinstance(filename_or_elem, str):
-        file = h5py.File(filename_or_elem, "r")
-        try:
-            yield file[elem_name]
-        finally:
-            file.close()
-    else:
-        try:
-            yield filename_or_elem
-        finally:
-            pass
+def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str):
+    if not isinstance(path_or_group, Path):
+        yield path_or_group
+        return
+    file = h5py.File(path_or_group, "r")
+    try:
+        yield file[elem_name]
+    finally:
+        file.close()
@@ -57,11 +52,10 @@ def make_dask_chunk(block_id=None):
             chunk = mtx[index]
         return chunk
 
-    chunks = [None, None]
-    major_index = int(is_csc)
-    minor_index = (is_csc + 1) % 2
-    chunks[minor_index] = (shape[minor_index],)
-    chunks[major_index] = (stride,) * (shape[major_index] // stride) + (
-        shape[major_index] % stride,
-    )
-    memory_format = [sparse.csr_matrix, sparse.csc_matrix][major_index]
+    n_strides, rest = np.divmod(shape[major_index], stride)
+    chunks_major = (stride,) * n_strides + (rest,)
+    chunks_minor = (shape[minor_index],)
+    chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor)
+    memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix
     da_mtx = da.map_blocks(
         make_dask_chunk,
         dtype=dtype,
         chunks=chunks,
diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 30d728a29..732c3b641 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -247,8 +247,10 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):
     X_dask_from_disk = read_elem_as_dask(arr_store["X"])
     X_from_disk = read_elem(arr_store["X"])
     file.close()
-    with dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster:
-        with dd.Client(cluster) as client:  # noqa: F841
+    with (
+        dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster,
+        dd.Client(cluster) as _client,
+    ):
             assert_equal(X_from_disk, X_dask_from_disk)

From 5b96c771d41a23f33af7e38f2a0f790cb371be2d Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 1 Jul 2024 17:27:39 +0200
Subject: [PATCH 022/138] (fix): `Any` to `DaskArray`

---
 src/anndata/_io/specs/registry.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 7460d7f70..21f52beac 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -7,7 +7,7 @@
 from typing import TYPE_CHECKING, Any
 
 from anndata._io.utils import report_read_key_on_error, report_write_key_on_error
-from anndata.compat import _read_attr
+from anndata.compat import DaskArray, _read_attr
 
 if TYPE_CHECKING:
     from collections.abc import Callable, Generator, Iterable
@@ -331,7 +331,7 @@ def read_elem(elem: StorageType) -> Any:
     return Reader(_REGISTRY).read_elem(elem)
 
 
-def read_elem_as_dask(elem: StorageType) -> Any:
+def read_elem_as_dask(elem: StorageType) -> DaskArray:

From 0907a4ea086209af7394502d60b96e7d75738c90 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 1 Jul 2024 17:30:58 +0200
Subject: [PATCH 023/138] (fix): type `make_index` + fix undeclared

---
 src/anndata/_io/specs/lazy_methods.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 995099c79..cc860aaca 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -2,6 +2,7 @@
 
 from contextlib import contextmanager
 from pathlib import Path
+from typing import Literal, overload
 
 import h5py
 import numpy as np
@@ -13,7 +14,19 @@
 from .registry import _LAZY_REGISTRY, IOSpec
 
 
-def make_index(is_csc, stride, shape, block_id):
+@overload
+def make_index(
+    *, is_csc: Literal[True], stride: int, shape: tuple[int, int], block_id: int
+) -> tuple[slice, slice]: ...
+@overload
+def make_index(
+    *, is_csc: Literal[False], stride: int, shape: tuple[int, int], block_id: int
+) -> tuple[slice]: ...
+
+
+def make_index(
+    *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: int
+) -> tuple[slice, slice] | tuple[slice]:
     index1d = slice(
         block_id[is_csc] * stride,
         min((block_id[is_csc] * stride) + stride, shape[0]),
     )
     if is_csc:
@@ -47,6 +60,8 @@ def read_sparse_as_dask(elem, _reader, stride: int = 100):
     shape = elem.attrs["shape"]
     dtype = elem["data"].dtype
     is_csc = elem.attrs["encoding-type"] == "csc_matrix"
+    major_index = int(is_csc)
+    minor_index = int(not is_csc)
 
     def make_dask_chunk(block_id=None):
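For context on the comment about `dask.distributed` that keeps reappearing above: distributed ships tasks to workers by pickling them, and h5py handles refuse to be pickled, which is why the sparse reader reopens the file by path inside every chunk task. A quick demonstration of that behavior (throwaway file name, assuming h5py's usual error message):

```python
import pickle

import h5py

with h5py.File("scratch.h5", "w") as f:  # hypothetical throwaway file
    f["x"] = [1, 2, 3]

with h5py.File("scratch.h5", "r") as f:
    try:
        pickle.dumps(f["x"])
    except TypeError as e:
        print(e)  # h5py objects cannot be pickled
```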
def make_index( - *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: int + *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int] ) -> tuple[slice, slice] | tuple[slice]: index1d = slice( block_id[is_csc] * stride, @@ -52,29 +60,32 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask(elem, _reader, stride: int = 100): +def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): import dask.array as da - filename_or_elem = elem.file.filename if isinstance(elem, H5Group) else elem - elem_name = elem.name if isinstance(elem, H5Group) else Path(elem.path).name - shape = elem.attrs["shape"] + path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem + elem_name = ( + elem.name if isinstance(elem, H5Group) else PurePosixPath(elem.path).name + ) + shape: tuple[int, int] = elem.attrs["shape"] dtype = elem["data"].dtype - is_csc = elem.attrs["encoding-type"] == "csc_matrix" - major_index = int(is_csc) - minor_index = int(not is_csc) + is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" - def make_dask_chunk(block_id=None): + def make_dask_chunk(block_id: tuple[int, int]): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - with maybe_open_h5(filename_or_elem, elem_name) as f: + with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - index = make_index(is_csc, stride, shape, block_id) + index = make_index( + is_csc=is_csc, stride=stride, shape=shape, block_id=block_id + ) chunk = mtx[index] return chunk - n_strides, rest = np.divmod(shape[major_index], stride) + shape_minor, shape_major = shape if is_csc else shape[::-1] + n_strides, rest = np.divmod(shape_major, stride) chunks_major = (stride,) * n_strides + (rest,) - chunks_minor = (shape[minor_index],) + chunks_minor = (shape_minor,) chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix da_mtx = da.map_blocks( From 419691ba363d3d8028dc7f1c018a46d28c84c211 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 10:17:46 +0200 Subject: [PATCH 026/138] (fix): use `chunks` kwarg --- src/anndata/_io/specs/lazy_methods.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index e1131b7e7..0f8bbbf58 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -98,11 +98,13 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) -def read_h5_array(elem, _reader, chunk_size: int = 1000): +def read_h5_array(elem, _reader, chunks: tuple[int] | None = None): import dask.array as da if not hasattr(elem, "chunks") or elem.chunks is None: - return da.from_array(elem, chunks=(chunk_size,) * len(elem.shape)) + if chunks is None: + chunks = (1000,) * len(elem.shape) + return da.from_array(elem, chunks=chunks) return da.from_array(elem) From fd2376afbefa7d69eb5bee7ed74466754560a3a7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 15:58:16 +0200 Subject: [PATCH 027/138] (feat): 
expose `chunks` as an option to `read_elem_as_dask` via `dataset_kwargs` --- src/anndata/_io/h5ad.py | 2 +- src/anndata/_io/specs/lazy_methods.py | 80 +++++++++++++++++++++++---- src/anndata/_io/specs/methods.py | 34 ++++++------ src/anndata/_io/specs/registry.py | 40 +++++++++++--- src/anndata/_io/zarr.py | 2 +- src/anndata/experimental/merge.py | 2 +- src/anndata/tests/helpers.py | 2 +- tests/test_backed_sparse.py | 2 +- tests/test_io_dispatched.py | 8 +-- tests/test_io_elementwise.py | 76 +++++++++++++++++++++---- 10 files changed, 192 insertions(+), 56 deletions(-) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 098c139de..d3a9ef028 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -236,7 +236,7 @@ def read_h5ad( with h5py.File(filename, "r") as f: - def callback(func, elem_name: str, elem, iospec): + def callback(func, elem_name: str, elem, dataset_kwargs, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 0f8bbbf58..6fbcb48ac 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -2,7 +2,8 @@ from contextlib import contextmanager from pathlib import Path, PurePosixPath -from typing import Literal, overload +from types import MappingProxyType +from typing import TYPE_CHECKING, Any, Literal, overload import h5py import numpy as np @@ -13,6 +14,9 @@ from .registry import _LAZY_REGISTRY, IOSpec +if TYPE_CHECKING: + from collections.abc import Mapping + @overload def make_index( @@ -56,11 +60,18 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): file.close() +_DEFAULT_STRIDE = 1000 + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): +def read_sparse_as_dask( + elem: H5Group | ZarrGroup, + _reader, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): import dask.array as da path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem @@ -71,6 +82,15 @@ def read_sparse_as_dask(elem: H5Group | ZarrGroup, _reader, stride: int = 100): dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" + chunks = dataset_kwargs.get("chunks", None) + stride: int = _DEFAULT_STRIDE + if chunks is not None: + if len(chunks) != 2: + raise ValueError("`chunks` must be a tuple of two integers") + if chunks[int(not is_csc)] != shape[int(not is_csc)]: + raise ValueError("Only the major axis can be chunked") + stride = chunks[int(is_csc)] + def make_dask_chunk(block_id: tuple[int, int]): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 @@ -84,32 +104,68 @@ def make_dask_chunk(block_id: tuple[int, int]): shape_minor, shape_major = shape if is_csc else shape[::-1] n_strides, rest = np.divmod(shape_major, stride) - chunks_major = (stride,) * n_strides + (rest,) + chunks_major = (stride,) * n_strides + if rest > 0: + chunks_major += (rest,) chunks_minor = (shape_minor,) - chunks = (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) + chunk_layout = ( + (chunks_minor, chunks_major) if 
is_csc else (chunks_major, chunks_minor) + ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix da_mtx = da.map_blocks( make_dask_chunk, dtype=dtype, - chunks=chunks, + chunks=chunk_layout, meta=memory_format((0, 0), dtype=np.float32), ) return da_mtx @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) -def read_h5_array(elem, _reader, chunks: tuple[int] | None = None): +def read_h5_array( + elem, + _reader, + chunks: tuple[int] | None = None, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), +): import dask.array as da - if not hasattr(elem, "chunks") or elem.chunks is None: - if chunks is None: - chunks = (1000,) * len(elem.shape) - return da.from_array(elem, chunks=chunks) - return da.from_array(elem) + path = Path(elem.file.filename) + elem_name = elem.name + shape = elem.shape + dtype = elem.dtype + chunks: tuple[int, ...] = dataset_kwargs.get( + "chunks", (_DEFAULT_STRIDE,) * len(shape) + ) + + def make_dask_chunk(block_id: tuple[int, int]): + with maybe_open_h5(path, elem_name) as f: + idx = () + for i in range(len(shape)): + start = block_id[i] * chunks[i] + stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i]) + idx += (slice(start, stop),) + return f[*idx] + + chunk_layout = () + for i in range(len(shape)): + n_strides, rest = np.divmod(shape[i], chunks[i]) + chunk = (chunks[i],) * n_strides + if rest > 0: + chunk += (rest,) + chunk_layout += (chunk,) + + return da.map_blocks( + make_dask_chunk, + dtype=dtype, + chunks=chunk_layout, + ) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) -def read_zarr_array(elem, _reader): +def read_zarr_array( + elem, _reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}) +): import dask.array as da return da.from_zarr(elem) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 85bf6dddc..acea99bbf 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -109,7 +109,7 @@ def wrapper( @_REGISTRY.register_read(H5File, IOSpec("", "")) @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) -def read_basic(elem, _reader): +def read_basic(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata._io import h5ad warn( @@ -129,7 +129,7 @@ def read_basic(elem, _reader): @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) -def read_basic_zarr(elem, _reader): +def read_basic_zarr(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata._io import zarr warn( @@ -265,7 +265,7 @@ def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem, _reader): +def read_anndata(elem, _reader, dataset_kwargs=MappingProxyType({})): d = {} for k in [ "X", @@ -300,7 +300,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem, _reader): +def read_mapping(elem, _reader, dataset_kwargs=MappingProxyType({})): return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -374,7 +374,7 @@ def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) 
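The final patch below extends the `map_blocks`/`block_id` pattern from sparse matrices to dense HDF5 arrays. A toy version of that pattern, independent of anndata (all names are local to this sketch):

```python
import dask.array as da
import numpy as np

chunk_layout = ((3, 3, 2), (4,))  # an 8 x 4 array, chunked along axis 0

def make_chunk(block_id=None):
    # dask passes the block coordinates; turn them into row offsets
    start = sum(chunk_layout[0][: block_id[0]])
    n_rows = chunk_layout[0][block_id[0]]
    return np.arange(start, start + n_rows)[:, None] * np.ones((1, 4))

arr = da.map_blocks(make_chunk, dtype=float, chunks=chunk_layout)
assert arr.shape == (8, 4)
print(arr.compute()[:, 0])  # [0. 1. 2. 3. 4. 5. 6. 7.]
```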
@_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem, _reader): +def read_array(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem[()] @@ -391,7 +391,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d, _reader): +def read_string_array(d, _reader, dataset_kwargs=MappingProxyType({})): return read_array(d.asstr(), _reader=_reader) @@ -460,7 +460,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader): +def read_recarray(d, _reader, dataset_kwargs=MappingProxyType({})): value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -620,7 +620,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem, _reader): +def read_sparse(elem, _reader, dataset_kwargs=MappingProxyType({})): return sparse_dataset(elem).to_memory() @@ -658,7 +658,7 @@ def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem, _reader): +def read_awkward(elem, _reader, dataset_kwargs=MappingProxyType({})): from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -720,7 +720,7 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem, _reader): +def read_dataframe(elem, _reader, dataset_kwargs=MappingProxyType({})): columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -761,7 +761,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem, _reader): +def read_dataframe_0_1_0(elem, _reader, dataset_kwargs=MappingProxyType({})): columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -825,7 +825,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): +def read_categorical(elem, _reader, dataset_kwargs=MappingProxyType({})): return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -869,7 +869,7 @@ def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) -def read_nullable_integer(elem, _reader): +def read_nullable_integer(elem, _reader, dataset_kwargs=MappingProxyType({})): if "mask" in elem: 
return pd.arrays.IntegerArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -880,7 +880,7 @@ def read_nullable_integer(elem, _reader): @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) -def read_nullable_boolean(elem, _reader): +def read_nullable_boolean(elem, _reader, dataset_kwargs=MappingProxyType({})): if "mask" in elem: return pd.arrays.BooleanArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -896,7 +896,7 @@ def read_nullable_boolean(elem, _reader): @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem, _reader): +def read_scalar(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem[()] @@ -929,12 +929,12 @@ def write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem, _reader): +def read_hdf5_string(elem, _reader, dataset_kwargs=MappingProxyType({})): return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem, _reader): +def read_zarr_string(elem, _reader, dataset_kwargs=MappingProxyType({})): return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 21f52beac..6bf2a1964 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -1,10 +1,12 @@ from __future__ import annotations +import inspect +import warnings from collections.abc import Mapping from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, TypedDict from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata.compat import DaskArray, _read_attr @@ -241,6 +243,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> Any: """Read an element from a store. See exported function for more details.""" from functools import partial @@ -251,8 +254,16 @@ def read_elem( _reader=self, ) if self.callback is None: - return read_func(elem) - return self.callback(read_func, elem.name, elem, iospec=iospec) + return read_func(elem, dataset_kwargs=dataset_kwargs) + if "dataset_kwargs" not in inspect.getfullargspec(self.callback)[0]: + warnings.warn( + "Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + stacklevel=2, + ) + return self.callback(read_func, elem.name, elem, iospec=iospec) + return self.callback( + read_func, elem.name, elem, dataset_kwargs=dataset_kwargs, iospec=iospec + ) class Writer: @@ -331,19 +342,34 @@ def read_elem(elem: StorageType) -> Any: return Reader(_REGISTRY).read_elem(elem) -def read_elem_as_dask(elem: StorageType) -> DaskArray: +class DaskKwargs(TypedDict): + chunks: tuple[int, ...] + + +def read_elem_as_dask( + elem: StorageType, dataset_kwargs: DaskKwargs | None = None +) -> DaskArray: """ Read an element from a store lazily. Assumes that the element is encoded using the anndata encoding. This function will determine the encoded type using the encoding metadata stored in elem's attributes. - Params - ------ + + Parameters + ---------- elem The stored element. 
+    dataset_kwargs, optional
+        Keyword arguments for dask array creation. Only `chunks` is supported: a tuple of `n` integers, where `n` is the number of dimensions of the array.
+
+    Returns
+    -------
+    DaskArray
     """
-    return Reader(_LAZY_REGISTRY).read_elem(elem)
+    return Reader(_LAZY_REGISTRY).read_elem(
+        elem, dataset_kwargs=dataset_kwargs if dataset_kwargs is not None else {}
+    )


 def write_elem(
diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py
index 0e015244a..9d6f759ff 100644
--- a/src/anndata/_io/zarr.py
+++ b/src/anndata/_io/zarr.py
@@ -66,7 +66,7 @@ def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData:
     f = zarr.open(store, mode="r")

     # Read with handling for backwards compat
-    def callback(func, elem_name: str, elem, iospec):
+    def callback(func, elem_name: str, elem, dataset_kwargs, iospec):
         if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
             return AnnData(
                 **{
diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py
index aa6f47e9b..f998d6c79 100644
--- a/src/anndata/experimental/merge.py
+++ b/src/anndata/experimental/merge.py
@@ -130,7 +130,7 @@ def read_as_backed(group: ZarrGroup | H5Group):
     BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered.
     """

-    def callback(func, elem_name: str, elem, iospec):
+    def callback(func, elem_name: str, elem, dataset_kwargs, iospec):
         if iospec.encoding_type in SPARSE_MATRIX:
             return sparse_dataset(elem)
         elif iospec.encoding_type in EAGER_TYPES:
diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py
index d4b9a38be..91d8cdbcd 100644
--- a/src/anndata/tests/helpers.py
+++ b/src/anndata/tests/helpers.py
@@ -494,7 +494,7 @@ def assert_equal_cupy(a, b, exact=False, elem_name=None):
 def assert_equal_ndarray(a, b, exact=False, elem_name=None):
     b = asarray(b)
     if not exact and is_numeric_dtype(a) and is_numeric_dtype(b):
-        assert a.shape == b.shape, format_msg(elem_name)
+        assert a.shape == b.shape, (a.shape, b.shape)
         np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name))
     elif (  # Structured dtype
         not exact
diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py
index cc0468230..7538cc121 100644
--- a/tests/test_backed_sparse.py
+++ b/tests/test_backed_sparse.py
@@ -64,7 +64,7 @@ def read_zarr_backed(path):
     f = zarr.open(path, mode="r")

     # Read with handling for backwards compat
-    def callback(func, elem_name, elem, iospec):
+    def callback(func, elem_name, elem, iospec, dataset_kwargs):
         if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
             return AnnData(
                 **{k: read_dispatched(v, callback) for k, v in elem.items()}
diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 833b23e83..c091fa8ac 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -18,7 +18,7 @@


 def test_read_dispatched_w_regex():
-    def read_only_axis_dfs(func, elem_name: str, elem, iospec):
+    def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs):
         if iospec.encoding_type == "anndata":
             return func(elem)
         elif re.match(r"^/((obs)|(var))?(/.*)?$", elem_name):
@@ -40,7 +40,7 @@ def read_only_axis_dfs(func, elem_name: str, elem, iospec):
 def test_read_dispatched_dask():
     import dask.array as da

-    def read_as_dask_array(func, elem_name: str, elem, iospec):
+    def read_as_dask_array(func, elem_name: str, elem, iospec, dataset_kwargs):
         if iospec.encoding_type in {
             "dataframe",
             "csr_matrix",
@@ -162,11 +162,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec):
         zarr_write_keys.append(k)
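# A hypothetical round trip through `read_elem_as_dask` as it is defined at
# this point in the series (a later patch below replaces `dataset_kwargs` with
# a plain `chunks` argument); the file name and sizes are illustrative.
import h5py
import numpy as np
from anndata._io.specs import write_elem
from anndata._io.specs.registry import read_elem_as_dask

with h5py.File("demo.h5", "w") as f:
    write_elem(f["/"], "X", np.random.randn(250, 50))

with h5py.File("demo.h5", "r") as f:
    X = read_elem_as_dask(f["X"], dataset_kwargs={"chunks": (100, 50)})
    print(X.chunks)      # ((100, 100, 50), (50,)), per the layout loop above
    X_mem = X.compute()  # each block reopens the file by path, as in make_dask_chunk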
func(store, k, elem, dataset_kwargs=dataset_kwargs) - def h5ad_reader(func, elem_name: str, elem, iospec): + def h5ad_reader(func, elem_name: str, elem, iospec, dataset_kwargs): h5ad_read_keys.append(elem_name) return func(elem) - def zarr_reader(func, elem_name: str, elem, iospec): + def zarr_reader(func, elem_name: str, elem, iospec, dataset_kwargs): zarr_read_keys.append(elem_name) return func(elem) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 204b4734f..5927536a4 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -64,8 +64,8 @@ def sparse_format(request): return request.param -def create_dense_store(store): - X = np.random.randn(SIZE, SIZE) +def create_dense_store(store, n_dims: int = 2): + X = np.random.randn(*((SIZE,) * n_dims)) write_elem(store, "X", X) return store @@ -215,12 +215,8 @@ def test_dask_write_sparse(sparse_format, store): assert x_sparse_store["X_dask/indices"].dtype == np.int64 -@pytest.mark.parametrize("arr_type", ["dense", *sparse_formats]) -def test_read_lazy_2d_dask(arr_type, store): - if arr_type == "dense": - arr_store = create_dense_store(store) - else: - arr_store = create_sparse_store(arr_type, store) +def test_read_lazy_2d_dask(sparse_format, store): + arr_store = create_sparse_store(sparse_format, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"]) X_from_disk = read_elem(arr_store["X"]) @@ -233,9 +229,28 @@ def test_read_lazy_2d_dask(arr_type, store): assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :]) assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index]) - if arr_type in {"csr", "csc"}: - assert arr_store["X_dask/indptr"].dtype == np.int64 - assert arr_store["X_dask/indices"].dtype == np.int64 + assert arr_store["X_dask/indptr"].dtype == np.int64 + assert arr_store["X_dask/indices"].dtype == np.int64 + + +@pytest.mark.parametrize( + ("n_dims", "chunks"), + [(1, (100,)), (1, (400,)), (2, (100, 100)), (2, (400, 400)), (2, (200, 400))], +) +def test_read_lazy_nd_dask(store, n_dims, chunks): + arr_store = create_dense_store(store, n_dims) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + X_from_disk = read_elem(arr_store["X"]) + assert_equal(X_from_disk, X_dask_from_disk) + + random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) + random_int_indices.sort() + random_bool_mask = np.random.randn(SIZE) > 0 + index_slice = slice(0, SIZE // 10) + for index in [random_int_indices, index_slice, random_bool_mask]: + assert_equal(X_from_disk[index], X_dask_from_disk[index]) def test_read_lazy_h5_cluster(sparse_format, tmp_path): @@ -254,6 +269,45 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): assert_equal(X_from_disk, X_dask_from_disk) +@pytest.mark.parametrize( + ("arr_type", "chunks"), + [("dense", (100, 100)), ("csc", (SIZE, 10)), ("csr", (10, SIZE))], +) +def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): + import dask.distributed as dd + + file = h5py.File(tmp_path / "test.h5", "w") + store = file["/"] + if arr_type == "dense": + arr_store = create_dense_store(store) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + else: + arr_store = create_sparse_store(arr_type, store) + X_dask_from_disk = read_elem_as_dask( + arr_store["X"], dataset_kwargs=dict(chunks=chunks) + ) + X_from_disk = read_elem(arr_store["X"]) + file.close() + with ( + dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, + dd.Client(cluster) as _client, + ): + 
assert_equal(X_from_disk, X_dask_from_disk) + + +def test_read_lazy_h5_bad_chunk_kwargs(tmp_path): + arr_type = "csr" + file = h5py.File(tmp_path / "test.h5", "w") + store = file["/"] + arr_store = create_sparse_store(arr_type, store) + with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"): + read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE,))) + with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): + read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE, 10))) + + @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) def test_write_indptr_dtype_override(store, sparse_format): X = sparse.random( From 42b10938d77a061bff98c50a5efc76c6192f9c9e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 16:50:14 +0200 Subject: [PATCH 028/138] (fix): `test_read_dispatched_null_case` test --- tests/test_io_dispatched.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index c091fa8ac..5dbb1229c 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -76,7 +76,7 @@ def test_read_dispatched_null_case(): write_elem(z, "/", adata) expected = read_elem(z) - actual = read_dispatched(z, lambda _, __, x, **___: read_elem(x)) + actual = read_dispatched(z, lambda _, __, x, ___, ____: read_elem(x)) assert_equal(expected, actual) From 78de057b3ab6786531b11f66b4bd0963485310cf Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 16:51:07 +0200 Subject: [PATCH 029/138] (fix): disallowed spread syntax? --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 6fbcb48ac..b977cd5ef 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -145,7 +145,7 @@ def make_dask_chunk(block_id: tuple[int, int]): start = block_id[i] * chunks[i] stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i]) idx += (slice(start, stop),) - return f[*idx] + return f[idx] chunk_layout = () for i in range(len(shape)): From 717b997d0e33ddae066f72cc6495cdb64b88d175 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 2 Jul 2024 16:56:48 +0200 Subject: [PATCH 030/138] (refactor): reuse `compute_chunk_layout_for_axis_shape` functionality --- src/anndata/_io/specs/lazy_methods.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index b977cd5ef..596c07575 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -63,6 +63,16 @@ def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): _DEFAULT_STRIDE = 1000 +def compute_chunk_layout_for_axis_shape( + chunk_axis_shape: int, full_axis_shape: int +) -> tuple[int, ...]: + n_strides, rest = np.divmod(full_axis_shape, chunk_axis_shape) + chunk = (chunk_axis_shape,) * n_strides + if rest > 0: + chunk += (rest,) + return chunk + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -103,10 +113,7 @@ def make_dask_chunk(block_id: tuple[int, int]): return chunk shape_minor, shape_major = shape if is_csc else shape[::-1] - n_strides, rest = np.divmod(shape_major, stride) - chunks_major = (stride,) * 
n_strides - if rest > 0: - chunks_major += (rest,) + chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) chunks_minor = (shape_minor,) chunk_layout = ( (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) @@ -147,13 +154,10 @@ def make_dask_chunk(block_id: tuple[int, int]): idx += (slice(start, stop),) return f[idx] - chunk_layout = () - for i in range(len(shape)): - n_strides, rest = np.divmod(shape[i], chunks[i]) - chunk = (chunks[i],) * n_strides - if rest > 0: - chunk += (rest,) - chunk_layout += (chunk,) + chunk_layout = tuple( + compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) + for i in range(len(shape)) + ) return da.map_blocks( make_dask_chunk, From 2b86293dce90780d4d98f09f51e2fa2eec541813 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:09:14 +0200 Subject: [PATCH 031/138] (fix): remove unneeded `slice` arguments --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 596c07575..a3a29ac15 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -44,7 +44,7 @@ def make_index( min((block_id[is_csc] * stride) + stride, shape[0]), ) if is_csc: - return (slice(None, None, None), index1d) + return (slice(None), index1d) return (index1d,) From 8d5a9df7c5a0d11b50fba7bb1386f33cf41efb87 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:09:24 +0200 Subject: [PATCH 032/138] (fix): revert message --- src/anndata/tests/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 91d8cdbcd..d4b9a38be 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -494,7 +494,7 @@ def assert_equal_cupy(a, b, exact=False, elem_name=None): def assert_equal_ndarray(a, b, exact=False, elem_name=None): b = asarray(b) if not exact and is_numeric_dtype(a) and is_numeric_dtype(b): - assert a.shape == b.shape, (a.shape, b.shape) + assert a.shape == b.shape, format_msg(elem_name) np.testing.assert_allclose(a, b, equal_nan=True, err_msg=format_msg(elem_name)) elif ( # Structured dtype not exact From 449fc1a6f7cd3234d910c2cd4e2e2edbf2e87826 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 10:10:22 +0200 Subject: [PATCH 033/138] (refactor): `make_index` -> `make_block_indexer` --- src/anndata/_io/specs/lazy_methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a3a29ac15..db2f35dfc 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,7 +19,7 @@ @overload -def make_index( +def make_block_indexer( *, is_csc: Literal[True], stride: int, @@ -27,7 +27,7 @@ def make_index( block_id: tuple[int, int], ) -> tuple[slice, slice]: ... @overload -def make_index( +def make_block_indexer( *, is_csc: Literal[False], stride: int, @@ -36,7 +36,7 @@ def make_index( ) -> tuple[slice]: ... 
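# A small, runnable mirror of the block-indexer logic renamed in this patch,
# with illustrative shapes, showing which axes end up sliced for CSR versus
# CSC storage.
def block_indexer(*, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int]):
    index1d = slice(
        block_id[is_csc] * stride,
        min(block_id[is_csc] * stride + stride, shape[0]),
    )
    return (slice(None), index1d) if is_csc else (index1d,)

# CSR: only the major (row) axis is sliced per block
assert block_indexer(is_csc=False, stride=100, shape=(250, 250), block_id=(2, 0)) == (slice(200, 250),)
# CSC: a full row slice is prepended, so columns act as the major axis
assert block_indexer(is_csc=True, stride=100, shape=(250, 250), block_id=(0, 1)) == (slice(None), slice(100, 200))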
-def make_index(
+def make_block_indexer(
     *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int]
 ) -> tuple[slice, slice] | tuple[slice]:
     index1d = slice(
@@ -106,7 +106,7 @@ def make_dask_chunk(block_id: tuple[int, int]):
         # https://github.com/scverse/anndata/issues/1105
         with maybe_open_h5(path_or_group, elem_name) as f:
             mtx = ad.experimental.sparse_dataset(f)
-            index = make_index(
+            index = make_block_indexer(
                 is_csc=is_csc, stride=stride, shape=shape, block_id=block_id
             )
             chunk = mtx[index]

From 1522de334c517a79df9f9bab032568366e2e8fac Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:11:20 +0200
Subject: [PATCH 034/138] (fix): export from `experimental`

---
 src/anndata/experimental/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py
index 486f14e8d..993e26b28 100644
--- a/src/anndata/experimental/__init__.py
+++ b/src/anndata/experimental/__init__.py
@@ -1,7 +1,7 @@
 from __future__ import annotations

 from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset
-from anndata._io.specs import IOSpec, read_elem, write_elem
+from anndata._io.specs import IOSpec, read_elem, read_elem_as_dask, write_elem

 from ._dispatch_io import read_dispatched, write_dispatched
 from .merge import concat_on_disk
@@ -13,6 +13,7 @@
     "AnnLoader",
     "read_elem",
     "write_elem",
+    "read_elem_as_dask",
     "read_dispatched",
     "write_dispatched",
     "IOSpec",

From 71c150da759a2444826d0caceab803621a7ddd7a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:20:47 +0200
Subject: [PATCH 035/138] (fix): `callback` signature for `test_read_dispatched_null_case`

---
 tests/test_io_dispatched.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 5dbb1229c..75f6b0033 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -76,7 +76,11 @@ def test_read_dispatched_null_case():
     write_elem(z, "/", adata)

     expected = read_elem(z)
-    actual = read_dispatched(z, lambda _, __, x, ___, ____: read_elem(x))
+
+    def callback(read_func, elem_name, x, dataset_kwargs, iospec):
+        return read_elem(x)
+
+    actual = read_dispatched(z, callback)

     assert_equal(expected, actual)

From b441366a7d16728aa2f6cbe12eadaa6c7c7ea292 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:23:53 +0200
Subject: [PATCH 036/138] (chore): `get_elem_name` helper

---
 src/anndata/_io/specs/lazy_methods.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index db2f35dfc..83a19aab1 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from contextlib import contextmanager
+from functools import singledispatch
 from pathlib import Path, PurePosixPath
 from types import MappingProxyType
 from typing import TYPE_CHECKING, Any, Literal, overload
@@ -73,6 +74,21 @@ def compute_chunk_layout_for_axis_shape(
     return chunk


+@singledispatch
+def get_elem_name(x):
+    raise NotImplementedError(f"Not implemented for {type(x)}")
+
+
+@get_elem_name.register(h5py.Group)
+def _(x):
+    return x.name
+
+
+@get_elem_name.register(ZarrArray)
+def _(x):
+    return PurePosixPath(x.path).name
+
+
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
@@ -85,9 +101,7 @@ def read_sparse_as_dask(
     import dask.array as da

     path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem
-    elem_name = (
-        elem.name if isinstance(elem, H5Group) else PurePosixPath(elem.path).name
-    )
+    elem_name = get_elem_name(elem)
     shape: tuple[int, int] = elem.attrs["shape"]
     dtype = elem["data"].dtype
     is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix"

From 0307a1dde3c8cab5d6d0f5d7363ee488902e5c74 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:24:34 +0200
Subject: [PATCH 037/138] (chore): use `H5Group` consistently

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 83a19aab1..4cc9b3d97 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -79,7 +79,7 @@ def get_elem_name(x):
     raise NotImplementedError(f"Not implemented for {type(x)}")


-@get_elem_name.register(h5py.Group)
+@get_elem_name.register(H5Group)
 def _(x):
     return x.name

From ee075cd353443c088b2bf2423c1c1ed9269ca5c9 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:40:57 +0200
Subject: [PATCH 038/138] (refactor): make `chunks` public facing API instead of `dataset_kwargs`

---
 src/anndata/_io/specs/registry.py | 17 ++++++-----------
 tests/test_io_elementwise.py | 16 +++++-----------
 2 files changed, 11 insertions(+), 22 deletions(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 6bf2a1964..f5fee7f27 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -6,7 +6,7 @@
 from dataclasses import dataclass
 from functools import singledispatch, wraps
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, TypedDict
+from typing import TYPE_CHECKING, Any

 from anndata._io.utils import report_read_key_on_error, report_write_key_on_error
 from anndata.compat import DaskArray, _read_attr
@@ -342,12 +342,8 @@ def read_elem(elem: StorageType) -> Any:
     return Reader(_REGISTRY).read_elem(elem)


-class DaskKwargs(TypedDict):
-    chunks: tuple[int, ...]
-
-
 def read_elem_as_dask(
-    elem: StorageType, dataset_kwargs: DaskKwargs | None = None
+    elem: StorageType, chunks: tuple[int, ...] | None = None
 ) -> DaskArray:
     """
     Read an element from a store lazily.

     Assumes that the element is encoded using the anndata encoding. This function will
     determine the encoded type using the encoding metadata stored in elem's attributes.

     Parameters
     ----------
     elem
         The stored element.
-    dataset_kwargs, optional
-        Keyword arguments for dask array creation. Only `chunks` is supported: a tuple of `n` integers, where `n` is the number of dimensions of the array.
+    chunks, optional
+        A tuple of length `n`, where `n` is the number of dimensions of the underlying array.
+        Note that for sparse matrices, the chunk size along the minor axis must match that axis's full extent.
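# Hypothetical calls under the public `chunks` argument introduced in this
# patch, mirroring the constraints exercised by the tests below; the store
# layout and SIZE are illustrative.
import h5py
import numpy as np
from scipy import sparse
from anndata._io.specs import write_elem
from anndata._io.specs.registry import read_elem_as_dask

SIZE = 1000
with h5py.File("chunks_demo.h5", "w") as f:
    store = f["/"]
    write_elem(store, "dense", np.random.randn(SIZE, SIZE))
    write_elem(store, "csr", sparse.random(SIZE, SIZE, format="csr"))
    read_elem_as_dask(store["dense"], chunks=(100, 100))  # dense: any per-axis chunking
    read_elem_as_dask(store["csr"], chunks=(100, SIZE))   # sparse: minor axis must span the full shape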
     Returns
     -------
     DaskArray
     """
-    return Reader(_LAZY_REGISTRY).read_elem(
-        elem, dataset_kwargs=dataset_kwargs if dataset_kwargs is not None else {}
-    )
+    return Reader(_LAZY_REGISTRY).read_elem(elem, dataset_kwargs={"chunks": chunks})


 def write_elem(
diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index 5927536a4..692f21452 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -239,9 +239,7 @@ def test_read_lazy_nd_dask(store, n_dims, chunks):
     arr_store = create_dense_store(store, n_dims)
-    X_dask_from_disk = read_elem_as_dask(
-        arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-    )
+    X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     X_from_disk = read_elem(arr_store["X"])
     assert_equal(X_from_disk, X_dask_from_disk)

@@ -280,14 +278,10 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path):
     store = file["/"]
     if arr_type == "dense":
         arr_store = create_dense_store(store)
-        X_dask_from_disk = read_elem_as_dask(
-            arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-        )
+        X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     else:
         arr_store = create_sparse_store(arr_type, store)
-        X_dask_from_disk = read_elem_as_dask(
-            arr_store["X"], dataset_kwargs=dict(chunks=chunks)
-        )
+        X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     X_from_disk = read_elem(arr_store["X"])
     file.close()
     with (
@@ -303,9 +297,9 @@ def test_read_lazy_h5_bad_chunk_kwargs(tmp_path):
     store = file["/"]
     arr_store = create_sparse_store(arr_type, store)
     with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"):
-        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE,)))
+        read_elem_as_dask(arr_store["X"], chunks=(SIZE,))
     with pytest.raises(ValueError, match=r"Only the major axis can be chunked"):
-        read_elem_as_dask(arr_store["X"], dataset_kwargs=dict(chunks=(SIZE, 10)))
+        read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10))


 @pytest.mark.parametrize("sparse_format", ["csr", "csc"])

From 89acec41ad2999466f2033f1bd10930d28ac343e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 10:41:34 +0200
Subject: [PATCH 039/138] (fix): register for group not array

---
 src/anndata/_io/specs/lazy_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 4cc9b3d97..24b556dd7 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -84,7 +84,7 @@ def _(x):
     return x.name


-@get_elem_name.register(ZarrArray)
+@get_elem_name.register(ZarrGroup)
 def _(x):
     return PurePosixPath(x.path).name

From 48b763076ecacefea7801d6603697605aa71ed79 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 3 Jul 2024 11:22:16 +0200
Subject: [PATCH 040/138] (chore): add warning test

---
 tests/test_io_dispatched.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py
index 75f6b0033..521ff0ad9 100644
--- a/tests/test_io_dispatched.py
+++ b/tests/test_io_dispatched.py
@@ -3,6 +3,7 @@
 import re

 import h5py
+import pytest
 import zarr
 from scipy import sparse

@@ -85,6 +86,21 @@ def callback(read_func, elem_name, x, dataset_kwargs, iospec):
     assert_equal(expected, actual)


+def test_read_dispatched_warns_with_no_dataset_kwargs():
+    adata = gen_adata((100, 100))
+    z = zarr.group()
+    write_elem(z, "/", adata)
+
+    def callback(read_func, elem_name, x, iospec):
return read_elem(x) + + with pytest.warns( + UserWarning, + match="Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + ): + read_dispatched(z, callback) + + def test_write_dispatched_chunks(): from itertools import chain, repeat From 8712582a5def5426bd809319ac8da31050de13f8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 3 Jul 2024 11:48:10 +0200 Subject: [PATCH 041/138] (chore): make arg order consistent --- tests/test_io_dispatched.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 521ff0ad9..395e942c3 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -182,11 +182,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec): zarr_write_keys.append(k) func(store, k, elem, dataset_kwargs=dataset_kwargs) - def h5ad_reader(func, elem_name: str, elem, iospec, dataset_kwargs): + def h5ad_reader(func, elem_name: str, elem, dataset_kwargs, iospec): h5ad_read_keys.append(elem_name) return func(elem) - def zarr_reader(func, elem_name: str, elem, iospec, dataset_kwargs): + def zarr_reader(func, elem_name: str, elem, dataset_kwargs, iospec): zarr_read_keys.append(elem_name) return func(elem) From cda8aa77013ba55884510ef82355eeee5d067ff8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 12:20:38 +0200 Subject: [PATCH 042/138] (feat): add `callback` typing for `read_dispatched` --- src/anndata/_io/specs/methods.py | 55 ++++++++++++++++-------- src/anndata/_io/specs/registry.py | 30 +++++++++++-- src/anndata/experimental/_dispatch_io.py | 4 +- 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 85bf6dddc..fe2b1bda9 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -4,7 +4,7 @@ from functools import partial from itertools import product from types import MappingProxyType -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING, Any, Literal from warnings import warn import h5py @@ -26,6 +26,7 @@ CupyCSCMatrix, CupyCSRMatrix, DaskArray, + SpArray, ZarrArray, ZarrGroup, _decode_structured_array, @@ -33,11 +34,13 @@ _read_attr, ) -from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial +from .registry import _REGISTRY, IOSpec, Reader, read_elem, read_elem_partial if TYPE_CHECKING: from os import PathLike + from anndata._core.storage import StorageType + H5Array = h5py.Dataset H5Group = h5py.Group H5File = h5py.File @@ -109,7 +112,9 @@ def wrapper( @_REGISTRY.register_read(H5File, IOSpec("", "")) @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) -def read_basic(elem, _reader): +def read_basic( + elem: StorageType, _reader: Reader +) -> dict | np.ndarray[Any, Any] | np.ndarray | sparse.spmatrix | SpArray: from anndata._io import h5ad warn( @@ -129,7 +134,17 @@ def read_basic(elem, _reader): @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) -def read_basic_zarr(elem, _reader): +def read_basic_zarr( + elem: StorageType, _reader: Reader +) -> ( + dict + | Any + | np.ndarray[np.void] + | np.ndarray[Any, np.dtype[np.float64]] + | np.ndarray[Any, np.dtype[Any]] + | sparse.spmatrix + | SpArray +): from anndata._io import zarr warn( @@ -265,7 +280,7 @@ def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) 
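# Sketch of the backwards-compatibility behaviour the warning test above
# exercises: a callback written against the old four-argument signature still
# works, but the registry warns and drops `dataset_kwargs`. The element name
# and data here are illustrative.
import warnings
import numpy as np
import zarr
from anndata.experimental import read_dispatched, write_elem

z = zarr.group()
write_elem(z, "x", np.arange(10))

def old_style_callback(read_func, elem_name, elem, iospec):  # no dataset_kwargs
    return read_func(elem)

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    read_dispatched(z, old_style_callback)
assert any("Ignoring dataset_kwargs" in str(w.message) for w in caught)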
@_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem, _reader): +def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -300,7 +315,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem, _reader): +def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, Any]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -374,7 +389,7 @@ def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem, _reader): +def read_array(elem: StorageType, _reader: Reader) -> np.ndarray: return elem[()] @@ -460,7 +475,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader): +def read_recarray(d, _reader) -> np.recarray | np.ndarray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -620,7 +635,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem, _reader): +def read_sparse(elem: StorageType, _reader: Reader) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -658,7 +673,7 @@ def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem, _reader): +def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -720,7 +735,7 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem, _reader): +def read_dataframe(elem: StorageType, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -761,7 +776,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem, _reader): +def read_dataframe_0_1_0(elem: StorageType, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -825,7 +840,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): +def read_categorical(elem: StorageType, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( 
codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -869,7 +884,9 @@ def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({}) @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) -def read_nullable_integer(elem, _reader): +def read_nullable_integer( + elem: StorageType, _reader: Reader +) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -880,7 +897,9 @@ def read_nullable_integer(elem, _reader): @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) -def read_nullable_boolean(elem, _reader): +def read_nullable_boolean( + elem: StorageType, _reader: Reader +) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( _reader.read_elem(elem["values"]), mask=_reader.read_elem(elem["mask"]) @@ -896,7 +915,7 @@ def read_nullable_boolean(elem, _reader): @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem, _reader): +def read_scalar(elem: StorageType, _reader: Reader) -> np.number: return elem[()] @@ -929,12 +948,12 @@ def write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem, _reader): +def read_hdf5_string(elem: StorageType, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem, _reader): +def read_zarr_string(elem: StorageType, _reader: Reader) -> str: return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index a8357295d..80f612c5c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error from anndata.compat import _read_attr @@ -64,9 +64,17 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator +class reader(Protocol): + def __call__( + self, + elem: StorageType, + _reader: Reader, + ) -> Any: ... + + class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], reader] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], Callable @@ -232,8 +240,24 @@ def _iter_patterns( yield t +InMemoryType = TypeVar("InMemoryType") + + +class read_callback(Protocol): + def __call__( + self, + /, + read_func: Callable[[StorageType, Reader], InMemoryType], + elem_name: str, + elem: StorageType, + iospec: IOSpec, + ) -> InMemoryType: ... 
+
+
 class Reader:
-    def __init__(self, registry: IORegistry, callback: Callable | None = None) -> None:
+    def __init__(
+        self, registry: IORegistry, callback: read_callback | None = None
+    ) -> None:
         self.registry = registry
         self.callback = callback

diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py
index 2a399d540..86dc936a4 100644
--- a/src/anndata/experimental/_dispatch_io.py
+++ b/src/anndata/experimental/_dispatch_io.py
@@ -6,13 +6,13 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping

-    from anndata._io.specs import IOSpec
+    from anndata._io.specs.registry import read_callback
     from anndata._types import GroupStorageType, StorageType


 def read_dispatched(
     elem: StorageType,
-    callback: Callable[[Callable[[StorageType], Any], str, StorageType, IOSpec], Any],
+    callback: read_callback,
 ) -> Any:
     """
     Read elem, calling the callback at each sub-element.

From e8f62f44af517d55b19fcbfd2778809667dcea10 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:27:46 +0200
Subject: [PATCH 043/138] (chore): use `npt.NDArray`

---
 src/anndata/_io/specs/methods.py | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index fe2b1bda9..e752d3a78 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -39,6 +39,8 @@
 if TYPE_CHECKING:
     from os import PathLike

+    from numpy import typing as npt
+
     from anndata._core.storage import StorageType

 H5Array = h5py.Dataset
@@ -114,7 +116,7 @@ def wrapper(
 @_REGISTRY.register_read(H5Array, IOSpec("", ""))
 def read_basic(
     elem: StorageType, _reader: Reader
-) -> dict | np.ndarray[Any, Any] | np.ndarray | sparse.spmatrix | SpArray:
+) -> dict | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import h5ad

     warn(
@@ -136,15 +138,7 @@ def read_basic(
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> (
-    dict
-    | Any
-    | np.ndarray[np.void]
-    | np.ndarray[Any, np.dtype[np.float64]]
-    | np.ndarray[Any, np.dtype[Any]]
-    | sparse.spmatrix
-    | SpArray
-):
+) -> dict | Any | npt.NDArray | npt.NDArray[np.float64] | sparse.spmatrix | SpArray:
     from anndata._io import zarr

     warn(

From f6e48acfc10b47f15ca1109e4e691ebb841a3aa9 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:29:11 +0200
Subject: [PATCH 044/138] (fix): remove unnecessary union

---
 src/anndata/_io/specs/methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index e752d3a78..e7886f59f 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -138,7 +138,7 @@ def read_basic_zarr(
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> dict | Any | npt.NDArray | npt.NDArray[np.float64] | sparse.spmatrix | SpArray:
+) -> dict | Any | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import zarr

     warn(

From 4de3246638de21e79daea55db81b9d7fdc858b55 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:31:23 +0200
Subject: [PATCH 045/138] (chore): release note

---
 docs/release-notes/0.10.9.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md
index a0beab5f2..2e60dcf2d 100644
--- a/docs/release-notes/0.10.9.md
+++ b/docs/release-notes/0.10.9.md
@@ -10,6 +10,8 @@

 #### Documentation

+* add `callback` typing for {func}`~anndata.experimental.read_dispatched` and {func}`~anndata.experimental.write_dispatched` {pr}`1557` {user}`ilan-gold`
+
 #### Performance

 * Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold`

From ba817e0bec259beeeda3da6229855ff025335403 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 12:51:31 +0200
Subject: [PATCH 046/138] (fix): try protocol docs

---
 docs/conf.py | 1 +
 pyproject.toml | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/conf.py b/docs/conf.py
index ec253fc68..12ee4dd0f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -51,6 +51,7 @@
     "sphinx.ext.mathjax",
     "sphinx.ext.napoleon",
     "sphinx.ext.autosummary",
+    "sphinx_toolbox.more_autodoc.autoprotocol",
    "sphinx_autodoc_typehints",  # needs to be after napoleon
     "sphinx_issues",
     "sphinx_design",
diff --git a/pyproject.toml b/pyproject.toml
index 0ea5f8962..f6c3b8b09 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,6 +71,7 @@ doc = [
     "sphinx-autodoc-typehints>=1.11.0",
     "sphinx-issues",
     "sphinx-copybutton",
+    "sphinx-toolbox",
     "sphinxext.opengraph",
     "nbsphinx",
     "scanpydoc[theme,typehints] >=0.13.4",

From 438d28ddf749a52ca1e6ceb44381cc3bf7f742c3 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Fri, 5 Jul 2024 14:03:30 +0200
Subject: [PATCH 047/138] (feat): create `InMemoryElem` + `DictElemType` to remove `Any`

---
 src/anndata/_io/specs/methods.py | 9 ++++----
 src/anndata/_io/specs/registry.py | 31 +++++++++++++++++++++-----
 src/anndata/_types.py | 36 ++++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 10 deletions(-)

diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py
index e7886f59f..e5cd92337 100644
--- a/src/anndata/_io/specs/methods.py
+++ b/src/anndata/_io/specs/methods.py
@@ -4,7 +4,7 @@
 from functools import partial
 from itertools import product
 from types import MappingProxyType
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Literal
 from warnings import warn

 import h5py
@@ -42,6 +42,7 @@
     from numpy import typing as npt

     from anndata._core.storage import StorageType
+    from anndata._types import DictElemType

 H5Array = h5py.Dataset
 H5Group = h5py.Group
@@ -116,7 +117,7 @@ def wrapper(
 @_REGISTRY.register_read(H5Array, IOSpec("", ""))
 def read_basic(
     elem: StorageType, _reader: Reader
-) -> dict | npt.NDArray | sparse.spmatrix | SpArray:
+) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray:
     from anndata._io import h5ad

     warn(
@@ -138,7 +139,7 @@ def read_basic(
 @_REGISTRY.register_read(ZarrArray, IOSpec("", ""))
 def read_basic_zarr(
     elem: StorageType, _reader: Reader
-) -> dict | Any | npt.NDArray | sparse.spmatrix |
SpArray: +) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr warn( @@ -309,7 +310,7 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, Any]: +def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, DictElemType]: return {k: _reader.read_elem(v) for k, v in elem.items()} diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 80f612c5c..dfa43a4bc 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -6,13 +6,34 @@ from types import MappingProxyType from typing import TYPE_CHECKING, Any, Protocol, TypeVar +import numpy as np +import pandas as pd +from numpy import typing as npt +from scipy import sparse + +from anndata._core.anndata import AnnData from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata.compat import _read_attr +from anndata._types import DictElemType +from anndata.compat import SpArray, _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from anndata._types import GroupStorageType, StorageType + from anndata._core.storage import StorageType + from anndata._types import GroupStorageType + +InMemoryElem = ( + dict[str, DictElemType] + | npt.NDArray + | sparse.spmatrix + | SpArray + | AnnData + | pd.DataFrame + | pd.Categorical + | str + | np.number + | pd.api.extensions.ExtensionArray +) # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" @@ -69,7 +90,7 @@ def __call__( self, elem: StorageType, _reader: Reader, - ) -> Any: ... + ) -> InMemoryElem: ... class IORegistry: @@ -240,7 +261,7 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType") +InMemoryType = TypeVar("InMemoryType", bound=InMemoryElem) class read_callback(Protocol): @@ -266,7 +287,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> Any: + ) -> InMemoryElem: """Read an element from a store. 
See exported function for more details.""" from functools import partial diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 17dd014d5..357bf457c 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -6,7 +6,23 @@ from typing import Union -from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup +import numpy as np +import pandas as pd +from scipy import sparse + +from anndata._core.sparse_dataset import BaseCompressedSparseDataset +from anndata.compat import ( + AwkArray, + CupyArray, + CupySparseMatrix, + DaskArray, + H5Array, + H5Group, + SpArray, + ZappyArray, + ZarrArray, + ZarrGroup, +) __all__ = [ "ArrayStorageType", @@ -14,6 +30,24 @@ "StorageType", ] +DictElemType = ( + np.ndarray + | np.ma.MaskedArray + | sparse.spmatrix + | SpArray + | H5Array + | ZarrArray + | ZappyArray + | BaseCompressedSparseDataset + | DaskArray + | CupyArray + | CupySparseMatrix + | AwkArray + | pd.DataFrame + | np.number + | str +) + ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] StorageType = Union[ArrayStorageType, GroupStorageType] From 296ea3ff87a2a46acd4d10739fc6331e119394bd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:07:21 +0200 Subject: [PATCH 048/138] (chore): refactor `DictElemType` -> `InMemoryArrayOrScalarType` for reuse --- src/anndata/_io/specs/methods.py | 10 ++++++---- src/anndata/_io/specs/registry.py | 16 ++++------------ src/anndata/_types.py | 2 +- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index e5cd92337..ec54e55b8 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -42,7 +42,7 @@ from numpy import typing as npt from anndata._core.storage import StorageType - from anndata._types import DictElemType + from anndata._types import InMemoryArrayOrScalarType H5Array = h5py.Dataset H5Group = h5py.Group @@ -117,7 +117,7 @@ def wrapper( @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( elem: StorageType, _reader: Reader -) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray: +) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad warn( @@ -139,7 +139,7 @@ def read_basic( @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( elem: StorageType, _reader: Reader -) -> dict[str, DictElemType] | npt.NDArray | sparse.spmatrix | SpArray: +) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr warn( @@ -310,7 +310,9 @@ def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: StorageType, _reader: Reader) -> dict[str, DictElemType]: +def read_mapping( + elem: StorageType, _reader: Reader +) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index dfa43a4bc..5134ffab1 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -6,15 +6,12 @@ from types import MappingProxyType from typing import TYPE_CHECKING, Any, Protocol, TypeVar -import numpy as np import pandas as pd -from numpy import typing as npt -from scipy import sparse from anndata._core.anndata import AnnData from anndata._io.utils import 
report_read_key_on_error, report_write_key_on_error -from anndata._types import DictElemType -from anndata.compat import SpArray, _read_attr +from anndata._types import InMemoryArrayOrScalarType +from anndata.compat import _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable @@ -23,15 +20,10 @@ from anndata._types import GroupStorageType InMemoryElem = ( - dict[str, DictElemType] - | npt.NDArray - | sparse.spmatrix - | SpArray + dict[str, InMemoryArrayOrScalarType] + | InMemoryArrayOrScalarType | AnnData - | pd.DataFrame | pd.Categorical - | str - | np.number | pd.api.extensions.ExtensionArray ) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 357bf457c..21d235cdd 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,7 +30,7 @@ "StorageType", ] -DictElemType = ( +InMemoryArrayOrScalarType = ( np.ndarray | np.ma.MaskedArray | sparse.spmatrix From cf13a575e0f3c50c7e8ad8f5140c666aee3798c7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:12:20 +0200 Subject: [PATCH 049/138] (fix): use `Union` --- src/anndata/_types.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 21d235cdd..c08477e2b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,23 +30,23 @@ "StorageType", ] -InMemoryArrayOrScalarType = ( - np.ndarray - | np.ma.MaskedArray - | sparse.spmatrix - | SpArray - | H5Array - | ZarrArray - | ZappyArray - | BaseCompressedSparseDataset - | DaskArray - | CupyArray - | CupySparseMatrix - | AwkArray - | pd.DataFrame - | np.number - | str -) +InMemoryArrayOrScalarType = Union[ + np.typing.NDArray, + np.ma.MaskedArray, + sparse.spmatrix, + SpArray, + H5Array, + ZarrArray, + ZappyArray, + BaseCompressedSparseDataset, + DaskArray, + CupyArray, + CupySparseMatrix, + AwkArray, + pd.DataFrame, + np.number, + str, +] ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] From d02ba49f6689ce35f802b6c774ea9b5a2ea0b32e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:16:31 +0200 Subject: [PATCH 050/138] (fix): more `Union` --- src/anndata/_io/specs/registry.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 5134ffab1..3137240c2 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Protocol, TypeVar +from typing import TYPE_CHECKING, Any, Protocol, TypeVar, Union import pandas as pd @@ -19,13 +19,13 @@ from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryElem = ( - dict[str, InMemoryArrayOrScalarType] - | InMemoryArrayOrScalarType - | AnnData - | pd.Categorical - | pd.api.extensions.ExtensionArray -) +InMemoryElem = Union[ + dict[str, InMemoryArrayOrScalarType], + InMemoryArrayOrScalarType, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" From 6970a97d3cef2e903217053b9a713dc95fd959b9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 14:17:02 +0200 Subject: [PATCH 051/138] (refactor): `InMemoryElem` -> `InMemoryReadElem` --- src/anndata/_io/specs/registry.py | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 3137240c2..e57392792 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -19,7 +19,7 @@ from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryElem = Union[ +InMemoryReadElem = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, AnnData, @@ -82,7 +82,7 @@ def __call__( self, elem: StorageType, _reader: Reader, - ) -> InMemoryElem: ... + ) -> InMemoryReadElem: ... class IORegistry: @@ -253,7 +253,7 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType", bound=InMemoryElem) +InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) class read_callback(Protocol): @@ -279,7 +279,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> InMemoryElem: + ) -> InMemoryReadElem: """Read an element from a store. See exported function for more details.""" from functools import partial From 2282351956feda47b741895e240a882201acc9b3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 Jul 2024 15:26:10 +0200 Subject: [PATCH 052/138] (chore): add needed types to public export + docs fix --- docs/api.md | 3 ++- docs/conf.py | 2 +- pyproject.toml | 1 - src/anndata/experimental/__init__.py | 4 ++++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/api.md b/docs/api.md index fb8f40f93..496f4e0f3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -131,7 +131,8 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched experimental.IOSpec - + experimental.read_callback + experimental.StorageType ``` ## Errors and warnings diff --git a/docs/conf.py b/docs/conf.py index 12ee4dd0f..952791856 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,7 +51,6 @@ "sphinx.ext.mathjax", "sphinx.ext.napoleon", "sphinx.ext.autosummary", - "sphinx_toolbox.more_autodoc.autoprotocol", "sphinx_autodoc_typehints", # needs to be after napoleon "sphinx_issues", "sphinx_design", @@ -95,6 +94,7 @@ # TODO: sphinx’ builtin autodoc.typehints extension isn’t handled by `qualname_overrides` yet # https://github.com/theislab/scanpydoc/issues/140 ("py:class", "h5py._hl.group.Group"), + ("py:class", "h5py._hl.dataset.Dataset"), ] suppress_warnings = [ "ref.citation", diff --git a/pyproject.toml b/pyproject.toml index f6c3b8b09..0ea5f8962 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,6 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", - "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 486f14e8d..6b78e6433 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -3,6 +3,8 @@ from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset from anndata._io.specs import IOSpec, read_elem, write_elem +from .._core.storage import StorageType +from .._io.specs.registry import read_callback from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -20,4 +22,6 @@ "sparse_dataset", "CSRDataset", "CSCDataset", + "read_callback", + "StorageType", ] From a996081625fd44f2b6866c0c2739ff44e2ad6908 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 11:23:47 +0200 Subject: [PATCH 053/138] 
(chore): type `write_elem` functions --- src/anndata/_io/specs/methods.py | 170 ++++++++++++++++++++++++++----- 1 file changed, 144 insertions(+), 26 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index ec54e55b8..2be9e8964 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -34,7 +34,7 @@ _read_attr, ) -from .registry import _REGISTRY, IOSpec, Reader, read_elem, read_elem_partial +from .registry import _REGISTRY, IOSpec, Reader, Writer, read_elem, read_elem_partial if TYPE_CHECKING: from os import PathLike @@ -42,7 +42,7 @@ from numpy import typing as npt from anndata._core.storage import StorageType - from anndata._types import InMemoryArrayOrScalarType + from anndata._types import GroupStorageType, InMemoryArrayOrScalarType H5Array = h5py.Dataset H5Group = h5py.Group @@ -255,7 +255,13 @@ def _read_partial(group, *, items=None, indices=(slice(None), slice(None))): @_REGISTRY.register_write(ZarrGroup, AnnData, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_write(H5Group, AnnData, IOSpec("anndata", "0.1.0")) -def write_anndata(f, k, adata, _writer, dataset_kwargs=MappingProxyType({})): +def write_anndata( + f: GroupStorageType, + k: str, + adata: AnnData, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obs", adata.obs, dataset_kwargs=dataset_kwargs) @@ -296,7 +302,13 @@ def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: @_REGISTRY.register_write(H5Group, Raw, IOSpec("raw", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, Raw, IOSpec("raw", "0.1.0")) -def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})): +def write_raw( + f: GroupStorageType, + k: str, + raw: Raw, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) _writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "var", raw.var, dataset_kwargs=dataset_kwargs) @@ -318,7 +330,13 @@ def read_mapping( @_REGISTRY.register_write(H5Group, dict, IOSpec("dict", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, dict, IOSpec("dict", "0.1.0")) -def write_mapping(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_mapping( + f: GroupStorageType, + k: str, + v: dict, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) for sub_k, sub_v in v.items(): _writer.write_elem(g, sub_k, sub_v, dataset_kwargs=dataset_kwargs) @@ -331,7 +349,13 @@ def write_mapping(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(H5Group, list, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, list, IOSpec("array", "0.2.0")) -def write_list(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_list( + f: GroupStorageType, + k: str, + elem: list, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): _writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs) @@ -346,7 +370,13 @@ def write_list(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(ZarrGroup, h5py.Dataset, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.ma.MaskedArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, ZarrArray, IOSpec("array", "0.2.0")) -def write_basic(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic( + 
f: GroupStorageType, + k: str, + elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): """Write methods which underlying library handles natively.""" f.create_dataset(k, data=elem, **dataset_kwargs) @@ -360,7 +390,13 @@ def write_basic(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0")) -def write_basic_dask_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic_dask_zarr( + f: ZarrGroup, + k: str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): import dask.array as da g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs) @@ -370,7 +406,13 @@ def write_basic_dask_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({ # Adding this separately because h5py isn't serializable # https://github.com/pydata/xarray/issues/4242 @_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0")) -def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_basic_dask_h5( + f: H5Group, + k: str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): import dask.array as da import dask.config as dc @@ -420,7 +462,13 @@ def read_string_array_partial(d, items=None, indices=slice(None)): ) @_REGISTRY.register_write(H5Group, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(H5Group, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) -def write_vlen_string_array(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_vlen_string_array( + f: H5Group, + k: str, + elem: np.ndarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): """Write methods which underlying library handles natively.""" str_dtype = h5py.special_dtype(vlen=str) f.create_dataset(k, data=elem.astype(str_dtype), dtype=str_dtype, **dataset_kwargs) @@ -435,7 +483,11 @@ def write_vlen_string_array(f, k, elem, _writer, dataset_kwargs=MappingProxyType @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "U"), IOSpec("string-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "O"), IOSpec("string-array", "0.2.0")) def write_vlen_string_array_zarr( - f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) + f: ZarrGroup, + k: str, + elem: np.ndarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), ): import numcodecs @@ -483,13 +535,25 @@ def read_recarray(d, _reader) -> np.recarray | npt.NDArray: @_REGISTRY.register_write(H5Group, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(H5Group, np.recarray, IOSpec("rec-array", "0.2.0")) -def write_recarray(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_recarray( + f: H5Group, + k: str, + elem: np.ndarray | np.recarray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): f.create_dataset(k, data=_to_hdf5_vlen_strings(elem), **dataset_kwargs) @_REGISTRY.register_write(ZarrGroup, (np.ndarray, "V"), IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, np.recarray, IOSpec("rec-array", "0.2.0")) -def write_recarray_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_recarray_zarr( + f: ZarrGroup, + k: str, + elem: np.ndarray | np.recarray, + _writer: Writer, + dataset_kwargs: MappingProxyType =
MappingProxyType({}), +): from anndata.compat import _to_fixed_length_strings f.create_dataset(k, data=_to_fixed_length_strings(elem), **dataset_kwargs) @@ -501,10 +565,10 @@ def write_recarray_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}) def write_sparse_compressed( - f, - key, - value, - _writer, + f: GroupStorageType, + key: str, + value: sparse.spmatrix | SpArray, + _writer: Writer, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}), ): @@ -560,7 +624,13 @@ def write_sparse_compressed( @_REGISTRY.register_write(H5Group, CSCDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, CSRDataset, IOSpec("", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, CSCDataset, IOSpec("", "0.1.0")) -def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_sparse_dataset( + f: GroupStorageType, + k: str, + elem: CSCDataset | CSRDataset, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): write_sparse_compressed( f, k, @@ -586,7 +656,13 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_write( ZarrGroup, (DaskArray, sparse.csc_matrix), IOSpec("csc_matrix", "0.1.0") ) -def write_dask_sparse(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): +def write_dask_sparse( + f: GroupStorageType, + k: str, + elem: DaskArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): sparse_format = elem._meta.format def as_int64_indices(x): @@ -657,7 +733,13 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) @_REGISTRY.register_write( ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") ) -def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_awkward( + f: GroupStorageType, + k: str, + v: views.AwkwardArrayView | AwkArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): from anndata.compat import awkward as ak group = f.require_group(k) @@ -689,7 +771,13 @@ def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: @_REGISTRY.register_write(H5Group, pd.DataFrame, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, views.DataFrameView, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.DataFrame, IOSpec("dataframe", "0.2.0")) -def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})): +def write_dataframe( + f: GroupStorageType, + key: str, + df: views.DataFrameView | pd.DataFrame, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): # Check arguments for reserved in ("_index",): if reserved in df.columns: @@ -825,7 +913,13 @@ def read_partial_dataframe_0_1_0( @_REGISTRY.register_write(H5Group, pd.Categorical, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_write(ZarrGroup, pd.Categorical, IOSpec("categorical", "0.2.0")) -def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_categorical( + f: GroupStorageType, + k: str, + v: pd.Categorical, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) g.attrs["ordered"] = bool(v.ordered) @@ -872,7 +966,13 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)): @_REGISTRY.register_write( ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0") ) -def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): +def write_nullable_integer( + f: 
GroupStorageType, + k: str, + v: pd.arrays.IntegerArray | pd.arrays.BooleanArray, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): g = f.require_group(k) if v._mask is not None: _writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs) @@ -916,11 +1016,23 @@ def read_scalar(elem: StorageType, _reader: Reader) -> np.number: return elem[()] -def write_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({})): +def write_scalar( + f: GroupStorageType, + key: str, + value, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): return f.create_dataset(key, data=np.array(value), **dataset_kwargs) -def write_hdf5_scalar(f, key, value, _writer, dataset_kwargs=MappingProxyType({})): +def write_hdf5_scalar( + f: H5Group, + key: str, + value, + _writer: Writer, + dataset_kwargs: MappingProxyType = MappingProxyType({}), +): # Can’t compress scalars, error is thrown dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) @@ -960,7 +1072,13 @@ def read_zarr_string(elem: StorageType, _reader: Reader) -> str: @_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0")) @_REGISTRY.register_write(H5Group, str, IOSpec("string", "0.2.0")) -def write_string(f, k, v, _writer, dataset_kwargs): +def write_string( + f: GroupStorageType, + k: str, + v: np.str_ | str, + _writer: Writer, + dataset_kwargs: MappingProxyType, +): dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) dataset_kwargs.pop("compression_opts", None) From f6e457b7db9d02ec8693a46dfd568a365680c117 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 11:40:47 +0200 Subject: [PATCH 054/138] (chore): create `write_callback` protocol --- src/anndata/_io/specs/registry.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index e57392792..98035536d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -293,8 +293,24 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) +class write_callback(Protocol): + def __call__( + self, + /, + write_func: Callable[ + [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None + ], + store: GroupStorageType, + elem_name: str, + elem: InMemoryReadElem, + *, + iospec: IOSpec, + dataset_kwargs: MappingProxyType, + ) -> InMemoryType: ... 
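+# For orientation, an illustrative sketch of a conforming callback; the
+# name `skip_raw` and the skipped key are assumptions, and the pass-through
+# call form follows the `write_dispatched` tutorial:
+#
+#     def skip_raw(write_func, store, elem_name, elem, *, iospec, dataset_kwargs):
+#         if elem_name == "/raw":
+#             return  # skip writing this element entirely
+#         write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs)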
+ + class Writer: - def __init__(self, registry: IORegistry, callback: Callable | None = None): + def __init__(self, registry: IORegistry, callback: write_callback | None = None): self.registry = registry self.callback = callback From 4416526c3a295c527020d733a4527dd364ca6832 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:12:49 +0200 Subject: [PATCH 055/138] (chore): export + docs --- docs/api.md | 1 + src/anndata/experimental/__init__.py | 3 ++- src/anndata/experimental/_dispatch_io.py | 9 +++------ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/docs/api.md b/docs/api.md index 496f4e0f3..9eb57a0a9 100644 --- a/docs/api.md +++ b/docs/api.md @@ -132,6 +132,7 @@ Utilities for customizing the IO process: experimental.write_dispatched experimental.IOSpec experimental.read_callback + experimental.write_callback experimental.StorageType ``` diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 6b78e6433..e042d5e96 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,7 +4,7 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import read_callback +from .._io.specs.registry import read_callback, write_callback from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -23,5 +23,6 @@ "CSRDataset", "CSCDataset", "read_callback", + "write_callback", "StorageType", ] diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index c7c79df1a..549ca85c4 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -4,10 +4,10 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from collections.abc import Callable, Mapping + from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import read_callback + from anndata._io.specs.registry import read_callback, write_callback from anndata._types import GroupStorageType, StorageType @@ -53,10 +53,7 @@ def write_dispatched( store: GroupStorageType, key: str, elem: Any, - callback: Callable[ - [Callable[[StorageType, str, Any], None], GroupStorageType, str, Any, dict], - None, - ], + callback: write_callback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: From fbe44f0dc30c12833c896ef37f6c39a891fb7bd7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:29:08 +0200 Subject: [PATCH 056/138] (fix): add string descriptions --- src/anndata/_io/specs/registry.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f77471519..9366ce305 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,6 +267,13 @@ def __call__( iospec: IOSpec, ) -> InMemoryType: ... + """Callback used in {func}`anndata.experimental.read_dispatched` to customize reading an element from a store. + + Returns + ------- + The element read from the store. + """ + class Reader: def __init__( @@ -307,7 +314,9 @@ def __call__( *, iospec: IOSpec, dataset_kwargs: MappingProxyType, - ) -> InMemoryType: ... + ) -> None: ... 
+ + """Callback used in {func}`anndata.experimental.write_dispatched` to customize writing an element to a store.""" class Writer: From 8c1f01d7b100c471167a5449a767fe34f8c2fe9e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 12:33:52 +0200 Subject: [PATCH 057/138] (fix): try sphinx protocol doc --- docs/conf.py | 1 + pyproject.toml | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 952791856..223fae81d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -60,6 +60,7 @@ "sphinx.ext.linkcode", "nbsphinx", "IPython.sphinxext.ipython_console_highlighting", + "sphinx_toolbox.more_autodoc.autoprotocol", ] myst_enable_extensions = [ "html_image", # So README.md can be used on github and sphinx docs diff --git a/pyproject.toml b/pyproject.toml index 03a409c7e..f6c3b8b09 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", + "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", @@ -184,9 +185,6 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"] -[tool.ruff.lint.flake8-type-checking] -exempt-modules = [] -strict = true [tool.codespell] skip = ".git,*.pdf,*.svg" From a7d412a72e095572e5c56ec180ae6b7d9105976f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 17:07:52 +0200 Subject: [PATCH 058/138] (fix): try ignoring exports --- docs/conf.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 223fae81d..8eaa58dd5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -96,6 +96,13 @@ # https://github.com/theislab/scanpydoc/issues/140 ("py:class", "h5py._hl.group.Group"), ("py:class", "h5py._hl.dataset.Dataset"), + # for experimental callback exports + ("py:class", "anndata.compat.ZappyArray"), + ("py:class", "anndata.compat.DaskArray"), + ("py:class", "anndata.compat.CupyArray"), + ("py:class", "anndata.compat.CupySparseMatrix"), + ("py:class", "awkward.highlevel.Array"), + ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ] suppress_warnings = [ "ref.citation", From 4d56396c9952b0d75e16b7467c55f110d1b9cbdd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 17:51:33 +0200 Subject: [PATCH 059/138] (fix): remap callback internal usages --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 8eaa58dd5..aba358a71 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -132,6 +132,8 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", + "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", + "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", } # -- Social cards --------------------------------------------------------- From 2012ee5fc08c318e706104e31766c6d69935edd9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 Jul 2024 18:52:31 +0200 Subject: [PATCH 060/138] (fix): add docstring --- src/anndata/_io/specs/registry.py | 33 +++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 9366ce305..0b9834d2d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,7 +267,19 @@ def __call__( iospec: IOSpec, ) -> 
InMemoryType: ... - """Callback used in {func}`anndata.experimental.read_dispatched` to customize reading an element from a store. + """ + Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. + + Params + ------ + read_func + :func:`anndata.experimental.read_elem` function to call to read the current element given the :param:`iospec`. + elem_name + The key to read in from the group. + elem + The element to read from. + iospec + Internal AnnData encoding specification for the element. Returns ------- @@ -316,7 +328,24 @@ def __call__( dataset_kwargs: MappingProxyType, ) -> None: ... - """Callback used in {func}`anndata.experimental.write_dispatched` to customize writing an element to a store.""" + """ + Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. + + Params + ------ + write_func + :func:`anndata.experimental.write_elem` function to call to read the current element given the :param:`iospec`. + store + The store to which `elem` should be written. + elem_name + The key to read in from the group. + elem + The element to write out. + iospec + Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`. + """ class Writer: From f65f0652cee9a130bd01f91bee97644424ecf09c Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 9 Jul 2024 08:40:29 +0200 Subject: [PATCH 061/138] Discard changes to pyproject.toml --- pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f6c3b8b09..03a409c7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,6 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", - "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", @@ -185,6 +184,9 @@ ignore = [ [tool.ruff.lint.isort] known-first-party = ["anndata"] required-imports = ["from __future__ import annotations"] +[tool.ruff.lint.flake8-type-checking] +exempt-modules = [] +strict = true [tool.codespell] skip = ".git,*.pdf,*.svg" From 8f6ea498a1fff13b547630411ce103764ea82979 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 08:47:29 +0200 Subject: [PATCH 062/138] re-add dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 03a409c7e..813ccee62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,7 @@ doc = [ "sphinx-autodoc-typehints>=1.11.0", "sphinx-issues", "sphinx-copybutton", + "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", "scanpydoc[theme,typehints] >=0.13.4", From 155a21e69c6ab69f8c1c9d616c20bb73ad5a3727 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 10:31:55 +0200 Subject: [PATCH 063/138] Fix docs --- src/anndata/_io/specs/registry.py | 52 +++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 0b9834d2d..d3b26fb99 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -258,22 +258,13 @@ def _iter_patterns( class read_callback(Protocol): - def __call__( - self, - /, - read_func: Callable[[StorageType, Reader], InMemoryType], - elem_name: str, - elem: StorageType, - iospec: IOSpec, - ) -> InMemoryType: ... 
- """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. Params ------ read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the :param:`iospec`. + :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. elem_name The key to read in from the group. elem @@ -286,6 +277,15 @@ def __call__( The element read from the store. """ + def __call__( + self, + /, + read_func: Callable[[StorageType, Reader], InMemoryType], + elem_name: str, + elem: StorageType, + iospec: IOSpec, + ) -> InMemoryType: ... + class Reader: def __init__( @@ -314,27 +314,13 @@ def read_elem( class write_callback(Protocol): - def __call__( - self, - /, - write_func: Callable[ - [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None - ], - store: GroupStorageType, - elem_name: str, - elem: InMemoryReadElem, - *, - iospec: IOSpec, - dataset_kwargs: MappingProxyType, - ) -> None: ... - """ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. Params ------ write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the :param:`iospec`. + :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. store The store to which `elem` should be written. elem_name @@ -344,9 +330,23 @@ def __call__( iospec Internal AnnData encoding specification for the element. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :mod:`zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ + def __call__( + self, + /, + write_func: Callable[ + [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None + ], + store: GroupStorageType, + elem_name: str, + elem: InMemoryReadElem, + *, + iospec: IOSpec, + dataset_kwargs: MappingProxyType, + ) -> None: ... + class Writer: def __init__(self, registry: IORegistry, callback: write_callback | None = None): From daae3e548e4e7fe1da06777fe260da403684f24d Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 9 Jul 2024 11:11:14 +0200 Subject: [PATCH 064/138] Almost works --- docs/api.md | 4 ++ docs/conf.py | 5 ++ pyproject.toml | 2 +- src/anndata/_io/specs/registry.py | 88 ++++++++++++++-------------- src/anndata/_types.py | 10 +++- src/anndata/experimental/__init__.py | 13 +++- 6 files changed, 73 insertions(+), 49 deletions(-) diff --git a/docs/api.md b/docs/api.md index 9eb57a0a9..c05efb71b 100644 --- a/docs/api.md +++ b/docs/api.md @@ -131,6 +131,10 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched experimental.IOSpec + experimental.InMemoryReadElem + experimental.InMemoryArrayOrScalarType + experimental.Reader + experimental.Writer experimental.read_callback experimental.write_callback experimental.StorageType diff --git a/docs/conf.py b/docs/conf.py index aba358a71..0f4d24f0a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,6 +70,11 @@ # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" +autodoc_type_aliases = dict( + InMemoryReadElem="anndata.experimental.InMemoryReadElem", + InMemoryType="anndata.experimental.InMemoryArrayOrScalarType", + InMemoryArrayOrScalarType="anndata.experimental.InMemoryArrayOrScalarType", +) issues_github_path = "scverse/anndata" # autodoc_default_flags = ['members'] napoleon_google_docstring = False diff --git a/pyproject.toml b/pyproject.toml index 813ccee62..310f57fd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ dev = [ doc = [ "sphinx>=4.4", "sphinx-book-theme>=1.1.0", - "sphinx-autodoc-typehints>=1.11.0", + "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", "sphinx-copybutton", "sphinx-toolbox", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d3b26fb99..4c324a005 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, Union import pandas as pd @@ -15,12 +15,12 @@ if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any + from typing import Any, TypeAlias from anndata._core.storage import StorageType from anndata._types import GroupStorageType -InMemoryReadElem = Union[ +InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, AnnData, @@ -257,26 +257,7 @@ def _iter_patterns( InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) -class read_callback(Protocol): - """ - Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. - - Params - ------ - read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. - elem_name - The key to read in from the group. - elem - The element to read from. - iospec - Internal AnnData encoding specification for the element. - - Returns - ------- - The element read from the store. - """ - +class read_callback(Protocol, Generic[InMemoryType]): def __call__( self, /, @@ -284,7 +265,26 @@ def __call__( elem_name: str, elem: StorageType, iospec: IOSpec, - ) -> InMemoryType: ... + ) -> InMemoryType: + """ + Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. 
+ + Params + ------ + read_func + :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. + elem_name + The key to read in from the group. + elem + The element to read from. + iospec + Internal AnnData encoding specification for the element. + + Returns + ------- + The element read from the store. + """ + ... class Reader: @@ -314,25 +314,6 @@ def read_elem( class write_callback(Protocol): - """ - Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. - - Params - ------ - write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. - store - The store to which `elem` should be written. - elem_name - The key to read in from the group. - elem - The element to write out. - iospec - Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. - """ - def __call__( self, /, @@ -345,7 +326,26 @@ def __call__( *, iospec: IOSpec, dataset_kwargs: MappingProxyType, - ) -> None: ... + ) -> None: + """ + Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. + + Params + ------ + write_func + :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. + store + The store to which `elem` should be written. + elem_name + The key to read in from the group. + elem + The element to write out. + iospec + Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + """ + ... class Writer: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index c08477e2b..5827f5b6b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,10 +4,11 @@ from __future__ import annotations -from typing import Union +from typing import TYPE_CHECKING, Union import numpy as np import pandas as pd +from numpy.typing import NDArray from scipy import sparse from anndata._core.sparse_dataset import BaseCompressedSparseDataset @@ -24,14 +25,17 @@ ZarrGroup, ) +if TYPE_CHECKING: + from typing import TypeAlias + __all__ = [ "ArrayStorageType", "GroupStorageType", "StorageType", ] -InMemoryArrayOrScalarType = Union[ - np.typing.NDArray, +InMemoryArrayOrScalarType: TypeAlias = Union[ + NDArray, np.ma.MaskedArray, sparse.spmatrix, SpArray, diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index e042d5e96..af21c8e15 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,7 +4,14 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import read_callback, write_callback +from .._io.specs.registry import ( + InMemoryArrayOrScalarType, + InMemoryReadElem, + Reader, + Writer, + read_callback, + write_callback, +) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -22,6 +29,10 @@ "sparse_dataset", "CSRDataset", "CSCDataset", + "InMemoryReadElem", + "InMemoryArrayOrScalarType", + "Reader", + "Writer", "read_callback", "write_callback", "StorageType", From c415ae4f916f81d86c0109649d479d6ab730954f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 11:34:53 +0200 Subject: [PATCH 065/138] works! 
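The short version, as far as I can tell: `qualname_overrides` rewrites the private module paths to their public re-exports, while `autodoc_type_aliases` now maps each alias to an explicit reST role string, e.g.

    autodoc_type_aliases = dict(
        InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`",
    )

so the rendered annotation becomes a resolvable cross-reference instead of the unexported name. The numpy `_ScalarType_co` nitpick entry silences the one reference that still cannot resolve, which appears to come from the generic `NDArray`.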
--- docs/conf.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 0f4d24f0a..48a9ee4a9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,11 +70,6 @@ # Generate the API documentation when building autosummary_generate = True autodoc_member_order = "bysource" -autodoc_type_aliases = dict( - InMemoryReadElem="anndata.experimental.InMemoryReadElem", - InMemoryType="anndata.experimental.InMemoryArrayOrScalarType", - InMemoryArrayOrScalarType="anndata.experimental.InMemoryArrayOrScalarType", -) issues_github_path = "scverse/anndata" # autodoc_default_flags = ['members'] napoleon_google_docstring = False @@ -108,6 +103,7 @@ ("py:class", "anndata.compat.CupySparseMatrix"), ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), + ("py:obj", "numpy._typing._array_like._ScalarType_co"), ] suppress_warnings = [ "ref.citation", @@ -140,6 +136,12 @@ def setup(app: Sphinx): "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", } +autodoc_type_aliases = dict( + NDArray=":data:`~numpy.typing.NDArray`", + InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`", + InMemoryType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", + InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", +) # -- Social cards --------------------------------------------------------- From 00010b8b0dc09a249554f9ef241df35041413af0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:45:19 +0200 Subject: [PATCH 066/138] (chore): use pascal-case --- docs/conf.py | 4 +-- src/anndata/_io/specs/registry.py | 8 +++--- src/anndata/experimental/__init__.py | 8 +++--- src/anndata/experimental/_dispatch_io.py | 33 ++++-------------------- 4 files changed, 15 insertions(+), 38 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 48a9ee4a9..e018d0602 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -133,8 +133,8 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", - "anndata._io.specs.registry.read_callback": "anndata.experimental.read_callback", - "anndata._io.specs.registry.write_callback": "anndata.experimental.write_callback", + "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", + "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 4c324a005..d4ef3b91c 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -257,7 +257,7 @@ def _iter_patterns( InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) -class read_callback(Protocol, Generic[InMemoryType]): +class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, @@ -289,7 +289,7 @@ def __call__( class Reader: def __init__( - self, registry: IORegistry, callback: read_callback | None = None + self, registry: IORegistry, callback: ReadCallback | None = None ) -> None: self.registry = registry self.callback = callback @@ -313,7 +313,7 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class write_callback(Protocol): +class WriteCallback(Protocol): def __call__( self, /, @@ -349,7 +349,7 
@@ def __call__( class Writer: - def __init__(self, registry: IORegistry, callback: write_callback | None = None): + def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry self.callback = callback diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index af21c8e15..4b2101ffe 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,10 +7,10 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + ReadCallback, Reader, + WriteCallback, Writer, - read_callback, - write_callback, ) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk @@ -33,7 +33,7 @@ "InMemoryArrayOrScalarType", "Reader", "Writer", - "read_callback", - "write_callback", + "ReadCallback", + "WriteCallback", "StorageType", ] diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index 549ca85c4..b48c09bb8 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -7,13 +7,13 @@ from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import read_callback, write_callback + from anndata._io.specs.registry import ReadCallback, WriteCallback from anndata._types import GroupStorageType, StorageType def read_dispatched( elem: StorageType, - callback: read_callback, + callback: ReadCallback, ) -> Any: """ Read elem, calling the callback at each sub-element. @@ -24,19 +24,7 @@ def read_dispatched( Storage container (e.g. `h5py.Group`, `zarr.Group`). This must have anndata element specifications. callback - Function to call at each anndata encoded element. See details below for - signature. - - - The callback has the following signature: - - * `read_func` (`Callable`): A callable which takes the encoded element and returns it's decoded value. - This is the default decoding function, and what to call if you don't want to modify the decoding. - It will call this callback again at the next element encoding it sees. - * `key` (`str`): They absolute key of the element in the store. This will be an absolute key. - * `elem` (`StorageType`): The encoded element. - * `iospec` (`IOSpec`): The specification of the element. This is passed as a keyword argument. - + Function to call at each anndata encoded element. See Also -------- @@ -53,7 +41,7 @@ def write_dispatched( store: GroupStorageType, key: str, elem: Any, - callback: write_callback, + callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: @@ -69,22 +57,11 @@ def write_dispatched( elem The element to write. Probably an AnnData. callback - Function called when writing each element. See below for signature. + Function called when writing each element. dataset_kwargs Keyword arguments to pass to the dataset creation function. - The callback has the following signature: - - * `write_func` (`Callable`): A callable which takes the in memory element and writes it to the store. - This is the default encoding function, and what to call if you don't want to change behaviour at this level. - * `store` (`GroupStorageType`): The store to write to. - * `key` (`str`): The key to write elem into store at. This will be an absolute key. - * `elem` (`Any`): The element to write. - * `dataset_kwargs` (`dict`): Keyword arguments to pass to the dataset creation function. This is passed as a keyword argument. 
- * `iospec` (`IOSpec`): The specification of the element. This is passed as a keyword argument. - - See Also -------- From 0bd87fcf23c16189648add7f638ae39d0e47b357 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:45:33 +0200 Subject: [PATCH 067/138] (feat): type read/write funcs in callback --- docs/api.md | 6 ++++-- src/anndata/_io/specs/registry.py | 23 ++++++++++++++++------- src/anndata/experimental/__init__.py | 4 ++++ 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/docs/api.md b/docs/api.md index c05efb71b..b0a1dfc61 100644 --- a/docs/api.md +++ b/docs/api.md @@ -134,9 +134,11 @@ Utilities for customizing the IO process: experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType experimental.Reader + experimental.Read experimental.Writer - experimental.read_callback - experimental.write_callback + experimental.Write + experimental.ReadCallback + experimental.WriteCallback experimental.StorageType ``` diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d4ef3b91c..0fc360715 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -78,7 +78,7 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class reader(Protocol): +class Read(Protocol): def __call__( self, elem: StorageType, @@ -86,12 +86,23 @@ def __call__( ) -> InMemoryReadElem: ... +class Write(Protocol): + def __call__( + self, + f: GroupStorageType, + k: str, + v: InMemoryReadElem, + _writer: Writer, + dataset_kwargs: MappingProxyType, + ) -> None: ... + + class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], reader] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ - tuple[type, type | tuple[type, str], frozenset[str]], Callable + tuple[type, type | tuple[type, str], frozenset[str]], Write ] = {} self.write_specs: dict[type | tuple[type, str], IOSpec] = {} @@ -261,7 +272,7 @@ class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, - read_func: Callable[[StorageType, Reader], InMemoryType], + read_func: Read, elem_name: str, elem: StorageType, iospec: IOSpec, @@ -317,9 +328,7 @@ class WriteCallback(Protocol): def __call__( self, /, - write_func: Callable[ - [GroupStorageType, str, InMemoryReadElem, Writer, MappingProxyType], None - ], + write_func: Write, store: GroupStorageType, elem_name: str, elem: InMemoryReadElem, diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 4b2101ffe..78726490e 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,8 +7,10 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + Read, ReadCallback, Reader, + Write, WriteCallback, Writer, ) @@ -32,7 +34,9 @@ "InMemoryReadElem", "InMemoryArrayOrScalarType", "Reader", + "Read", "Writer", + "Write", "ReadCallback", "WriteCallback", "StorageType", From 5997678dc2776d1e24a65b9580356f339f10598b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:52:02 +0200 Subject: [PATCH 068/138] (fix): use generic for `Read` as well. 
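A read function only produces the in-memory element, so its type variable can be covariant. A minimal self-contained sketch of the same pattern (illustrative names, not the anndata API):

    from typing import Generic, Protocol, TypeVar

    T_co = TypeVar("T_co", covariant=True)

    class Loader(Protocol, Generic[T_co]):
        def __call__(self, path: str) -> T_co: ...

Covariance is what lets a callable typed as returning a concrete `dict` be used where a `Loader[Mapping]` is expected, which mirrors how the registry hands specialized read functions to generic callers.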
--- src/anndata/_io/specs/registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 0fc360715..c6068f65f 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -78,7 +78,10 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class Read(Protocol): +InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem, covariant=True) + + +class Read(Protocol, Generic[InMemoryType]): def __call__( self, elem: StorageType, @@ -265,14 +268,11 @@ def _iter_patterns( yield t -InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem) - - class ReadCallback(Protocol, Generic[InMemoryType]): def __call__( self, /, - read_func: Read, + read_func: Read[InMemoryType], elem_name: str, elem: StorageType, iospec: IOSpec, From f20833201c7197c0e311e489027071bbad5aa7b1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 11:55:44 +0200 Subject: [PATCH 069/138] (fix): need more aliases --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index e018d0602..f5e54f8ea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -135,6 +135,8 @@ def setup(app: Sphinx): "anndata._core.anndata.AnnData": "anndata.AnnData", "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", + "anndata._io.specs.registry.Read": "anndata.experimental.Read", + "anndata._io.specs.registry.Write": "anndata.experimental.Write", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", From eb69fcba70d8d2816350bc20141c5c1e6237c9a2 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 13:07:42 +0200 Subject: [PATCH 070/138] Split table, format --- docs/api.md | 13 +++++++++++-- src/anndata/experimental/_dispatch_io.py | 8 +++----- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/api.md b/docs/api.md index b0a1dfc61..a6d92211c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -82,7 +82,8 @@ Writing to other formats. API's in the experimental module are currently in development and subject to change at any time. ``` -Two classes for working with batched access to collections of many `AnnData` objects or `h5ad` files. In paritcular, for pytorch-based models. +Two classes for working with batched access to collections of many `AnnData` objects or `h5ad` files. +In particular, for pytorch-based models. ```{eval-rst} .. autosummary:: @@ -112,7 +113,7 @@ Out of core concatenation experimental.concat_on_disk ``` -Low level methods for reading and writing elements of an `` AnnData` `` object to a store: +Low level methods for reading and writing elements of an `AnnData` object to a store: ```{eval-rst} .. autosummary:: @@ -130,6 +131,14 @@ Utilities for customizing the IO process: experimental.read_dispatched experimental.write_dispatched +``` + +Types used by the former: + +```{eval-rst} +.. autosummary:: + :toctree: generated/ + experimental.IOSpec experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index b48c09bb8..22b684cf5 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -21,13 +21,13 @@ def read_dispatched( Params ------ elem - Storage container (e.g. `h5py.Group`, `zarr.Group`). 
This must have anndata - element specifications. + Storage container (e.g. `h5py.Group`, `zarr.Group`). + This must have anndata element specifications. callback Function to call at each anndata encoded element. + See Also -------- - :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Reader @@ -61,10 +61,8 @@ def write_dispatched( dataset_kwargs Keyword arguments to pass to the dataset creation function. - See Also -------- - :doc:`/tutorials/notebooks/{read,write}_dispatched` """ from anndata._io.specs import _REGISTRY, Writer From 477bbefc50507cbc9c62cef2f49f6b1d9f98fbbd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 13:52:36 +0200 Subject: [PATCH 071/138] (refactor): move to `_types` file --- docs/conf.py | 8 +- src/anndata/_io/specs/registry.py | 100 +++-------------- src/anndata/_types.py | 131 ++++++++++++++++++++++- src/anndata/experimental/__init__.py | 6 +- src/anndata/experimental/_dispatch_io.py | 8 +- 5 files changed, 159 insertions(+), 94 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f5e54f8ea..96d94fa58 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -133,10 +133,10 @@ def setup(app: Sphinx): "h5py._hl.files.File": "h5py.File", "h5py._hl.dataset.Dataset": "h5py.Dataset", "anndata._core.anndata.AnnData": "anndata.AnnData", - "anndata._io.specs.registry.ReadCallback": "anndata.experimental.ReadCallback", - "anndata._io.specs.registry.WriteCallback": "anndata.experimental.WriteCallback", - "anndata._io.specs.registry.Read": "anndata.experimental.Read", - "anndata._io.specs.registry.Write": "anndata.experimental.Write", + "anndata._types.ReadCallback": "anndata.experimental.ReadCallback", + "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", + "anndata._types.Read": "anndata.experimental.Read", + "anndata._types.Write": "anndata.experimental.Write", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index c6068f65f..fbebcfc06 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from functools import singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, Union +from typing import TYPE_CHECKING, Union import pandas as pd @@ -18,7 +18,13 @@ from typing import Any, TypeAlias from anndata._core.storage import StorageType - from anndata._types import GroupStorageType + from anndata._types import ( + GroupStorageType, + Read, + ReadCallback, + Write, + WriteCallback, + ) InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], @@ -78,28 +84,6 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -InMemoryType = TypeVar("InMemoryType", bound=InMemoryReadElem, covariant=True) - - -class Read(Protocol, Generic[InMemoryType]): - def __call__( - self, - elem: StorageType, - _reader: Reader, - ) -> InMemoryReadElem: ... - - -class Write(Protocol): - def __call__( - self, - f: GroupStorageType, - k: str, - v: InMemoryReadElem, - _writer: Writer, - dataset_kwargs: MappingProxyType, - ) -> None: ... 
- - class IORegistry: def __init__(self): self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} @@ -268,36 +252,6 @@ def _iter_patterns( yield t -class ReadCallback(Protocol, Generic[InMemoryType]): - def __call__( - self, - /, - read_func: Read[InMemoryType], - elem_name: str, - elem: StorageType, - iospec: IOSpec, - ) -> InMemoryType: - """ - Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. - - Params - ------ - read_func - :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. - elem_name - The key to read in from the group. - elem - The element to read from. - iospec - Internal AnnData encoding specification for the element. - - Returns - ------- - The element read from the store. - """ - ... - - class Reader: def __init__( self, registry: IORegistry, callback: ReadCallback | None = None @@ -324,37 +278,13 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -class WriteCallback(Protocol): - def __call__( - self, - /, - write_func: Write, - store: GroupStorageType, - elem_name: str, - elem: InMemoryReadElem, - *, - iospec: IOSpec, - dataset_kwargs: MappingProxyType, - ) -> None: - """ - Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. - - Params - ------ - write_func - :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. - store - The store to which `elem` should be written. - elem_name - The key to read in from the group. - elem - The element to write out. - iospec - Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. - """ - ... +InMemoryReadElem: TypeAlias = Union[ + dict[str, InMemoryArrayOrScalarType], + InMemoryArrayOrScalarType, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] class Writer: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 5827f5b6b..1cfaedbdf 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np import pandas as pd @@ -26,8 +26,11 @@ ) if TYPE_CHECKING: + from types import MappingProxyType from typing import TypeAlias + from anndata._io.specs.registry import IOSpec, Reader, Writer + __all__ = [ "ArrayStorageType", "GroupStorageType", @@ -55,3 +58,129 @@ ArrayStorageType = Union[ZarrArray, H5Array] GroupStorageType = Union[ZarrGroup, H5Group] StorageType = Union[ArrayStorageType, GroupStorageType] + +ContravariantInMemoryType = TypeVar( + "ContravariantInMemoryType", + bound="InMemoryReadElem", # noqa: F821 + contravariant=True, +) +CovariantInMemoryType = TypeVar( + "CovariantInMemoryType", + bound="InMemoryReadElem", # noqa: F821 + covariant=True, +) +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") # noqa: F821 + + +class Read(Protocol[CovariantInMemoryType]): + def __call__( + self, + elem: StorageType, + _reader: Reader, + ) -> CovariantInMemoryType: + """Low-level reading function for an element. + + Parameters + ---------- + elem + The element to read from. + _reader + The :class:`anndata.experimental.Reader` instance. + + Returns + ------- + The element read from the store. + """ + + ... 
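+# For orientation, an illustrative sketch: the simplest conforming `Read`
+# implementation mirrors the dense-array reader in `methods.py`, which
+# materializes the stored element as-is:
+#
+#     def read_array(elem, _reader):
+#         return elem[()]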
+ + +class Write(Protocol[ContravariantInMemoryType]): + def __call__( + self, + f: GroupStorageType, + k: str, + v: ContravariantInMemoryType, + _writer: Writer, + dataset_kwargs: MappingProxyType, + ) -> None: + """Low-level writing function for an element. + + Parameters + ---------- + f + The store to which `elem` should be written. + k + The key to read in from the group. + v + The element to write out. + _writer + The :class:`anndata.experimental.Writer` instance. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + """ + + ... + + +class ReadCallback(Protocol[InvariantInMemoryType]): + def __call__( + self, + /, + read_func: Read[InvariantInMemoryType], + elem_name: str, + elem: StorageType, + iospec: IOSpec, + ) -> InvariantInMemoryType: + """ + Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. + + Params + ------ + read_func + :func:`anndata.experimental.read_elem` function to call to read the current element given the ``iospec``. + elem_name + The key to read in from the group. + elem + The element to read from. + iospec + Internal AnnData encoding specification for the element. + + Returns + ------- + The element read from the store. + """ + ... + + +class WriteCallback(Protocol[InvariantInMemoryType]): + def __call__( + self, + /, + write_func: Write[InvariantInMemoryType], + store: GroupStorageType, + elem_name: str, + elem: InvariantInMemoryType, + *, + iospec: IOSpec, + dataset_kwargs: MappingProxyType, + ) -> None: + """ + Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. + + Params + ------ + write_func + :func:`anndata.experimental.write_elem` function to call to read the current element given the ``iospec``. + store + The store to which `elem` should be written. + elem_name + The key to read in from the group. + elem + The element to write out. + iospec + Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + """ + ... diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 78726490e..9e3f91191 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,12 +7,14 @@ from .._io.specs.registry import ( InMemoryArrayOrScalarType, InMemoryReadElem, + Reader, + Writer, +) +from .._types import ( Read, ReadCallback, - Reader, Write, WriteCallback, - Writer, ) from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index b48c09bb8..789f158af 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -7,8 +7,12 @@ from collections.abc import Mapping from typing import Any - from anndata._io.specs.registry import ReadCallback, WriteCallback - from anndata._types import GroupStorageType, StorageType + from anndata._types import ( + GroupStorageType, + ReadCallback, + StorageType, + WriteCallback, + ) def read_dispatched( From 8d23f6f443a21895474d888726f29fa193d6f0d9 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 9 Jul 2024 14:45:56 +0200 Subject: [PATCH 072/138] bump scanpydoc --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 310f57fd2..17898ec55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ doc = [ "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", - "scanpydoc[theme,typehints] >=0.13.4", + "scanpydoc[theme,typehints] >=0.13.5", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks From 9b647c2838b7722b831e2dc3aa1b6148678950f6 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 9 Jul 2024 15:18:59 +0200 Subject: [PATCH 073/138] Some basic syntax fixes --- src/anndata/_io/specs/registry.py | 9 --------- src/anndata/_types.py | 12 +++++------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index fbebcfc06..f61fd9ee3 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -278,15 +278,6 @@ def read_elem( return self.callback(read_func, elem.name, elem, iospec=iospec) -InMemoryReadElem: TypeAlias = Union[ - dict[str, InMemoryArrayOrScalarType], - InMemoryArrayOrScalarType, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] - - class Writer: def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 1cfaedbdf..0ddce2e5d 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -11,8 +11,8 @@ from numpy.typing import NDArray from scipy import sparse -from anndata._core.sparse_dataset import BaseCompressedSparseDataset -from anndata.compat import ( +from ._core.sparse_dataset import BaseCompressedSparseDataset +from .compat import ( AwkArray, CupyArray, CupySparseMatrix, @@ -29,7 +29,7 @@ from types import MappingProxyType from typing import TypeAlias - from anndata._io.specs.registry import IOSpec, Reader, Writer + from ._io.specs.registry import IOSpec, Reader, Writer __all__ = [ "ArrayStorageType", @@ -91,8 +91,7 @@ def __call__( ------- The element read from the store. """ - - ... + ... class Write(Protocol[ContravariantInMemoryType]): @@ -119,8 +118,7 @@ def __call__( dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ - - ... + ... class ReadCallback(Protocol[InvariantInMemoryType]): From 5ef93e1b3fa5edab43830d827f6f244cc2addaf2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:52:34 +0200 Subject: [PATCH 074/138] (fix): change `Read{Callback}` type for kwargs --- src/anndata/_types.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 0ddce2e5d..1f97673cf 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -77,6 +77,8 @@ def __call__( self, elem: StorageType, _reader: Reader, + *, + dataset_kwargs: MappingProxyType, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -86,6 +88,8 @@ def __call__( The element to read from. _reader The :class:`anndata.experimental.Reader` instance. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. 
Returns ------- @@ -129,6 +133,8 @@ def __call__( elem_name: str, elem: StorageType, iospec: IOSpec, + *, + dataset_kwargs: MappingProxyType, ) -> InvariantInMemoryType: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. @@ -143,6 +149,8 @@ def __call__( elem The element to read from. iospec Internal AnnData encoding specification for the element. + dataset_kwargs + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. Returns ------- From 9cfe9086ce6a97e9d6cf7833b9e496728881e00a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:55:52 +0200 Subject: [PATCH 075/138] (chore): test `chunks` argument --- tests/test_io_elementwise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 1d7d01241..31149a1ae 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -282,6 +282,7 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) + assert X_dask_from_disk.chunksize == chunks X_from_disk = read_elem(arr_store["X"]) file.close() with ( From 99fc6db05a6b075b0c01760ea9d8e6f1f8e9ec35 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:57:32 +0200 Subject: [PATCH 076/138] (fix): type `read_recarray` --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 581d5d90a..9bc882c57 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -527,7 +527,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d, _reader) -> np.recarray | npt.NDArray: +def read_recarray(d: StorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) From b5bccc3818e9bed4f610bf31a367b79ee3a3531e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 15:59:50 +0200 Subject: [PATCH 077/138] (fix): `GroupStorageType` not `StorageType` --- src/anndata/_io/specs/methods.py | 33 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 9bc882c57..a3e6662c4 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,6 @@ from numpy import typing as npt - from anndata._core.storage import StorageType from anndata._types import GroupStorageType, InMemoryArrayOrScalarType from anndata.compat import SpArray @@ -119,7 +118,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -141,7 +140,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix |
SpArray: from anndata._io import zarr @@ -284,7 +283,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: StorageType, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -326,7 +325,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) def read_mapping( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -431,7 +430,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: StorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: GroupStorageType, _reader: Reader) -> npt.NDArray: return elem[()] @@ -527,7 +526,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: StorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: GroupStorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -711,7 +710,7 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem: StorageType, _reader: Reader) -> sparse.spmatrix | SpArray: +def read_sparse(elem: GroupStorageType, _reader: Reader) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -755,7 +754,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem: StorageType, _reader: Reader) -> AwkArray: +def read_awkward(elem: GroupStorageType, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -823,7 +822,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem: StorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -864,7 +863,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem: StorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -934,7 +933,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) 
@_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem: StorageType, _reader: Reader) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -985,7 +984,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -998,7 +997,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: StorageType, _reader: Reader + elem: GroupStorageType, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1015,7 +1014,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: StorageType, _reader: Reader) -> np.number: +def read_scalar(elem: GroupStorageType, _reader: Reader) -> np.number: return elem[()] @@ -1060,12 +1059,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: StorageType, _reader: Reader) -> str: +def read_hdf5_string(elem: GroupStorageType, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: StorageType, _reader: Reader) -> str: +def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: return str(elem[()]) From e5ea2b0520ead9675df7ca5c35ff6157f1fcee14 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:04:47 +0200 Subject: [PATCH 078/138] (fix): little type fixes --- src/anndata/_io/specs/methods.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a3e6662c4..6f58093a8 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -41,7 +41,11 @@ from numpy import typing as npt - from anndata._types import GroupStorageType, InMemoryArrayOrScalarType + from anndata._types import ( + ArrayStorageType, + GroupStorageType, + InMemoryArrayOrScalarType, + ) from anndata.compat import SpArray from .registry import Reader, Writer @@ -118,7 +122,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: GroupStorageType, _reader: Reader + elem: H5File | H5Group | H5Array, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -140,7 +144,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: GroupStorageType, _reader: Reader + elem: ZarrGroup | ZarrArray, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -283,7 +287,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", 
"0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: GroupStorageType, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -430,7 +434,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: GroupStorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: ArrayStorageType, _reader: Reader) -> npt.NDArray: return elem[()] @@ -447,7 +451,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d, _reader): +def read_string_array(d: H5Array, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -526,7 +530,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: GroupStorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -1014,7 +1018,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: GroupStorageType, _reader: Reader) -> np.number: +def read_scalar(elem: ArrayStorageType, _reader: Reader) -> np.number: return elem[()] @@ -1059,12 +1063,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: GroupStorageType, _reader: Reader) -> str: +def read_hdf5_string(elem: H5Array, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: +def read_zarr_string(elem: ZarrArray, _reader: Reader) -> str: return str(elem[()]) @@ -1075,7 +1079,7 @@ def read_zarr_string(elem: GroupStorageType, _reader: Reader) -> str: @_REGISTRY.register_write(H5Group, np.str_, IOSpec("string", "0.2.0")) @_REGISTRY.register_write(H5Group, str, IOSpec("string", "0.2.0")) def write_string( - f: GroupStorageType, + f: H5Group, k: str, v: np.str_ | str, _writer: Writer, From 6ac72d63d8e9b92c40df12354aec0c79651b8ccb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:22:48 +0200 Subject: [PATCH 079/138] (fix): clarify `H5File` typing --- src/anndata/_io/specs/methods.py | 8 +++----- src/anndata/_types.py | 9 ++++++--- src/anndata/compat/__init__.py | 1 + 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 6f58093a8..3f4e73358 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -26,6 +26,9 @@ CupyCSCMatrix, CupyCSRMatrix, DaskArray, + H5Array, + H5File, + H5Group, ZarrArray, ZarrGroup, _decode_structured_array, @@ -50,11 +53,6 @@ from .registry import Reader, Writer -H5Array = h5py.Dataset -H5Group = h5py.Group -H5File = h5py.File - - #################### # Dask utils # #################### diff --git a/src/anndata/_types.py 
b/src/anndata/_types.py index 0ddce2e5d..b9b0065fd 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,6 +30,9 @@ from typing import TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer + from .compat import ( + H5File, + ) __all__ = [ "ArrayStorageType", @@ -75,7 +78,7 @@ class Read(Protocol[CovariantInMemoryType]): def __call__( self, - elem: StorageType, + elem: StorageType | H5File, _reader: Reader, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -97,7 +100,7 @@ def __call__( class Write(Protocol[ContravariantInMemoryType]): def __call__( self, - f: GroupStorageType, + f: StorageType, k: str, v: ContravariantInMemoryType, _writer: Writer, @@ -156,7 +159,7 @@ def __call__( self, /, write_func: Write[InvariantInMemoryType], - store: GroupStorageType, + store: StorageType, elem_name: str, elem: InvariantInMemoryType, *, diff --git a/src/anndata/compat/__init__.py b/src/anndata/compat/__init__.py index da67141b1..6edc7e2c9 100644 --- a/src/anndata/compat/__init__.py +++ b/src/anndata/compat/__init__.py @@ -49,6 +49,7 @@ class Empty: Index = Union[Index1D, tuple[Index1D, Index1D], scipy.sparse.spmatrix, SpArray] H5Group = h5py.Group H5Array = h5py.Dataset +H5File = h5py.File ############################# From 989dc6546275d99b04520f60a9e230f06d586600 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:44:15 +0200 Subject: [PATCH 080/138] (fix): dask doc --- src/anndata/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 1f97673cf..742089ce6 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -150,7 +150,7 @@ def __call__( iospec Internal AnnData encoding specification for the element. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. Returns ------- From 36b0207e799f220f8aead955274a360bb58917a3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 16:52:47 +0200 Subject: [PATCH 081/138] (fix): dask docs --- docs/conf.py | 1 + src/anndata/_types.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 96d94fa58..fe6833d8d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -126,6 +126,7 @@ def setup(app: Sphinx): scipy=("https://docs.scipy.org/doc/scipy/", None), sklearn=("https://scikit-learn.org/stable/", None), zarr=("https://zarr.readthedocs.io/en/stable/", None), + dask=("https://docs.dask.org/en/stable/", None), xarray=("https://xarray.pydata.org/en/stable/", None), ) qualname_overrides = { diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 742089ce6..b63ce83ed 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -89,7 +89,7 @@ def __call__( _reader The :class:`anndata.experimental.Reader` instance. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask.from_zarr`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. Returns ------- @@ -120,7 +120,7 @@ def __call__( _writer The :class:`anndata.experimental.Writer` instance. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. """ ... 
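The patches above thread a `dataset_kwargs` mapping through every registered read function, which is how a caller-supplied `chunks` value reaches the library-level io call. A minimal usage sketch, mirroring `test_read_lazy_h5_chunk_kwargs` in tests/test_io_elementwise.py (the import paths below are assumptions, not part of these diffs):

    import h5py
    import numpy as np
    from anndata._io.specs import read_elem, write_elem
    from anndata._io.specs.registry import read_elem_as_dask  # assumed location

    with h5py.File("example.h5", "w") as f:
        write_elem(f, "X", np.random.rand(500, 200))

    f = h5py.File("example.h5", "r")
    # `chunks` is forwarded via dataset_kwargs to the registered lazy read function
    X_lazy = read_elem_as_dask(f["X"], chunks=(100, 200))
    assert X_lazy.chunksize == (100, 200)
    assert (X_lazy.compute() == read_elem(f["X"])).all()
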
From ca6cf6629103b5e67e2c48aca249714a38ad07de Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:15:43 +0200 Subject: [PATCH 082/138] (fix): typing --- src/anndata/_io/specs/lazy_methods.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a65fbc91b..b4f9ed982 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,6 +19,8 @@ from collections.abc import Mapping from typing import Any, Literal + from .registry import Reader + @overload def make_block_indexer( @@ -96,7 +98,7 @@ def _(x): @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, - _reader, + _reader: Reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da @@ -145,9 +147,8 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem, - _reader, - chunks: tuple[int] | None = None, + elem: H5Array, + _reader: Reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da From eabaf3512ffba12abff52d8657f696a8e25ae924 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:20:20 +0200 Subject: [PATCH 083/138] (fix): handle case when `chunks` is `None` --- src/anndata/_io/specs/lazy_methods.py | 7 +++++-- src/anndata/_io/specs/registry.py | 4 +++- tests/test_io_elementwise.py | 26 ++++++++++++++++++++++---- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index b4f9ed982..c0597a49f 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -184,8 +184,11 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem, _reader, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}) + elem: ZarrArray, + _reader: Reader, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): + chunks: tuple[int, ...] 
= dataset_kwargs.get("chunks", elem.chunks) import dask.array as da - return da.from_zarr(elem) + return da.from_zarr(elem, chunks=chunks) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 1a4d2913f..067728d41 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -387,7 +387,9 @@ def read_elem_as_dask( ------- DaskArray """ - return Reader(_LAZY_REGISTRY).read_elem(elem, dataset_kwargs={"chunks": chunks}) + return Reader(_LAZY_REGISTRY).read_elem( + elem, dataset_kwargs={"chunks": chunks} if chunks is not None else {} + ) def write_elem( diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 31149a1ae..fa867cac6 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -59,7 +59,7 @@ def store(request, tmp_path) -> H5Group | ZarrGroup: sparse_formats = ["csr", "csc"] -SIZE = 1000 +SIZE = 2500 @pytest.fixture(params=sparse_formats) @@ -235,7 +235,15 @@ def test_read_lazy_2d_dask(sparse_format, store): @pytest.mark.parametrize( ("n_dims", "chunks"), - [(1, (100,)), (1, (400,)), (2, (100, 100)), (2, (400, 400)), (2, (200, 400))], + [ + (1, (100,)), + (1, (400,)), + (2, (100, 100)), + (2, (400, 400)), + (2, (200, 400)), + (1, None), + (2, None), + ], ) def test_read_lazy_nd_dask(store, n_dims, chunks): arr_store = create_dense_store(store, n_dims) @@ -269,7 +277,13 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): @pytest.mark.parametrize( ("arr_type", "chunks"), - [("dense", (100, 100)), ("csc", (SIZE, 10)), ("csr", (10, SIZE))], + [ + ("dense", (100, 100)), + ("csc", (SIZE, 10)), + ("csr", (10, SIZE)), + ("csc", None), + ("csr", None), + ], ) def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): import dask.distributed as dd @@ -282,7 +296,11 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - assert X_dask_from_disk.chunksize == chunks + if chunks is not None: + assert X_dask_from_disk.chunksize == chunks + else: + # assert that sparse chunks are set correctly by default + assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE X_from_disk = read_elem(arr_store["X"]) file.close() with ( From 4c398c3f2329ab3c0212670652ea1936255375e5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:26:59 +0200 Subject: [PATCH 084/138] (feat): add string-array reading --- src/anndata/_io/specs/lazy_methods.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index c0597a49f..a4aefc07e 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -145,6 +145,7 @@ def make_dask_chunk(block_id: tuple[int, int]): return da_mtx +@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, @@ -182,6 +183,7 @@ def make_dask_chunk(block_id: tuple[int, int]): ) +@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, From d6fc8a47934d2a424bad95c0c43ad6898da6f17a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Jul 2024 17:40:03 +0200 Subject: [PATCH 085/138] (fix): remove `string-array` because it is not tested --- src/anndata/_io/specs/lazy_methods.py | 2 -- 1 file 
changed, 2 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index a4aefc07e..c0597a49f 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -145,7 +145,6 @@ def make_dask_chunk(block_id: tuple[int, int]): return da_mtx -@_LAZY_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, @@ -183,7 +182,6 @@ def make_dask_chunk(block_id: tuple[int, int]): ) -@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, From 33aebb26c6100e74e24124d483b2f47f4b198480 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 11:33:12 +0200 Subject: [PATCH 086/138] (refactor): clean up tests --- tests/test_io_elementwise.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index fa867cac6..750dfa66b 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -245,7 +245,7 @@ def test_read_lazy_2d_dask(sparse_format, store): (2, None), ], ) -def test_read_lazy_nd_dask(store, n_dims, chunks): +def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): arr_store = create_dense_store(store, n_dims) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) X_from_disk = read_elem(arr_store["X"]) @@ -285,11 +285,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): ("csr", None), ], ) -def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): - import dask.distributed as dd - - file = h5py.File(tmp_path / "test.h5", "w") - store = file["/"] +def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): if arr_type == "dense": arr_store = create_dense_store(store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) @@ -302,15 +298,10 @@ def test_read_lazy_h5_chunk_kwargs(arr_type, chunks, tmp_path): # assert that sparse chunks are set correctly by default assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE X_from_disk = read_elem(arr_store["X"]) - file.close() - with ( - dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster, - dd.Client(cluster) as _client, - ): - assert_equal(X_from_disk, X_dask_from_disk) + assert_equal(X_from_disk, X_dask_from_disk) -def test_read_lazy_h5_bad_chunk_kwargs(tmp_path): +def test_read_lazy_bad_chunk_kwargs(tmp_path): arr_type = "csr" file = h5py.File(tmp_path / "test.h5", "w") store = file["/"] From 701cd8527f7a3d465ab045461f37db644cf4f894 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 Jul 2024 12:54:26 +0200 Subject: [PATCH 087/138] (fix): overfetching problem --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index c0597a49f..52d29eb5e 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -45,7 +45,7 @@ def make_block_indexer( ) -> tuple[slice, slice] | tuple[slice]: index1d = slice( block_id[is_csc] * stride, - min((block_id[is_csc] * stride) + stride, shape[0]), + min((block_id[is_csc] * stride) + stride, shape[is_csc]), ) if is_csc: return (slice(None), index1d) @@ -105,7 +105,7 @@ def read_sparse_as_dask( path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem 
elem_name = get_elem_name(elem) - shape: tuple[int, int] = elem.attrs["shape"] + shape: tuple[int, int] = tuple(elem.attrs["shape"]) dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" @@ -155,7 +155,7 @@ def read_h5_array( path = Path(elem.file.filename) elem_name = elem.name - shape = elem.shape + shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = dataset_kwargs.get( "chunks", (_DEFAULT_STRIDE,) * len(shape) From 43b21a21385465d54fd74fc85ecd03a6e0b15227 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 09:02:14 +0200 Subject: [PATCH 088/138] Fix circular import --- src/anndata/_types.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index b9b0065fd..4853c8f15 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -30,9 +30,7 @@ from typing import TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer - from .compat import ( - H5File, - ) + from .compat import H5File __all__ = [ "ArrayStorageType", @@ -63,16 +61,12 @@ StorageType = Union[ArrayStorageType, GroupStorageType] ContravariantInMemoryType = TypeVar( - "ContravariantInMemoryType", - bound="InMemoryReadElem", # noqa: F821 - contravariant=True, + "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True ) CovariantInMemoryType = TypeVar( - "CovariantInMemoryType", - bound="InMemoryReadElem", # noqa: F821 - covariant=True, + "CovariantInMemoryType", bound="InMemoryReadElem", covariant=True ) -InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") # noqa: F821 +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") class Read(Protocol[CovariantInMemoryType]): @@ -185,3 +179,8 @@ def __call__( Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... + + +if TYPE_CHECKING: + # Needs to be at the end because Sphinx’s type import suffers from circular imports + from ._io.specs.registry import InMemoryReadElem From 0e22449573ba2ac88d6c9ffe109b47d28d8bc2fb Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 09:18:01 +0200 Subject: [PATCH 089/138] add some typing --- src/anndata/_io/specs/registry.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f61fd9ee3..181848d41 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -15,7 +15,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeAlias + from typing import Any, TypeAlias, TypeVar from anndata._core.storage import StorageType from anndata._types import ( @@ -26,6 +26,9 @@ WriteCallback, ) + T = TypeVar("T") + W = TypeVar("W", bound=Write) + InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], InMemoryArrayOrScalarType, @@ -47,7 +50,7 @@ class IOSpec: class IORegistryError(Exception): @classmethod def _from_write_parts( - cls, dest_type: type, typ: type, modifiers: frozenset[str] + cls, dest_type: type | tuple[type, str], typ: type, modifiers: frozenset[str] ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: @@ -71,7 +74,7 @@ def _from_read_parts( def write_spec(spec: IOSpec): - def decorator(func: Callable): + def decorator(func: W) -> W: @wraps(func) def wrapper(g: GroupStorageType, k: str, *args, **kwargs): result = func(g, k, *args, **kwargs) @@ -99,7 +102,7 @@ def register_write( src_type: type | tuple[type, str], spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ): + ) -> Callable[[Write[T]], Write[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -125,7 +128,7 @@ def get_writer( dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str] = frozenset(), - ): + ) -> Write: import h5py if dest_type is h5py.File: @@ -141,7 +144,7 @@ def has_writer( dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str], - ): + ) -> bool: return (dest_type, src_type, modifiers) in self.write def register_read( @@ -149,7 +152,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ): + ) -> Callable[[Read[T]], Read[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -161,7 +164,7 @@ def _register(func): def get_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() - ): + ) -> Read: if (src_type, spec, modifiers) in self.read: return self.read[(src_type, spec, modifiers)] else: @@ -171,7 +174,7 @@ def get_reader( def has_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() - ): + ) -> bool: return (src_type, spec, modifiers) in self.read def register_read_partial( From ec546f451067fb88407179f7acb81ae9f3bc56af Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 10:05:27 +0200 Subject: [PATCH 090/138] fix mapping types --- src/anndata/_io/h5ad.py | 2 +- src/anndata/_io/specs/methods.py | 109 +++++++++++++++++++------------ src/anndata/_types.py | 9 +-- 3 files changed, 73 insertions(+), 47 deletions(-) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 142acc77d..2cd2fca48 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -47,7 +47,7 @@ def write_h5ad( adata: AnnData, *, as_dense: Sequence[str] = (), - dataset_kwargs: Mapping = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), **kwargs, ) -> None: if isinstance(as_dense, str): diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 3f4e73358..8107d88dd 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -40,7 +40,7 @@ if TYPE_CHECKING: from os import PathLike - from typing import Literal + from typing import Any, Literal from numpy import typing as npt @@ -100,11 +100,13 @@ def wrapper( f, k, cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix, - _writer, *, + _writer: Writer, dataset_kwargs=MappingProxyType, ): - return write_func(f, k, cupy_val.get(), _writer, dataset_kwargs=dataset_kwargs) + return write_func( + f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs + ) return wrapper @@ -120,7 +122,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: H5File | H5Group | H5Array, _reader: Reader + elem: H5File | H5Group | H5Array, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -142,7 +144,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: ZarrGroup | ZarrArray, _reader: Reader + elem: ZarrGroup | ZarrArray, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -263,8 +265,9 @@ def write_anndata( f: GroupStorageType, k: str, adata: AnnData, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) @@ -285,7 +288,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata(elem: GroupStorageType | H5File, _reader: Reader) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -310,8 +313,9 @@ def write_raw( f: GroupStorageType, k: str, raw: Raw, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) _writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs) @@ -327,7 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) def read_mapping( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -338,8 +342,9 @@ def write_mapping( f: 
GroupStorageType, k: str, v: dict, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) for sub_k, sub_v in v.items(): @@ -357,8 +362,9 @@ def write_list( f: GroupStorageType, k: str, elem: list, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): _writer.write_elem(f, k, np.array(elem), dataset_kwargs=dataset_kwargs) @@ -378,8 +384,9 @@ def write_basic( f: GroupStorageType, k: str, elem: views.ArrayView | np.ndarray | h5py.Dataset | np.ma.MaskedArray | ZarrArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles natively.""" f.create_dataset(k, data=elem, **dataset_kwargs) @@ -398,8 +405,9 @@ def write_basic_dask_zarr( f: ZarrGroup, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da @@ -414,8 +422,9 @@ def write_basic_dask_h5( f: H5Group, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import dask.array as da import dask.config as dc @@ -432,7 +441,7 @@ @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array(elem: ArrayStorageType, _reader: Reader) -> npt.NDArray: +def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray: return elem[()] @@ -449,7 +458,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array(d: H5Array, _reader: Reader): +def read_string_array(d: H5Array, *, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -470,8 +479,9 @@ def write_vlen_string_array( f: H5Group, k: str, elem: np.ndarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): """Write methods which underlying library handles natively.""" str_dtype = h5py.special_dtype(vlen=str) @@ -490,8 +500,9 @@ def write_vlen_string_array_zarr( f: ZarrGroup, k: str, elem: np.ndarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): import numcodecs @@ -528,7 +539,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray(d: ArrayStorageType, _reader: Reader) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -543,8 +554,9 @@ def write_recarray( f: H5Group, k: str, elem: np.ndarray | np.recarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): f.create_dataset(k, data=_to_hdf5_vlen_strings(elem),
**dataset_kwargs) @@ -555,8 +567,9 @@ def write_recarray_zarr( f: ZarrGroup, k: str, elem: np.ndarray | np.recarray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import _to_fixed_length_strings @@ -572,6 +585,7 @@ def write_sparse_compressed( f: GroupStorageType, key: str, value: sparse.spmatrix | SpArray, + *, _writer: Writer, fmt: Literal["csr", "csc"], dataset_kwargs=MappingProxyType({}), @@ -632,14 +646,15 @@ def write_sparse_dataset( f: GroupStorageType, k: str, elem: CSCDataset | CSRDataset, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): write_sparse_compressed( f, k, elem._to_backed(), - _writer, + _writer=_writer, fmt=elem.format, dataset_kwargs=dataset_kwargs, ) @@ -664,8 +679,9 @@ def write_dask_sparse( f: GroupStorageType, k: str, elem: DaskArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): sparse_format = elem._meta.format @@ -712,7 +728,9 @@ def chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) -def read_sparse(elem: GroupStorageType, _reader: Reader) -> sparse.spmatrix | SpArray: +def read_sparse( + elem: GroupStorageType, *, _reader: Reader +) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -741,8 +759,9 @@ def write_awkward( f: GroupStorageType, k: str, v: views.AwkwardArrayView | AwkArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): from anndata.compat import awkward as ak @@ -756,7 +775,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward(elem: GroupStorageType, _reader: Reader) -> AwkArray: +def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -779,8 +798,9 @@ def write_dataframe( f: GroupStorageType, key: str, df: views.DataFrameView | pd.DataFrame, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Check arguments for reserved in ("_index",): @@ -824,7 +844,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -865,7 +885,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0(elem: GroupStorageType, _reader: Reader) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = 
_read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -921,8 +941,9 @@ def write_categorical( f: GroupStorageType, k: str, v: pd.Categorical, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) g.attrs["ordered"] = bool(v.ordered) @@ -935,7 +956,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem: GroupStorageType, _reader: Reader) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -974,8 +995,9 @@ def write_nullable_integer( f: GroupStorageType, k: str, v: pd.arrays.IntegerArray | pd.arrays.BooleanArray, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) if v._mask is not None: @@ -986,7 +1008,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -999,7 +1021,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: GroupStorageType, _reader: Reader + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1016,7 +1038,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar(elem: ArrayStorageType, _reader: Reader) -> np.number: +def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number: return elem[()] @@ -1024,8 +1046,9 @@ def write_scalar( f: GroupStorageType, key: str, value, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): return f.create_dataset(key, data=np.array(value), **dataset_kwargs) @@ -1034,11 +1057,12 @@ def write_hdf5_scalar( f: H5Group, key: str, value, + *, _writer: Writer, - dataset_kwargs: MappingProxyType = MappingProxyType({}), + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): # Can’t compress scalars, error is thrown - dataset_kwargs = dataset_kwargs.copy() + dataset_kwargs = dict(dataset_kwargs) dataset_kwargs.pop("compression", None) dataset_kwargs.pop("compression_opts", None) f.create_dataset(key, data=np.array(value), **dataset_kwargs) @@ -1061,12 +1085,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string(elem: H5Array, _reader: Reader) -> str: +def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string(elem: ZarrArray, _reader: Reader) -> str: +def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str: return str(elem[()]) @@ -1080,8 +1104,9 @@ 
def write_string( f: H5Group, k: str, v: np.str_ | str, + *, _writer: Writer, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ): dataset_kwargs = dataset_kwargs.copy() dataset_kwargs.pop("compression", None) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 4853c8f15..f53d644dc 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -26,8 +26,8 @@ ) if TYPE_CHECKING: - from types import MappingProxyType - from typing import TypeAlias + from collections.abc import Mapping + from typing import Any, TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer from .compat import H5File @@ -97,8 +97,9 @@ def __call__( f: StorageType, k: str, v: ContravariantInMemoryType, + *, _writer: Writer, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ) -> None: """Low-level writing function for an element. @@ -158,7 +159,7 @@ def __call__( elem: InvariantInMemoryType, *, iospec: IOSpec, - dataset_kwargs: MappingProxyType, + dataset_kwargs: Mapping[str, Any], ) -> None: """ Callback used in :func:`anndata.experimental.write_dispatched` to customize writing an element to a store. From 7c2e4da9d01f60da91d0e3bcbac6e3899a3a120f Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:07:35 +0200 Subject: [PATCH 091/138] Fix Read/Write --- docs/api.md | 2 - src/anndata/_io/specs/registry.py | 65 ++++++++++++++++------------ src/anndata/_types.py | 27 +++++++++--- src/anndata/experimental/__init__.py | 9 +--- 4 files changed, 59 insertions(+), 44 deletions(-) diff --git a/docs/api.md b/docs/api.md index a6d92211c..a605afcc3 100644 --- a/docs/api.md +++ b/docs/api.md @@ -142,9 +142,7 @@ Types used by the former: experimental.IOSpec experimental.InMemoryReadElem experimental.InMemoryArrayOrScalarType - experimental.Reader experimental.Read - experimental.Writer experimental.Write experimental.ReadCallback experimental.WriteCallback diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 181848d41..d8ece4ef9 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from dataclasses import dataclass -from functools import singledispatch, wraps +from functools import partial, singledispatch, wraps from types import MappingProxyType from typing import TYPE_CHECKING, Union @@ -24,10 +24,12 @@ ReadCallback, Write, WriteCallback, + _ReadInternal, + _WriteInternal, ) T = TypeVar("T") - W = TypeVar("W", bound=Write) + W = TypeVar("W", bound=_WriteInternal) InMemoryReadElem: TypeAlias = Union[ dict[str, InMemoryArrayOrScalarType], @@ -50,7 +52,7 @@ class IOSpec: class IORegistryError(Exception): @classmethod def _from_write_parts( - cls, dest_type: type | tuple[type, str], typ: type, modifiers: frozenset[str] + cls, dest_type: type, typ: type | tuple[type, str], modifiers: frozenset[str] ) -> IORegistryError: msg = f"No method registered for writing {typ} into {dest_type}" if modifiers: @@ -89,10 +91,10 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], Read] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], _ReadInternal] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ - tuple[type, type | tuple[type, str], frozenset[str]], Write + tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal ] = {} self.write_specs: dict[type | tuple[type, str], 
IOSpec] = {} @@ -102,7 +104,7 @@ def register_write( src_type: type | tuple[type, str], spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[Write[T]], Write[T]]: + ) -> Callable[[_WriteInternal[T]], _WriteInternal[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -123,21 +125,23 @@ def _register(func): return _register - def get_writer( + def get_write_func( self, dest_type: type, src_type: type | tuple[type, str], modifiers: frozenset[str] = frozenset(), + *, + writer: Writer, ) -> Write: import h5py if dest_type is h5py.File: dest_type = h5py.Group - if (dest_type, src_type, modifiers) in self.write: - return self.write[(dest_type, src_type, modifiers)] - else: + if (dest_type, src_type, modifiers) not in self.write: raise IORegistryError._from_write_parts(dest_type, src_type, modifiers) + internal = self.write[(dest_type, src_type, modifiers)] + return partial(internal, _writer=writer) def has_writer( self, @@ -152,7 +156,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[Read[T]], Read[T]]: + ) -> Callable[[_ReadInternal[T]], _ReadInternal[T]]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -162,15 +166,20 @@ def _register(func): return _register - def get_reader( - self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() + def get_read_func( + self, + src_type: type, + spec: IOSpec, + modifiers: frozenset[str] = frozenset(), + *, + reader: Reader, ) -> Read: - if (src_type, spec, modifiers) in self.read: - return self.read[(src_type, spec, modifiers)] - else: + if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts( "read", _REGISTRY.read, src_type, spec ) + internal = self.read[(src_type, spec, modifiers)] + return partial(internal, _reader=reader) def has_reader( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() @@ -269,12 +278,10 @@ def read_elem( modifiers: frozenset[str] = frozenset(), ) -> InMemoryReadElem: """Read an element from a store. 
See exported function for more details.""" - from functools import partial iospec = get_spec(elem) - read_func = partial( - self.registry.get_reader(type(elem), iospec, modifiers), - _reader=self, + read_func = self.registry.get_read_func( + type(elem), iospec, modifiers, reader=self ) if self.callback is None: return read_func(elem) return self.callback(read_func, elem.name, elem, iospec=iospec) @@ -286,12 +293,18 @@ def __init__(self, registry: IORegistry, callback: WriteCallback | None = None): self.registry = registry self.callback = callback - def find_writer(self, dest_type: type, elem, modifiers: frozenset[str]): + def find_write_func( + self, dest_type: type, elem: Any, modifiers: frozenset[str] + ) -> Write: for pattern in _iter_patterns(elem): if self.registry.has_writer(dest_type, pattern, modifiers): - return self.registry.get_writer(dest_type, pattern, modifiers) + return self.registry.get_write_func( + dest_type, pattern, modifiers, writer=self + ) # Raises IORegistryError - return self.registry.get_writer(dest_type, type(elem), modifiers) + return self.registry.get_write_func( + dest_type, type(elem), modifiers, writer=self + ) @report_write_key_on_error def write_elem( @@ -303,7 +316,6 @@ def write_elem( dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), ): - from functools import partial from pathlib import PurePosixPath import h5py @@ -325,10 +337,7 @@ def write_elem( elif k in store: del store[k] - write_func = partial( - self.find_writer(dest_type, elem, modifiers), - _writer=self, - ) + write_func = self.find_write_func(dest_type, elem, modifiers) if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index f53d644dc..cdd6c98ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -69,11 +69,19 @@ InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") -class Read(Protocol[CovariantInMemoryType]): +class _ReadInternal(Protocol[CovariantInMemoryType]): def __call__( self, elem: StorageType | H5File, + *, _reader: Reader, + ) -> CovariantInMemoryType: ... + + +class Read(Protocol[CovariantInMemoryType]): + def __call__( + self, + elem: StorageType | H5File, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -81,8 +89,6 @@ def __call__( ---------- elem The element to read from. - _reader - The :class:`anndata.experimental.Reader` instance. Returns ------- @@ -91,7 +97,7 @@ def __call__( ... -class Write(Protocol[ContravariantInMemoryType]): +class _WriteInternal(Protocol[ContravariantInMemoryType]): def __call__( self, f: StorageType, @@ -100,6 +106,17 @@ def __call__( *, _writer: Writer, dataset_kwargs: Mapping[str, Any], + ) -> None: ... + + +class Write(Protocol[ContravariantInMemoryType]): + def __call__( + self, + f: StorageType, + k: str, + v: ContravariantInMemoryType, + *, + dataset_kwargs: Mapping[str, Any], + ) -> None: """Low-level writing function for an element. @@ -111,8 +128,6 @@ def __call__( The key to write to in the group. v The element to write out. - _writer - The :class:`anndata.experimental.Writer` instance. dataset_kwargs Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`.
""" diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 9e3f91191..93bcf54d8 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,12 +4,7 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import ( - InMemoryArrayOrScalarType, - InMemoryReadElem, - Reader, - Writer, -) +from .._io.specs.registry import InMemoryArrayOrScalarType, InMemoryReadElem from .._types import ( Read, ReadCallback, @@ -35,9 +30,7 @@ "CSCDataset", "InMemoryReadElem", "InMemoryArrayOrScalarType", - "Reader", "Read", - "Writer", "Write", "ReadCallback", "WriteCallback", From 1ba5b99eb6f28128d46b689dc75bd8dd98ba9818 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:13:05 +0200 Subject: [PATCH 092/138] Fix one more --- src/anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 8107d88dd..a0a840154 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -102,7 +102,7 @@ def wrapper( cupy_val: CupyArray | CupyCSCMatrix | CupyCSRMatrix, *, _writer: Writer, - dataset_kwargs=MappingProxyType, + dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): return write_func( f, k, cupy_val.get(), _writer=_writer, dataset_kwargs=dataset_kwargs From 49c0d490456abfe2a66ea5e98b8ba6b0e11c255c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 10:19:01 +0200 Subject: [PATCH 093/138] unify names --- src/anndata/_io/specs/registry.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index d8ece4ef9..72ac18a0b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -125,7 +125,7 @@ def _register(func): return _register - def get_write_func( + def get_write( self, dest_type: type, src_type: type | tuple[type, str], @@ -143,7 +143,7 @@ def get_write_func( internal = self.write[(dest_type, src_type, modifiers)] return partial(internal, _writer=writer) - def has_writer( + def has_write( self, dest_type: type, src_type: type | tuple[type, str], @@ -166,7 +166,7 @@ def _register(func): return _register - def get_read_func( + def get_read( self, src_type: type, spec: IOSpec, @@ -181,7 +181,7 @@ def get_read_func( internal = self.read[(src_type, spec, modifiers)] return partial(internal, _reader=reader) - def has_reader( + def has_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ) -> bool: return (src_type, spec, modifiers) in self.read @@ -201,7 +201,7 @@ def _register(func): return _register - def get_partial_reader( + def get_partial_read( self, src_type: type, spec: IOSpec, modifiers: frozenset[str] = frozenset() ): if (src_type, spec, modifiers) in self.read_partial: @@ -280,9 +280,7 @@ def read_elem( """Read an element from a store. 
See exported function for more details."""
         iospec = get_spec(elem)
-        read_func = self.registry.get_read_func(
-            type(elem), iospec, modifiers, reader=self
-        )
+        read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self)
         if self.callback is None:
             return read_func(elem)
         return self.callback(read_func, elem.name, elem, iospec=iospec)
@@ -297,14 +295,12 @@ def find_write_func(
         self, dest_type: type, elem: Any, modifiers: frozenset[str]
     ) -> Write:
         for pattern in _iter_patterns(elem):
-            if self.registry.has_writer(dest_type, pattern, modifiers):
-                return self.registry.get_write_func(
+            if self.registry.has_write(dest_type, pattern, modifiers):
+                return self.registry.get_write(
                     dest_type, pattern, modifiers, writer=self
                 )
         # Raises IORegistryError
-        return self.registry.get_write_func(
-            dest_type, type(elem), modifiers, writer=self
-        )
+        return self.registry.get_write(dest_type, type(elem), modifiers, writer=self)
@@ -402,9 +398,10 @@ def read_elem_partial(
     modifiers: frozenset[str] = frozenset(),
 ):
     """Read part of an element from an on disk store."""
-    return _REGISTRY.get_partial_reader(
+    read_partial = _REGISTRY.get_partial_read(
         type(elem), get_spec(elem), frozenset(modifiers)
-    )(elem, items=items, indices=indices)
+    )
+    return read_partial(elem, items=items, indices=indices)


 @singledispatch

From 36667358f4f1af07ea3c87e3628bb7c310683fba Mon Sep 17 00:00:00 2001
From: "Philipp A."
Date: Thu, 11 Jul 2024 10:47:59 +0200
Subject: [PATCH 094/138] clarify ReadCallback signature

---
 src/anndata/_types.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/anndata/_types.py b/src/anndata/_types.py
index cdd6c98ef..64cc54837 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -141,6 +141,7 @@ def __call__(
         read_func: Read[InvariantInMemoryType],
         elem_name: str,
         elem: StorageType,
+        *,
         iospec: IOSpec,
     ) -> InvariantInMemoryType:
         """

From 3a332ade227b5d945125b60d69dd34e15a0f7bd9 Mon Sep 17 00:00:00 2001
From: "Philipp A."
Date: Thu, 11 Jul 2024 10:59:43 +0200
Subject: [PATCH 095/138] Fix type aliases

---
 docs/conf.py          | 7 +++++--
 src/anndata/_types.py | 1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index 96d94fa58..1d0fc00be 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -140,9 +140,12 @@ def setup(app: Sphinx):
 }
 autodoc_type_aliases = dict(
     NDArray=":data:`~numpy.typing.NDArray`",
-    InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`",
-    InMemoryType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`",
     InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`",
+    InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`",
+    **{
+        f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryReadElem`"
+        for v in ["In", "Co", "Contra"]
+    },
 )

 # -- Social cards ---------------------------------------------------------

diff --git a/src/anndata/_types.py b/src/anndata/_types.py
index 64cc54837..c2eff0d4e 100644
--- a/src/anndata/_types.py
+++ b/src/anndata/_types.py
@@ -60,6 +60,7 @@
 GroupStorageType = Union[ZarrGroup, H5Group]
 StorageType = Union[ArrayStorageType, GroupStorageType]

+# NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py!
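# The TypeVar trio that follows encodes how the IO protocols may vary: readers
# only produce in-memory values (covariant), writers only consume them
# (contravariant), and callbacks do both (invariant). A minimal, self-contained
# sketch of the rule these declarations rely on, using hypothetical
# `Produces`/`Consumes` protocols and nothing but the stdlib:
#
#     from typing import Protocol, TypeVar
#
#     T_co = TypeVar("T_co", covariant=True)
#     T_contra = TypeVar("T_contra", contravariant=True)
#
#     class Produces(Protocol[T_co]):
#         def __call__(self) -> T_co: ...
#
#     class Consumes(Protocol[T_contra]):
#         def __call__(self, v: T_contra) -> None: ...
#
# With bool a subtype of int, a Produces[bool] is accepted where Produces[int]
# is expected, while a Consumes[int] is accepted where Consumes[bool] is expected.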
ContravariantInMemoryType = TypeVar( "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True ) From d0f4d13c4189261d59bd15ceeb62c71a1e79a2ae Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 12:47:39 +0200 Subject: [PATCH 096/138] (fix): clean up typing to use `RWAble` --- docs/api.md | 6 +++-- docs/conf.py | 7 +++--- src/anndata/_io/specs/methods.py | 10 ++++----- src/anndata/_io/specs/registry.py | 25 ++++++--------------- src/anndata/_types.py | 28 ++++++++++++++---------- src/anndata/experimental/__init__.py | 11 +++++++--- src/anndata/experimental/_dispatch_io.py | 5 +++-- 7 files changed, 48 insertions(+), 44 deletions(-) diff --git a/docs/api.md b/docs/api.md index a605afcc3..fa76d5119 100644 --- a/docs/api.md +++ b/docs/api.md @@ -140,8 +140,10 @@ Types used by the former: :toctree: generated/ experimental.IOSpec - experimental.InMemoryReadElem - experimental.InMemoryArrayOrScalarType + experimental.InMemoryElem + experimental.RWAbleDict + experimental.RWAbleList + experimental.RWAble experimental.Read experimental.Write experimental.ReadCallback diff --git a/docs/conf.py b/docs/conf.py index 1d0fc00be..8b91035dd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -140,10 +140,11 @@ def setup(app: Sphinx): } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", - InMemoryArrayOrScalarType=":data:`~anndata.experimental.InMemoryArrayOrScalarType`", - InMemoryReadElem=":data:`~anndata.experimental.InMemoryReadElem`", + RWAble=":data:`~anndata.experimental.RWAble`", + RWAbleDict=":data:`~anndata.experimental.RWAbleDict`", + RWAbleList=":data:`~anndata.experimental.RWAbleList`", **{ - f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryReadElem`" + f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryElem`" for v in ["In", "Co", "Contra"] }, ) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index a0a840154..855c6b89f 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -48,6 +48,8 @@ ArrayStorageType, GroupStorageType, InMemoryArrayOrScalarType, + RWAbleDict, + RWAbleList, ) from anndata.compat import SpArray @@ -330,9 +332,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping( - elem: GroupStorageType, *, _reader: Reader -) -> dict[str, InMemoryArrayOrScalarType]: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -341,7 +341,7 @@ def read_mapping( def write_mapping( f: GroupStorageType, k: str, - v: dict, + v: RWAbleDict, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -361,7 +361,7 @@ def write_mapping( def write_list( f: GroupStorageType, k: str, - elem: list, + elem: RWAbleList, *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 72ac18a0b..4e840fab5 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -4,22 +4,19 @@ from dataclasses import dataclass from functools import partial, singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING -import pandas as pd - -from anndata._core.anndata import AnnData from anndata._io.utils import report_read_key_on_error, report_write_key_on_error -from anndata._types 
import InMemoryArrayOrScalarType from anndata.compat import _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeAlias, TypeVar + from typing import Any, TypeVar from anndata._core.storage import StorageType from anndata._types import ( GroupStorageType, + InMemoryElem, Read, ReadCallback, Write, @@ -31,14 +28,6 @@ T = TypeVar("T") W = TypeVar("W", bound=_WriteInternal) -InMemoryReadElem: TypeAlias = Union[ - dict[str, InMemoryArrayOrScalarType], - InMemoryArrayOrScalarType, - AnnData, - pd.Categorical, - pd.api.extensions.ExtensionArray, -] - # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-" # TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that? @@ -276,7 +265,7 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - ) -> InMemoryReadElem: + ) -> InMemoryElem: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) @@ -307,7 +296,7 @@ def write_elem( self, store: GroupStorageType, k: str, - elem: Any, + elem: InMemoryElem, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), modifiers: frozenset[str] = frozenset(), @@ -347,7 +336,7 @@ def write_elem( ) -def read_elem(elem: StorageType) -> Any: +def read_elem(elem: StorageType) -> InMemoryElem: """ Read an element from a store. @@ -365,7 +354,7 @@ def read_elem(elem: StorageType) -> Any: def write_elem( store: GroupStorageType, k: str, - elem: Any, + elem: InMemoryElem, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> None: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index c2eff0d4e..03afe7a5b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -11,6 +11,8 @@ from numpy.typing import NDArray from scipy import sparse +from anndata._core.anndata import AnnData + from ._core.sparse_dataset import BaseCompressedSparseDataset from .compat import ( AwkArray, @@ -55,19 +57,28 @@ np.number, str, ] +RWAble: TypeAlias = InMemoryArrayOrScalarType | "RWAbleDict" | "RWAbleList" # noqa: TCH010 +RWAbleDict: TypeAlias = dict[str, RWAble] +RWAbleList: TypeAlias = list[RWAble] +InMemoryElem: TypeAlias = Union[ + RWAble, + AnnData, + pd.Categorical, + pd.api.extensions.ExtensionArray, +] -ArrayStorageType = Union[ZarrArray, H5Array] -GroupStorageType = Union[ZarrGroup, H5Group] -StorageType = Union[ArrayStorageType, GroupStorageType] +ArrayStorageType: TypeAlias = Union[ZarrArray, H5Array] +GroupStorageType: TypeAlias = Union[ZarrGroup, H5Group] +StorageType: TypeAlias = Union[ArrayStorageType, GroupStorageType] # NOTE: If you change these, be sure to update `autodoc_type_aliases` in docs/conf.py! ContravariantInMemoryType = TypeVar( - "ContravariantInMemoryType", bound="InMemoryReadElem", contravariant=True + "ContravariantInMemoryType", bound="InMemoryElem", contravariant=True ) CovariantInMemoryType = TypeVar( - "CovariantInMemoryType", bound="InMemoryReadElem", covariant=True + "CovariantInMemoryType", bound="InMemoryElem", covariant=True ) -InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryReadElem") +InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryElem") class _ReadInternal(Protocol[CovariantInMemoryType]): @@ -197,8 +208,3 @@ def __call__( Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... 
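# The noteworthy pattern in the `RWAble` lines above is the self-referential
# alias: the string forward references "RWAbleDict"/"RWAbleList" let the type
# mention itself before it is defined. A standalone sketch of the same idea,
# with a hypothetical `Json` alias and only the stdlib:
#
#     from typing import TypeAlias, Union
#
#     Json: TypeAlias = Union[
#         str, int, float, bool, None, dict[str, "Json"], list["Json"]
#     ]
#
# Spelling this with the `|` operator, e.g. `int | "Json"`, raises a TypeError
# at runtime because `|` is evaluated eagerly on the string operand; that is
# presumably what the later "(fix): use `Union`" commit addresses.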
- - -if TYPE_CHECKING: - # Needs to be at the end because Sphinx’s type import suffers from circular imports - from ._io.specs.registry import InMemoryReadElem diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 93bcf54d8..fc6b8331b 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,10 +4,13 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._io.specs.registry import InMemoryArrayOrScalarType, InMemoryReadElem from .._types import ( + InMemoryElem, Read, ReadCallback, + RWAble, + RWAbleDict, + RWAbleList, Write, WriteCallback, ) @@ -28,9 +31,11 @@ "sparse_dataset", "CSRDataset", "CSCDataset", - "InMemoryReadElem", - "InMemoryArrayOrScalarType", + "InMemoryElem", "Read", + "RWAbleDict", + "RWAbleList", + "RWAble", "Write", "ReadCallback", "WriteCallback", diff --git a/src/anndata/experimental/_dispatch_io.py b/src/anndata/experimental/_dispatch_io.py index b8950cf77..20b47baeb 100644 --- a/src/anndata/experimental/_dispatch_io.py +++ b/src/anndata/experimental/_dispatch_io.py @@ -9,6 +9,7 @@ from anndata._types import ( GroupStorageType, + InMemoryElem, ReadCallback, StorageType, WriteCallback, @@ -18,7 +19,7 @@ def read_dispatched( elem: StorageType, callback: ReadCallback, -) -> Any: +) -> InMemoryElem: """ Read elem, calling the callback at each sub-element. @@ -44,7 +45,7 @@ def read_dispatched( def write_dispatched( store: GroupStorageType, key: str, - elem: Any, + elem: InMemoryElem, callback: WriteCallback, *, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), From ea29cfa28aa9a2bf2e42d18bf6cf88a771fcf941 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 14:53:43 +0200 Subject: [PATCH 097/138] (fix): use `Union` --- src/anndata/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 03afe7a5b..9863b43ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -57,7 +57,7 @@ np.number, str, ] -RWAble: TypeAlias = InMemoryArrayOrScalarType | "RWAbleDict" | "RWAbleList" # noqa: TCH010 +RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 RWAbleDict: TypeAlias = dict[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ From f4ff2368554d0c8d179f0cc1add6f413d56ba39d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:11:47 +0200 Subject: [PATCH 098/138] (fix): add qualname override --- docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 8b91035dd..7bd05af9f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -137,6 +137,9 @@ def setup(app: Sphinx): "anndata._types.WriteCallback": "anndata.experimental.WriteCallback", "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", + "anndata._types.RWAble": "anndata.experimental.RWAble", + "anndata._types.RWAbleDict": "anndata.experimental.RWAbleDict", + "anndata._types.RWAbleList": "anndata.experimental.RWAbleList", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", From f50b286459a672048c4eec51d5b2f1765bbefc96 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:18:51 +0200 Subject: [PATCH 099/138] (fix): ignore dask and masked array --- docs/conf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 7bd05af9f..89727fd6d 100644 --- a/docs/conf.py +++ 
b/docs/conf.py @@ -101,6 +101,8 @@ ("py:class", "anndata.compat.DaskArray"), ("py:class", "anndata.compat.CupyArray"), ("py:class", "anndata.compat.CupySparseMatrix"), + ("py:class", "numpy.ma.core.MaskedArray"), + ("py:class", "dask.array.core.Array"), ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), From 712e0856c9c80e3456a111a6ed0de41329adf7f2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:24:22 +0200 Subject: [PATCH 100/138] (fix): ignore erroneous class warning --- docs/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/conf.py b/docs/conf.py index 89727fd6d..5a47af0f4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,6 +106,9 @@ ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), + # Something is picking these up as classes despite being aliases so this just suppresses the warning, but doesn't affect the build + ("py:class", "RWAbleDict"), + ("py:class", "RWAbleList"), ] suppress_warnings = [ "ref.citation", From 24dd18bb8069e1c0632a8fe66e5ffe2eb02c56fa Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:35:22 +0200 Subject: [PATCH 101/138] (fix): upgrade `scanpydoc` --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 17898ec55..43e5ab416 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,7 +74,7 @@ doc = [ "sphinx-toolbox", "sphinxext.opengraph", "nbsphinx", - "scanpydoc[theme,typehints] >=0.13.5", + "scanpydoc[theme,typehints] >=0.13.6", "zarr", "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks From 79d3fdc54c775b88f6ac9c65e83fed08049c5484 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 15:57:07 +0200 Subject: [PATCH 102/138] (fix): use `MutableMapping` instead of `dict` due to broken docstring --- src/anndata/_types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9863b43ef..9594a2ab8 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,6 +4,7 @@ from __future__ import annotations +from collections.abc import MutableMapping from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np @@ -58,7 +59,8 @@ str, ] RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 -RWAbleDict: TypeAlias = dict[str, RWAble] +# dict has a broken docstring: https://readthedocs.com/projects/icb-anndata/builds/2342910/ +RWAbleDict: TypeAlias = MutableMapping[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ RWAble, From d3bcddf8d9bd3c7b6a20bcc1fa380548c2dd0522 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 11 Jul 2024 16:12:25 +0200 Subject: [PATCH 103/138] Add data docs --- src/anndata/experimental/__init__.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index fc6b8331b..879f39a01 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -4,21 +4,26 @@ from anndata._io.specs import IOSpec, read_elem, write_elem from .._core.storage import StorageType -from .._types import ( - InMemoryElem, - Read, - ReadCallback, - RWAble, - RWAbleDict, - RWAbleList, - Write, - WriteCallback, -) +from .._types import InMemoryElem as _InMemoryElem +from .._types import Read, ReadCallback, Write, WriteCallback +from .._types import RWAble as _RWAble +from .._types import RWAbleDict as _RWAbleDict +from .._types import RWAbleList as _RWAbleList from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection from .pytorch import AnnLoader +# Sphinx can’t find data docstrings when objects are re-exported +InMemoryElem = _InMemoryElem +"""An in-memory element that can be read and written.""" +RWAble = _RWAble +"""A serializable object.""" +RWAbleDict = _RWAbleDict +"""A dict containing serializable objects.""" +RWAbleList = _RWAbleList +"""A list containing serializable objects.""" + __all__ = [ "AnnCollection", "AnnLoader", From 84fdc964bb2ec95bfcd8aee59a4eb4bb36972633 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 16:13:27 +0200 Subject: [PATCH 104/138] Revert "(fix): use `MutableMapping` instead of `dict` due to broken docstring" This reverts commit 79d3fdc54c775b88f6ac9c65e83fed08049c5484. --- src/anndata/_types.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9594a2ab8..9863b43ef 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,7 +4,6 @@ from __future__ import annotations -from collections.abc import MutableMapping from typing import TYPE_CHECKING, Protocol, TypeVar, Union import numpy as np @@ -59,8 +58,7 @@ str, ] RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 -# dict has a broken docstring: https://readthedocs.com/projects/icb-anndata/builds/2342910/ -RWAbleDict: TypeAlias = MutableMapping[str, RWAble] +RWAbleDict: TypeAlias = dict[str, RWAble] RWAbleList: TypeAlias = list[RWAble] InMemoryElem: TypeAlias = Union[ RWAble, From 2608bc306e4a89662d87a06323ba69103825719e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 11 Jul 2024 16:29:28 +0200 Subject: [PATCH 105/138] (fix): add clarification --- src/anndata/experimental/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 879f39a01..89d462a1f 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -16,13 +16,13 @@ # Sphinx can’t find data docstrings when objects are re-exported InMemoryElem = _InMemoryElem -"""An in-memory element that can be read and written.""" +"""An in-memory element that can be read and written, including an :class:`anndata.AnnData` objects.""" RWAble = _RWAble -"""A serializable object.""" +"""A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" RWAbleDict = _RWAbleDict -"""A dict containing serializable 
objects.""" +"""A dict containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" RWAbleList = _RWAbleList -"""A list containing serializable objects.""" +"""A list containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns`.""" __all__ = [ "AnnCollection", From e551e18e48ca4e367a6005043529a9557bcc376b Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 11 Jul 2024 16:29:09 +0200 Subject: [PATCH 106/138] Simplify --- docs/api.md | 2 -- docs/conf.py | 7 ------- src/anndata/_io/specs/methods.py | 9 ++++----- src/anndata/_types.py | 6 +++--- src/anndata/experimental/__init__.py | 8 -------- 5 files changed, 7 insertions(+), 25 deletions(-) diff --git a/docs/api.md b/docs/api.md index fa76d5119..36ebeac88 100644 --- a/docs/api.md +++ b/docs/api.md @@ -141,8 +141,6 @@ Types used by the former: experimental.IOSpec experimental.InMemoryElem - experimental.RWAbleDict - experimental.RWAbleList experimental.RWAble experimental.Read experimental.Write diff --git a/docs/conf.py b/docs/conf.py index 5a47af0f4..d83861d13 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -106,9 +106,6 @@ ("py:class", "awkward.highlevel.Array"), ("py:class", "anndata._core.sparse_dataset.BaseCompressedSparseDataset"), ("py:obj", "numpy._typing._array_like._ScalarType_co"), - # Something is picking these up as classes despite being aliases so this just suppresses the warning, but doesn't affect the build - ("py:class", "RWAbleDict"), - ("py:class", "RWAbleList"), ] suppress_warnings = [ "ref.citation", @@ -143,14 +140,10 @@ def setup(app: Sphinx): "anndata._types.Read": "anndata.experimental.Read", "anndata._types.Write": "anndata.experimental.Write", "anndata._types.RWAble": "anndata.experimental.RWAble", - "anndata._types.RWAbleDict": "anndata.experimental.RWAbleDict", - "anndata._types.RWAbleList": "anndata.experimental.RWAbleList", } autodoc_type_aliases = dict( NDArray=":data:`~numpy.typing.NDArray`", RWAble=":data:`~anndata.experimental.RWAble`", - RWAbleDict=":data:`~anndata.experimental.RWAbleDict`", - RWAbleList=":data:`~anndata.experimental.RWAbleList`", **{ f"{v}variantInMemoryType": ":data:`~anndata.experimental.InMemoryElem`" for v in ["In", "Co", "Contra"] diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 855c6b89f..48106b85d 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -48,8 +48,7 @@ ArrayStorageType, GroupStorageType, InMemoryArrayOrScalarType, - RWAbleDict, - RWAbleList, + RWAble, ) from anndata.compat import SpArray @@ -332,7 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -341,7 +340,7 @@ def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> RWAbleDict: def write_mapping( f: GroupStorageType, k: str, - v: RWAbleDict, + v: dict[str, RWAble], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), @@ -361,7 +360,7 @@ def write_mapping( def write_list( f: GroupStorageType, k: str, - elem: RWAbleList, + elem: list[RWAble], *, _writer: Writer, dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), diff --git a/src/anndata/_types.py 
b/src/anndata/_types.py index 9863b43ef..e0b663f16 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -57,9 +57,9 @@ np.number, str, ] -RWAble: TypeAlias = Union[InMemoryArrayOrScalarType, "RWAbleDict", "RWAbleList"] # noqa: TCH010 -RWAbleDict: TypeAlias = dict[str, RWAble] -RWAbleList: TypeAlias = list[RWAble] +RWAble: TypeAlias = Union[ + InMemoryArrayOrScalarType, dict[str, "RWAble"], list["RWAble"] +] # noqa: TCH010 InMemoryElem: TypeAlias = Union[ RWAble, AnnData, diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 89d462a1f..904dd5807 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -7,8 +7,6 @@ from .._types import InMemoryElem as _InMemoryElem from .._types import Read, ReadCallback, Write, WriteCallback from .._types import RWAble as _RWAble -from .._types import RWAbleDict as _RWAbleDict -from .._types import RWAbleList as _RWAbleList from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk from .multi_files import AnnCollection @@ -19,10 +17,6 @@ """An in-memory element that can be read and written, including an :class:`anndata.AnnData` objects.""" RWAble = _RWAble """A serializable object, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" -RWAbleDict = _RWAbleDict -"""A dict containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns` or `obsm`.""" -RWAbleList = _RWAbleList -"""A list containing serializable objects, excluding :class:`anndata.AnnData` objects i.e., something that can be stored in `uns`.""" __all__ = [ "AnnCollection", @@ -38,8 +32,6 @@ "CSCDataset", "InMemoryElem", "Read", - "RWAbleDict", - "RWAbleList", "RWAble", "Write", "ReadCallback", From 1ffe43ecb72bea57e4bbd48c97d64f9eaa3c2540 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 12 Jul 2024 17:51:43 +0200 Subject: [PATCH 107/138] (fix): remove double `dask` intersphinx --- docs/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index ae1e19cf9..f943fbb60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -128,7 +128,6 @@ def setup(app: Sphinx): scipy=("https://docs.scipy.org/doc/scipy/", None), sklearn=("https://scikit-learn.org/stable/", None), zarr=("https://zarr.readthedocs.io/en/stable/", None), - dask=("https://docs.dask.org/en/stable/", None), xarray=("https://xarray.pydata.org/en/stable/", None), dask=("https://docs.dask.org/en/stable/", None), ) From f9df5bc60acdaf5a28e4596b0f200d3c884fb1ba Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 12 Jul 2024 17:54:23 +0200 Subject: [PATCH 108/138] (fix): remove `_types.DaskArray` from type checking block --- src/anndata/_io/specs/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index e23804919..8c5913850 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -17,7 +17,6 @@ from anndata._core.storage import StorageType from anndata._types import ( - DaskArray, GroupStorageType, InMemoryElem, Read, From a85da39adbe1f783d3d6a1c08497437d8626d4b2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 15 Jul 2024 11:30:52 +0200 Subject: [PATCH 109/138] (refactor): use `block_info` for resolving fetch location --- src/anndata/_io/specs/lazy_methods.py | 52 +++++++++------------------ 1 file changed, 17 insertions(+), 35 deletions(-) diff --git 
a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 52d29eb5e..2af600823 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -4,7 +4,7 @@ from functools import singledispatch from pathlib import Path, PurePosixPath from types import MappingProxyType -from typing import TYPE_CHECKING, overload +from typing import TYPE_CHECKING import h5py import numpy as np @@ -17,41 +17,11 @@ if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any, Literal + from typing import Any, Literal, Union from .registry import Reader -@overload -def make_block_indexer( - *, - is_csc: Literal[True], - stride: int, - shape: tuple[int, int], - block_id: tuple[int, int], -) -> tuple[slice, slice]: ... -@overload -def make_block_indexer( - *, - is_csc: Literal[False], - stride: int, - shape: tuple[int, int], - block_id: tuple[int, int], -) -> tuple[slice]: ... - - -def make_block_indexer( - *, is_csc: bool, stride: int, shape: tuple[int, int], block_id: tuple[int, int] -) -> tuple[slice, slice] | tuple[slice]: - index1d = slice( - block_id[is_csc] * stride, - min((block_id[is_csc] * stride) + stride, shape[is_csc]), - ) - if is_csc: - return (slice(None), index1d) - return (index1d,) - - @contextmanager def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): if not isinstance(path_or_group, Path): @@ -118,13 +88,25 @@ def read_sparse_as_dask( raise ValueError("Only the major axis can be chunked") stride = chunks[int(is_csc)] - def make_dask_chunk(block_id: tuple[int, int]): + def make_dask_chunk( + block_info: Union[ # noqa: UP007 + dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 + ], + None, + ] = None, + ): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 + if block_info is None: + raise ValueError("Block info is required") with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - index = make_block_indexer( - is_csc=is_csc, stride=stride, shape=shape, block_id=block_id + array_location = block_info[None]["array-location"] + index = ( + slice(array_location[0][0], array_location[0][1]), + slice(array_location[1][0], array_location[1][1]), ) chunk = mtx[index] return chunk From 899184f70013c3c43a5970e5c4d078828f344589 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 15 Jul 2024 14:08:20 +0200 Subject: [PATCH 110/138] (fix): dtype for reading --- src/anndata/_io/specs/lazy_methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 2af600823..9349b491c 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -122,7 +122,7 @@ def make_dask_chunk( make_dask_chunk, dtype=dtype, chunks=chunk_layout, - meta=memory_format((0, 0), dtype=np.float32), + meta=memory_format((0, 0), dtype=dtype), ) return da_mtx From efb70ec893d01250e6e527d7fa0bc16a83131f27 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 16 Jul 2024 14:52:37 +0200 Subject: [PATCH 111/138] (fix): ignore import cycle problem (why??) 
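An aside on the `block_info` mechanism adopted in [PATCH 109/138] above: when a `map_blocks` function accepts a `block_info` keyword, dask fills it with a dict whose `None` key describes the output block, including its "array-location" (start/stop offsets per axis), so a chunk loader no longer has to derive offsets from `block_id` and a stride by hand. A minimal runnable sketch, independent of anndata, with a stand-in loader:

import dask.array as da
import numpy as np

def load_chunk(block_info=None):
    # offsets of this output block along each axis
    (r0, r1), (c0, c1) = block_info[None]["array-location"]
    # stand-in for fetching rows r0:r1 / columns c0:c1 from storage
    return np.full((r1 - r0, c1 - c0), r0, dtype=np.float64)

arr = da.map_blocks(
    load_chunk,
    chunks=((100, 100, 50), (200,)),  # a 250 x 200 array, chunked along rows
    dtype=np.float64,
    meta=np.empty((0, 0), dtype=np.float64),
)
assert arr.shape == (250, 200)
arr.compute()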
---
 docs/conf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/conf.py b/docs/conf.py
index f943fbb60..f59e67f4f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -110,6 +110,7 @@
 suppress_warnings = [
     "ref.citation",
     "myst.header",  # https://github.com/executablebooks/MyST-Parser/issues/262
+    "autosummary.import_cycle",  # https://readthedocs.com/projects/icb-anndata/builds/2349021/
 ]

From 118f43c2fe4f948cb6067c14850ecdce9efe94d7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 16 Jul 2024 14:59:42 +0200
Subject: [PATCH 112/138] (fix): add issue

---
 docs/conf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/conf.py b/docs/conf.py
index f59e67f4f..f7fe9d1be 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -110,7 +110,7 @@
 suppress_warnings = [
     "ref.citation",
     "myst.header",  # https://github.com/executablebooks/MyST-Parser/issues/262
-    "autosummary.import_cycle",  # https://readthedocs.com/projects/icb-anndata/builds/2349021/
+    "autosummary.import_cycle",  # https://github.com/sphinx-doc/sphinx/issues/12589
 ]

From f742a0a8cbce3cc75f517c0a809964ea02dd834c Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 18 Jul 2024 10:00:02 +0200
Subject: [PATCH 113/138] (fix): subclass `Reader` to remove `dataset_kwargs`

---
 src/anndata/_io/h5ad.py               |  2 +-
 src/anndata/_io/specs/lazy_methods.py | 23 ++++++--------------
 src/anndata/_io/specs/methods.py      | 17 ---------------
 src/anndata/_io/specs/registry.py     | 31 +++++++++++++++++----------
 src/anndata/_io/zarr.py               |  2 +-
 src/anndata/experimental/merge.py     |  2 +-
 tests/test_backed_sparse.py           |  2 +-
 tests/test_io_dispatched.py           | 26 +++++-----------------
 8 files changed, 36 insertions(+), 69 deletions(-)

diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py
index 653b96f54..2cd2fca48 100644
--- a/src/anndata/_io/h5ad.py
+++ b/src/anndata/_io/h5ad.py
@@ -233,7 +233,7 @@ def read_h5ad(
     with h5py.File(filename, "r") as f:

-        def callback(func, elem_name: str, elem, dataset_kwargs, iospec):
+        def callback(func, elem_name: str, elem, iospec):
             if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
                 return AnnData(
                     **{

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index 9349b491c..f5bb2173c 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -3,7 +3,6 @@
 from contextlib import contextmanager
 from functools import singledispatch
 from pathlib import Path, PurePosixPath
-from types import MappingProxyType
 from typing import TYPE_CHECKING

 import h5py
@@ -16,8 +15,7 @@
 from .registry import _LAZY_REGISTRY, IOSpec

 if TYPE_CHECKING:
-    from collections.abc import Mapping
-    from typing import Any, Literal, Union
+    from typing import Literal, Union

     from .registry import Reader

@@ -67,9 +65,7 @@ def _(x):
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
 @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
 def read_sparse_as_dask(
-    elem: H5Group | ZarrGroup,
-    _reader: Reader,
-    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
+    elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...]
| None = None ): import dask.array as da @@ -79,7 +75,6 @@ def read_sparse_as_dask( dtype = elem["data"].dtype is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" - chunks = dataset_kwargs.get("chunks", None) stride: int = _DEFAULT_STRIDE if chunks is not None: if len(chunks) != 2: @@ -129,9 +124,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None ): import dask.array as da @@ -139,8 +132,8 @@ def read_h5_array( elem_name = elem.name shape = tuple(elem.shape) dtype = elem.dtype - chunks: tuple[int, ...] = dataset_kwargs.get( - "chunks", (_DEFAULT_STRIDE,) * len(shape) + chunks: tuple[int, ...] = ( + chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) def make_dask_chunk(block_id: tuple[int, int]): @@ -166,11 +159,9 @@ def make_dask_chunk(block_id: tuple[int, int]): @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, - _reader: Reader, - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), + elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None ): - chunks: tuple[int, ...] = dataset_kwargs.get("chunks", elem.chunks) + chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da return da.from_zarr(elem, chunks=chunks) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index e04534c71..719c9975d 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -126,7 +126,6 @@ def read_basic( elem: H5File | H5Group | H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -151,7 +150,6 @@ def read_basic_zarr( elem: ZarrGroup | ZarrArray, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -299,7 +297,6 @@ def read_anndata( elem: GroupStorageType | H5File, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> AnnData: d = {} for k in [ @@ -346,7 +343,6 @@ def read_mapping( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -460,7 +456,6 @@ def read_array( elem: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> npt.NDArray: return elem[()] @@ -482,7 +477,6 @@ def read_string_array( d: H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ): return read_array(d.asstr(), _reader=_reader) @@ -568,7 +562,6 @@ def read_recarray( d: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype @@ -785,7 +778,6 @@ def read_sparse( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -835,7 +827,6 @@ def read_awkward( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> AwkArray: from 
anndata.compat import awkward as ak @@ -909,7 +900,6 @@ def read_dataframe( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") @@ -955,7 +945,6 @@ def read_dataframe_0_1_0( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") @@ -1031,7 +1020,6 @@ def read_categorical( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), @@ -1087,7 +1075,6 @@ def read_nullable_integer( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -1103,7 +1090,6 @@ def read_nullable_boolean( elem: GroupStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1124,7 +1110,6 @@ def read_scalar( elem: ArrayStorageType, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> np.number: return elem[()] @@ -1176,7 +1161,6 @@ def read_hdf5_string( elem: H5Array, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> str: return elem.asstr()[()] @@ -1186,7 +1170,6 @@ def read_zarr_string( elem: ZarrArray, *, _reader: Reader, - dataset_kwargs: MappingProxyType = MappingProxyType({}), ) -> str: return str(elem[()]) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 8c5913850..62ba5564d 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -1,6 +1,5 @@ from __future__ import annotations -import inspect import warnings from collections.abc import Mapping from dataclasses import dataclass @@ -275,16 +274,28 @@ def read_elem( iospec = get_spec(elem) read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) if self.callback is None: - return read_func(elem, dataset_kwargs=dataset_kwargs) - if "dataset_kwargs" not in inspect.getfullargspec(self.callback)[0]: + return read_func(elem) + return self.callback(read_func, elem.name, elem, iospec=iospec) + + +class DaskReader(Reader): + @report_read_key_on_error + def read_elem( + self, + elem: StorageType, + modifiers: frozenset[str] = frozenset(), + chunks: tuple[int, ...] | None = None, + ) -> InMemoryElem: + """Read an element from a store. See exported function for more details.""" + + iospec = get_spec(elem) + read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + if self.callback is None: warnings.warn( - "Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.", + "Dask reading does not use a callback. 
Ignoring callback.", stacklevel=2, ) - return self.callback(read_func, elem.name, elem, iospec=iospec) - return self.callback( - read_func, elem.name, elem, dataset_kwargs=dataset_kwargs, iospec=iospec - ) + return read_func(elem, chunks=chunks) class Writer: @@ -385,9 +396,7 @@ def read_elem_as_dask( ------- DaskArray """ - return Reader(_LAZY_REGISTRY).read_elem( - elem, dataset_kwargs={"chunks": chunks} if chunks is not None else {} - ) + return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) def write_elem( diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 9d6f759ff..0e015244a 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -66,7 +66,7 @@ def read_zarr(store: str | Path | MutableMapping | zarr.Group) -> AnnData: f = zarr.open(store, mode="r") # Read with handling for backwards compat - def callback(func, elem_name: str, elem, dataset_kwargs, iospec): + def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{ diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 8882f3c63..9690420ec 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -134,7 +134,7 @@ def read_as_backed(group: ZarrGroup | H5Group): BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered. """ - def callback(func, elem_name: str, elem, dataset_kwargs, iospec): + def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type in SPARSE_MATRIX: return sparse_dataset(elem) elif iospec.encoding_type in EAGER_TYPES: diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index f5e593273..4e5f5445d 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -70,7 +70,7 @@ def read_zarr_backed(path): f = zarr.open(path, mode="r") # Read with handling for backwards compat - def callback(func, elem_name, elem, iospec, dataset_kwargs): + def callback(func, elem_name, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( **{k: read_dispatched(v, callback) for k, v in elem.items()} diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 395e942c3..76e17ad2d 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -3,7 +3,6 @@ import re import h5py -import pytest import zarr from scipy import sparse @@ -19,7 +18,7 @@ def test_read_dispatched_w_regex(): - def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs): + def read_only_axis_dfs(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata": return func(elem) elif re.match(r"^/((obs)|(var))?(/.*)?$", elem_name): @@ -41,7 +40,7 @@ def read_only_axis_dfs(func, elem_name: str, elem, iospec, dataset_kwargs): def test_read_dispatched_dask(): import dask.array as da - def read_as_dask_array(func, elem_name: str, elem, iospec, dataset_kwargs): + def read_as_dask_array(func, elem_name: str, elem, iospec): if iospec.encoding_type in { "dataframe", "csr_matrix", @@ -78,7 +77,7 @@ def test_read_dispatched_null_case(): expected = read_elem(z) - def callback(read_func, elem_name, x, dataset_kwargs, iospec): + def callback(read_func, elem_name, x, iospec): return read_elem(x) actual = read_dispatched(z, callback) @@ -86,21 +85,6 @@ def callback(read_func, elem_name, x, dataset_kwargs, iospec): assert_equal(expected, actual) -def test_read_dispatched_warns_with_no_dataset_kwargs(): - adata = gen_adata((100, 100)) - z = zarr.group() 
-    write_elem(z, "/", adata)
-
-    def callback(read_func, elem_name, x, iospec):
-        return read_elem(x)
-
-    with pytest.warns(
-        UserWarning,
-        match="Callback does not accept dataset_kwargs. Ignoring dataset_kwargs.",
-    ):
-        read_dispatched(z, callback)
-
-
 def test_write_dispatched_chunks():
     from itertools import chain, repeat

@@ -182,11 +166,11 @@ def zarr_writer(func, store, k, elem, dataset_kwargs, iospec):
         zarr_write_keys.append(k)
         func(store, k, elem, dataset_kwargs=dataset_kwargs)

-    def h5ad_reader(func, elem_name: str, elem, dataset_kwargs, iospec):
+    def h5ad_reader(func, elem_name: str, elem, iospec):
         h5ad_read_keys.append(elem_name)
         return func(elem)

-    def zarr_reader(func, elem_name: str, elem, dataset_kwargs, iospec):
+    def zarr_reader(func, elem_name: str, elem, iospec):
         zarr_read_keys.append(elem_name)
         return func(elem)

From ae68731385759feae5510dfe66b82688977f5ea2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 18 Jul 2024 10:00:31 +0200
Subject: [PATCH 114/138] (fix): add message to error

---
 src/anndata/_io/specs/lazy_methods.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py
index f5bb2173c..8fc251b98 100644
--- a/src/anndata/_io/specs/lazy_methods.py
+++ b/src/anndata/_io/specs/lazy_methods.py
@@ -80,7 +80,10 @@ def read_sparse_as_dask(
         if len(chunks) != 2:
             raise ValueError("`chunks` must be a tuple of two integers")
         if chunks[int(not is_csc)] != shape[int(not is_csc)]:
-            raise ValueError("Only the major axis can be chunked")
+            raise ValueError(
+                "Only the major axis can be chunked. "
+                f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}"
+            )
     stride = chunks[int(is_csc)]

From f5e7760aba721c17b952f031c20e8b8a40a4a045 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 18 Jul 2024 10:01:03 +0200
Subject: [PATCH 115/138] Update tests/test_io_elementwise.py

Co-authored-by: Isaac Virshup

---
 tests/test_io_elementwise.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py
index ac67dd215..80da79014 100644
--- a/tests/test_io_elementwise.py
+++ b/tests/test_io_elementwise.py
@@ -272,12 +272,11 @@ def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
 def test_read_lazy_h5_cluster(sparse_format, tmp_path):
     import dask.distributed as dd

-    file = h5py.File(tmp_path / "test.h5", "w")
-    store = file["/"]
-    arr_store = create_sparse_store(sparse_format, store)
-    X_dask_from_disk = read_elem_as_dask(arr_store["X"])
-    X_from_disk = read_elem(arr_store["X"])
-    file.close()
+    with h5py.File(tmp_path / "test.h5", "w") as file:
+        store = file["/"]
+        arr_store = create_sparse_store(sparse_format, store)
+        X_dask_from_disk = read_elem_as_dask(arr_store["X"])
+        X_from_disk = read_elem(arr_store["X"])
     with (
         dd.LocalCluster(n_workers=1, threads_per_worker=1) as cluster,
         dd.Client(cluster) as _client,

From 96b13a34645f249348df93aea1fc91b02e8365a2 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 18 Jul 2024 13:00:11 +0200
Subject: [PATCH 116/138] (fix): correct `self.callback` check

---
 src/anndata/_io/specs/registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py
index 62ba5564d..7ad43fe3c 100644
--- a/src/anndata/_io/specs/registry.py
+++ b/src/anndata/_io/specs/registry.py
@@ -290,7 +290,7 @@ def read_elem(
         iospec = get_spec(elem)
         read_func =
self.registry.get_read(type(elem), iospec, modifiers, reader=self) - if self.callback is None: + if self.callback is not None: warnings.warn( "Dask reading does not use a callback. Ignoring callback.", stacklevel=2, From 9c68e365414bc4f605db21c38167681fcf3e32b6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:22:07 +0200 Subject: [PATCH 117/138] (fix): erroneous diffs --- src/anndata/_io/specs/methods.py | 92 ++++++-------------------------- 1 file changed, 17 insertions(+), 75 deletions(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 719c9975d..1bf2d13a3 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -123,9 +123,7 @@ def wrapper( @_REGISTRY.register_read(H5Group, IOSpec("", "")) @_REGISTRY.register_read(H5Array, IOSpec("", "")) def read_basic( - elem: H5File | H5Group | H5Array, - *, - _reader: Reader, + elem: H5File | H5Group | H5Array, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import h5ad @@ -147,9 +145,7 @@ def read_basic( @_REGISTRY.register_read(ZarrGroup, IOSpec("", "")) @_REGISTRY.register_read(ZarrArray, IOSpec("", "")) def read_basic_zarr( - elem: ZarrGroup | ZarrArray, - *, - _reader: Reader, + elem: ZarrGroup | ZarrArray, *, _reader: Reader ) -> dict[str, InMemoryArrayOrScalarType] | npt.NDArray | sparse.spmatrix | SpArray: from anndata._io import zarr @@ -293,11 +289,7 @@ def write_anndata( @_REGISTRY.register_read(H5File, IOSpec("raw", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("anndata", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("raw", "0.1.0")) -def read_anndata( - elem: GroupStorageType | H5File, - *, - _reader: Reader, -) -> AnnData: +def read_anndata(elem: GroupStorageType | H5File, *, _reader: Reader) -> AnnData: d = {} for k in [ "X", @@ -339,11 +331,7 @@ def write_raw( @_REGISTRY.register_read(H5Group, IOSpec("dict", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dict", "0.1.0")) -def read_mapping( - elem: GroupStorageType, - *, - _reader: Reader, -) -> dict[str, RWAble]: +def read_mapping(elem: GroupStorageType, *, _reader: Reader) -> dict[str, RWAble]: return {k: _reader.read_elem(v) for k, v in elem.items()} @@ -452,11 +440,7 @@ def write_basic_dask_h5( @_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) -def read_array( - elem: ArrayStorageType, - *, - _reader: Reader, -) -> npt.NDArray: +def read_array(elem: ArrayStorageType, *, _reader: Reader) -> npt.NDArray: return elem[()] @@ -473,11 +457,7 @@ def read_zarr_array_partial(elem, *, items=None, indices=(slice(None, None))): # arrays of strings @_REGISTRY.register_read(H5Array, IOSpec("string-array", "0.2.0")) -def read_string_array( - d: H5Array, - *, - _reader: Reader, -): +def read_string_array(d: H5Array, *, _reader: Reader): return read_array(d.asstr(), _reader=_reader) @@ -558,11 +538,7 @@ def _to_hdf5_vlen_strings(value: np.ndarray) -> np.ndarray: @_REGISTRY.register_read(H5Array, IOSpec("rec-array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("rec-array", "0.2.0")) -def read_recarray( - d: ArrayStorageType, - *, - _reader: Reader, -) -> np.recarray | npt.NDArray: +def read_recarray(d: ArrayStorageType, *, _reader: Reader) -> np.recarray | npt.NDArray: value = d[()] dtype = value.dtype value = _from_fixed_length_strings(value) @@ -775,9 +751,7 @@ def 
chunk_slice(start: int, stop: int) -> tuple[slice | None, slice | None]: @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> sparse.spmatrix | SpArray: return sparse_dataset(elem).to_memory() @@ -823,11 +797,7 @@ def write_awkward( @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) -def read_awkward( - elem: GroupStorageType, - *, - _reader: Reader, -) -> AwkArray: +def read_awkward(elem: GroupStorageType, *, _reader: Reader) -> AwkArray: from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") @@ -896,11 +866,7 @@ def write_dataframe( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.2.0")) -def read_dataframe( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.DataFrame: +def read_dataframe(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = list(_read_attr(elem.attrs, "column-order")) idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -941,11 +907,7 @@ def read_dataframe_partial( @_REGISTRY.register_read(H5Group, IOSpec("dataframe", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("dataframe", "0.1.0")) -def read_dataframe_0_1_0( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.DataFrame: +def read_dataframe_0_1_0(elem: GroupStorageType, *, _reader: Reader) -> pd.DataFrame: columns = _read_attr(elem.attrs, "column-order") idx_key = _read_attr(elem.attrs, "_index") df = pd.DataFrame( @@ -1016,11 +978,7 @@ def write_categorical( @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical( - elem: GroupStorageType, - *, - _reader: Reader, -) -> pd.Categorical: +def read_categorical(elem: GroupStorageType, *, _reader: Reader) -> pd.Categorical: return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), @@ -1072,9 +1030,7 @@ def write_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0")) def read_nullable_integer( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.IntegerArray( @@ -1087,9 +1043,7 @@ def read_nullable_integer( @_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0")) def read_nullable_boolean( - elem: GroupStorageType, - *, - _reader: Reader, + elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: if "mask" in elem: return pd.arrays.BooleanArray( @@ -1106,11 +1060,7 @@ def read_nullable_boolean( @_REGISTRY.register_read(H5Array, IOSpec("numeric-scalar", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("numeric-scalar", "0.2.0")) -def read_scalar( - elem: ArrayStorageType, - *, - _reader: Reader, -) -> np.number: +def read_scalar(elem: ArrayStorageType, *, _reader: Reader) -> np.number: return elem[()] @@ -1157,20 +1107,12 @@ def write_hdf5_scalar( @_REGISTRY.register_read(H5Array, IOSpec("string", "0.2.0")) -def read_hdf5_string( - elem: H5Array, - *, 
- _reader: Reader, -) -> str: +def read_hdf5_string(elem: H5Array, *, _reader: Reader) -> str: return elem.asstr()[()] @_REGISTRY.register_read(ZarrArray, IOSpec("string", "0.2.0")) -def read_zarr_string( - elem: ZarrArray, - *, - _reader: Reader, -) -> str: +def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str: return str(elem[()]) From 410aeda2df1841bfb85bfc74233b6620137ced9e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:22:48 +0200 Subject: [PATCH 118/138] (fix): extra `read_elem` `dataset_kwargs` --- src/anndata/_io/specs/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 7ad43fe3c..c749c5d0b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -267,7 +267,6 @@ def read_elem( self, elem: StorageType, modifiers: frozenset[str] = frozenset(), - dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ) -> InMemoryElem: """Read an element from a store. See exported function for more details.""" From 31a30c4327433bfba354e913fcfed8e859a840eb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:26:20 +0200 Subject: [PATCH 119/138] (fix): remove more `dataset_kwargs` nonsense --- src/anndata/_types.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 4b45e54e7..9fbcf57b2 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -29,7 +29,6 @@ if TYPE_CHECKING: from collections.abc import Mapping - from types import MappingProxyType from typing import Any, TypeAlias from ._io.specs.registry import IOSpec, Reader, Writer @@ -95,8 +94,6 @@ class Read(Protocol[CovariantInMemoryType]): def __call__( self, elem: StorageType | H5File, - *, - dataset_kwargs: MappingProxyType, ) -> CovariantInMemoryType: """Low-level reading function for an element. @@ -104,9 +101,6 @@ def __call__( ---------- elem The element to read from. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. - Returns ------- The element read from the store. @@ -146,7 +140,7 @@ def __call__( v The element to write out. dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. + Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`zarr:index`. """ ... @@ -160,7 +154,6 @@ def __call__( elem: StorageType, *, iospec: IOSpec, - dataset_kwargs: MappingProxyType, ) -> InvariantInMemoryType: """ Callback used in :func:`anndata.experimental.read_dispatched` to customize reading an element from a store. @@ -175,8 +168,6 @@ def __call__( The element to read from. iospec Internal AnnData encoding specification for the element. - dataset_kwargs - Keyword arguments to be passed to a library-level io function, like `chunks` for :doc:`dask:index`. 
Returns ------- From 80fe8cb32c7c487e252cf7338f6b19deeceeb981 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:27:48 +0200 Subject: [PATCH 120/138] (chore): add docs --- docs/api.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api.md b/docs/api.md index 36ebeac88..92139fe06 100644 --- a/docs/api.md +++ b/docs/api.md @@ -121,6 +121,7 @@ Low level methods for reading and writing elements of an `AnnData` object to a s experimental.read_elem experimental.write_elem + experimental.read_elem_as_dask ``` Utilities for customizing the IO process: From b3142487bdd87f579e9be75f2a2aab80b21b4e91 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:35:44 +0200 Subject: [PATCH 121/138] (fix): use `block_info` for dense --- src/anndata/_io/specs/lazy_methods.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 8fc251b98..823254bac 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -139,13 +139,22 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - def make_dask_chunk(block_id: tuple[int, int]): + def make_dask_chunk( + block_info: Union[ # noqa: UP007 + dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 + ], + None, + ] = None, + ): + if block_info is None: + raise ValueError("Block info is required") with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): - start = block_id[i] * chunks[i] - stop = min(((block_id[i] * chunks[i]) + chunks[i]), shape[i]) - idx += (slice(start, stop),) + array_location = block_info[None]["array-location"][i] + idx += (slice(array_location[0], array_location[1]),) return f[idx] chunk_layout = tuple( From 02d47352110806d6a605bfdc1599d1243a941cf9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:39:06 +0200 Subject: [PATCH 122/138] (fix): more erroneous diffs --- tests/test_io_dispatched.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_io_dispatched.py b/tests/test_io_dispatched.py index 76e17ad2d..833b23e83 100644 --- a/tests/test_io_dispatched.py +++ b/tests/test_io_dispatched.py @@ -76,11 +76,7 @@ def test_read_dispatched_null_case(): write_elem(z, "/", adata) expected = read_elem(z) - - def callback(read_func, elem_name, x, iospec): - return read_elem(x) - - actual = read_dispatched(z, callback) + actual = read_dispatched(z, lambda _, __, x, **___: read_elem(x)) assert_equal(expected, actual) From 6e5534a639d59c404a434c5d07144fad5184689d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 18 Jul 2024 13:44:07 +0200 Subject: [PATCH 123/138] (fix): use context again --- tests/test_io_elementwise.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 80da79014..5dd1791d1 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -312,13 +312,15 @@ def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): def test_read_lazy_bad_chunk_kwargs(tmp_path): arr_type = "csr" - file = h5py.File(tmp_path / "test.h5", "w") - store = file["/"] - arr_store = create_sparse_store(arr_type, store) - with pytest.raises(ValueError, match=r"`chunks` must be a tuple of two integers"): - read_elem_as_dask(arr_store["X"], chunks=(SIZE,)) - with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): 
- read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10)) + with h5py.File(tmp_path / "test.h5", "w") as file: + store = file["/"] + arr_store = create_sparse_store(arr_type, store) + with pytest.raises( + ValueError, match=r"`chunks` must be a tuple of two integers" + ): + read_elem_as_dask(arr_store["X"], chunks=(SIZE,)) + with pytest.raises(ValueError, match=r"Only the major axis can be chunked"): + read_elem_as_dask(arr_store["X"], chunks=(SIZE, 10)) @pytest.mark.parametrize("sparse_format", ["csr", "csc"]) From d26cfe81a91639e261ec9f64048e60717fbb47d7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 11:32:27 +0200 Subject: [PATCH 124/138] (fix): change size by dimension in tests --- tests/test_io_elementwise.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index 5dd1791d1..62284a0c9 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -74,13 +74,15 @@ def sparse_format(request): def create_dense_store(store, n_dims: int = 2): - X = np.random.randn(*((SIZE,) * n_dims)) + X = np.random.randn(*[SIZE * (i + 1) for i in range(n_dims)]) write_elem(store, "X", X) return store -def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G: +def create_sparse_store( + sparse_format: Literal["csc", "csr"], store: G, shape=(SIZE, SIZE * 2) +) -> G: """Returns a store Parameters @@ -95,14 +97,15 @@ def create_sparse_store(sparse_format: Literal["csc", "csr"], store: G) -> G: import dask.array as da X = sparse.random( - SIZE, - SIZE, + shape[0], + shape[1], format=sparse_format, density=0.01, random_state=np.random.default_rng(), ) X_dask = da.from_array( - X, chunks=(100 if format == "csr" else SIZE, SIZE if format == "csr" else 100) + X, + chunks=(100 if format == "csr" else SIZE, SIZE * 2 if format == "csr" else 100), ) write_elem(store, "X", X) @@ -233,11 +236,18 @@ def test_read_lazy_2d_dask(sparse_format, store): assert_equal(X_from_disk, X_dask_from_disk) random_int_indices = np.random.randint(0, SIZE, (SIZE // 10,)) random_int_indices.sort() - random_bool_mask = np.random.randn(SIZE) > 0 index_slice = slice(0, SIZE // 10) - for index in [random_int_indices, index_slice, random_bool_mask]: + for index in [random_int_indices, index_slice]: assert_equal(X_from_disk[index, :], X_dask_from_disk[index, :]) assert_equal(X_from_disk[:, index], X_dask_from_disk[:, index]) + random_bool_mask = np.random.randn(SIZE) > 0 + assert_equal( + X_from_disk[random_bool_mask, :], X_dask_from_disk[random_bool_mask, :] + ) + random_bool_mask = np.random.randn(SIZE * 2) > 0 + assert_equal( + X_from_disk[:, random_bool_mask], X_dask_from_disk[:, random_bool_mask] + ) assert arr_store["X_dask/indptr"].dtype == np.int64 assert arr_store["X_dask/indices"].dtype == np.int64 @@ -289,7 +299,7 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): [ ("dense", (100, 100)), ("csc", (SIZE, 10)), - ("csr", (10, SIZE)), + ("csr", (10, SIZE * 2)), ("csc", None), ("csr", None), ], @@ -304,8 +314,9 @@ def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): if chunks is not None: assert X_dask_from_disk.chunksize == chunks else: + minor_index = int(arr_type == "csr") # assert that sparse chunks are set correctly by default - assert X_dask_from_disk.chunksize[bool(arr_type == "csr")] == SIZE + assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk) From 
94e43a33bff09af3a4ef01d09a7ef9f287934d4e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:23:24 +0200 Subject: [PATCH 125/138] (refactor): clean up `get_elem_name` --- src/anndata/_core/file_backing.py | 17 ++++++++++++++++- src/anndata/_io/specs/lazy_methods.py | 21 +++------------------ src/anndata/_io/specs/registry.py | 18 ------------------ 3 files changed, 19 insertions(+), 37 deletions(-) diff --git a/src/anndata/_core/file_backing.py b/src/anndata/_core/file_backing.py index d283a1dfd..f7dcae8b1 100644 --- a/src/anndata/_core/file_backing.py +++ b/src/anndata/_core/file_backing.py @@ -2,7 +2,7 @@ from collections.abc import Mapping from functools import singledispatch -from pathlib import Path +from pathlib import Path, PurePosixPath from typing import TYPE_CHECKING import h5py @@ -161,3 +161,18 @@ def _(x): @filename.register(ZarrGroup) def _(x): return x.store.path + + +@singledispatch +def get_elem_name(x): + raise NotImplementedError(f"Not implemented for {type(x)}") + + +@get_elem_name.register(h5py.Group) +def _(x): + return x.name + + +@get_elem_name.register(ZarrGroup) +def _(x): + return PurePosixPath(x.path).name diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 823254bac..da457f0ea 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,8 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from functools import singledispatch -from pathlib import Path, PurePosixPath +from pathlib import Path from typing import TYPE_CHECKING import h5py @@ -10,6 +9,7 @@ from scipy import sparse import anndata as ad +from anndata._core.file_backing import filename, get_elem_name from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec @@ -45,21 +45,6 @@ def compute_chunk_layout_for_axis_shape( return chunk -@singledispatch -def get_elem_name(x): - raise NotImplementedError(f"Not implemented for {type(x)}") - - -@get_elem_name.register(H5Group) -def _(x): - return x.name - - -@get_elem_name.register(ZarrGroup) -def _(x): - return PurePosixPath(x.path).name - - @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -69,7 +54,7 @@ def read_sparse_as_dask( ): import dask.array as da - path_or_group = Path(elem.file.filename) if isinstance(elem, H5Group) else elem + path_or_group = Path(filename(elem)) if isinstance(elem, H5Group) else elem elem_name = get_elem_name(elem) shape: tuple[int, int] = tuple(elem.attrs["shape"]) dtype = elem["data"].dtype diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index c749c5d0b..1ca54b5ce 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -438,21 +438,3 @@ def read_elem_partial( type(elem), get_spec(elem), frozenset(modifiers) ) return read_partial(elem, items=items, indices=indices) - - -@singledispatch -def elem_key(elem) -> str: - return elem.name - - -# raise NotImplementedError() - -# @elem_key.register(ZarrGroup) -# @elem_key.register(ZarrArray) -# def _(elem): -# return elem.name - -# @elem_key.register(H5Array) -# @elem_key.register(H5Group) -# def _(elem): -# re From 51600168693daa98ed33515e5f88c05edbcbd9f8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:28:36 +0200 Subject: [PATCH 126/138] (fix): try new sphinx 
for error --- docs/conf.py | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index f7fe9d1be..f943fbb60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,7 +110,6 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 - "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] From 43da9a3e18e9eb4aed2871f59a47b5b0aa810a46 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:32:11 +0200 Subject: [PATCH 127/138] (fix): return type --- src/anndata/_io/specs/lazy_methods.py | 8 +++++--- src/anndata/_io/specs/registry.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index da457f0ea..ba5331de2 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from typing import Literal, Union + from anndata.compat import DaskArray + from .registry import Reader @@ -51,7 +53,7 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: import dask.array as da path_or_group = Path(filename(elem)) if isinstance(elem, H5Group) else elem @@ -113,7 +115,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: import dask.array as da path = Path(elem.file.filename) @@ -157,7 +159,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None -): +) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 1ca54b5ce..822c89e4b 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -284,7 +284,7 @@ def read_elem( elem: StorageType, modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, - ) -> InMemoryElem: + ) -> DaskArray: """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) From 9735ced2b618472f66174bc122629109ff26e615 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 12:41:24 +0200 Subject: [PATCH 128/138] (fix): protocol for reading --- src/anndata/_io/specs/lazy_methods.py | 10 ++++++---- src/anndata/_io/specs/registry.py | 15 +++++++++++---- src/anndata/_types.py | 27 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index ba5331de2..dd99c46ba 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -19,7 +19,7 @@ from anndata.compat import DaskArray - from .registry import Reader + from .registry import DaskReader @contextmanager @@ -52,7 +52,9 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( - elem: H5Group | ZarrGroup, _reader: Reader, chunks: tuple[int, ...] 
| None = None + elem: H5Group | ZarrGroup, + _reader: DaskReader, + chunks: tuple[int, ...] | None = None, ) -> DaskArray: import dask.array as da @@ -114,7 +116,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, _reader: Reader, chunks: tuple[int, ...] | None = None + elem: H5Array, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -158,7 +160,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, _reader: Reader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 822c89e4b..faf6ff9ba 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -20,6 +20,7 @@ InMemoryElem, Read, ReadCallback, + ReadDask, Write, WriteCallback, _ReadInternal, @@ -81,7 +82,9 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): class IORegistry: def __init__(self): - self.read: dict[tuple[type, IOSpec, frozenset[str]], _ReadInternal] = {} + self.read: dict[ + tuple[type, IOSpec, frozenset[str]], _ReadInternal | ReadDask + ] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal @@ -163,7 +166,7 @@ def get_read( modifiers: frozenset[str] = frozenset(), *, reader: Reader, - ) -> Read: + ) -> Read | ReadDask: if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts("read", self.read, src_type, spec) internal = self.read[(src_type, spec, modifiers)] @@ -271,7 +274,9 @@ def read_elem( """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) - read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + read_func: Read = self.registry.get_read( + type(elem), iospec, modifiers, reader=self + ) if self.callback is None: return read_func(elem) return self.callback(read_func, elem.name, elem, iospec=iospec) @@ -288,7 +293,9 @@ def read_elem( """Read an element from a store. See exported function for more details.""" iospec = get_spec(elem) - read_func = self.registry.get_read(type(elem), iospec, modifiers, reader=self) + read_func: ReadDask = self.registry.get_read( + type(elem), iospec, modifiers, reader=self + ) if self.callback is not None: warnings.warn( "Dask reading does not use a callback. Ignoring callback.", diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 9fbcf57b2..f091b701a 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -31,6 +31,8 @@ from collections.abc import Mapping from typing import Any, TypeAlias + from anndata._io.specs.registry import DaskReader + from ._io.specs.registry import IOSpec, Reader, Writer from .compat import H5File @@ -108,6 +110,31 @@ def __call__( ... +class ReadDask(Protocol): + def __call__( + self, + elem: StorageType | H5File, + *, + _reader: DaskReader, + chunks: tuple[int, ...] | None = None, + ) -> DaskArray: + """Low-level reading function for a dask element. + + Parameters + ---------- + elem + The element to read from. + _reader + The parent object that will be used to read the element. + chunks + The chunks size to be used. 
+ Returns + ------- + The dask element read from the store. + """ + ... + + class _WriteInternal(Protocol[ContravariantInMemoryType]): def __call__( self, From f1730c3a9938cfb9f44c53e1573888821885282a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 13:21:46 +0200 Subject: [PATCH 129/138] (fix): bring back ignored warning --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index f943fbb60..f7fe9d1be 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -110,6 +110,7 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 + "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] From 9861b56771af30bd747f39d160dd0beebf77dec5 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 14:00:03 +0200 Subject: [PATCH 130/138] Fix docs --- docs/_templates/autosummary/class.rst | 4 ++-- docs/conf.py | 3 +-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/_templates/autosummary/class.rst b/docs/_templates/autosummary/class.rst index b4e7370aa..8fe1d69d0 100644 --- a/docs/_templates/autosummary/class.rst +++ b/docs/_templates/autosummary/class.rst @@ -13,7 +13,7 @@ .. autosummary:: :toctree: . {% for item in attributes %} - ~{{ fullname }}.{{ item }} + ~{{ name }}.{{ item }} {%- endfor %} {% endif %} {% endblock %} @@ -26,7 +26,7 @@ :toctree: . {% for item in methods %} {%- if item != '__init__' %} - ~{{ fullname }}.{{ item }} + ~{{ name }}.{{ item }} {%- endif -%} {%- endfor %} {% endif %} diff --git a/docs/conf.py b/docs/conf.py index f7fe9d1be..5b1b95f30 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -27,7 +27,7 @@ # default settings templates_path = ["_templates"] html_static_path = ["_static"] -source_suffix = [".rst", ".md"] +source_suffix = {".rst": "restructuredtext", ".md": "markdown"} master_doc = "index" default_role = "literal" exclude_patterns = [ @@ -110,7 +110,6 @@ suppress_warnings = [ "ref.citation", "myst.header", # https://github.com/executablebooks/MyST-Parser/issues/262 - "autosummary.import_cycle", # https://github.com/sphinx-doc/sphinx/issues/12589 ] diff --git a/pyproject.toml b/pyproject.toml index 43e5ab416..ef97699f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,7 @@ dev = [ "pytest-xdist", ] doc = [ - "sphinx>=4.4", + "sphinx>=7.4.6", "sphinx-book-theme>=1.1.0", "sphinx-autodoc-typehints>=2.2.0", "sphinx-issues", From 235096a9fdb2bb983b456d676d5309b0ba560a2c Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 16:03:45 +0200 Subject: [PATCH 131/138] almost fix typing --- src/anndata/_io/specs/lazy_methods.py | 5 ++-- src/anndata/_io/specs/registry.py | 34 +++++++++++------------ src/anndata/_types.py | 40 ++++++++++++--------------- 3 files changed, 36 insertions(+), 43 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index dd99c46ba..0b9def583 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -53,6 +53,7 @@ def compute_chunk_layout_for_axis_shape( @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_as_dask( elem: H5Group | ZarrGroup, + *, _reader: DaskReader, chunks: tuple[int, ...] | None = None, ) -> DaskArray: @@ -116,7 +117,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(H5Array, IOSpec("array", "0.2.0")) def read_h5_array( - elem: H5Array, _reader: DaskReader, chunks: tuple[int, ...] 
| None = None + elem: H5Array, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: import dask.array as da @@ -160,7 +161,7 @@ def make_dask_chunk( @_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) def read_zarr_array( - elem: ZarrArray, _reader: DaskReader, chunks: tuple[int, ...] | None = None + elem: ZarrArray, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None ) -> DaskArray: chunks: tuple[int, ...] = chunks if chunks is not None else elem.chunks import dask.array as da diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index faf6ff9ba..e3003cc52 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -5,25 +5,23 @@ from dataclasses import dataclass from functools import partial, singledispatch, wraps from types import MappingProxyType -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Generic, TypeVar from anndata._io.utils import report_read_key_on_error, report_write_key_on_error +from anndata._types import Read, ReadDask, _ReadDaskInternal, _ReadInternal from anndata.compat import DaskArray, _read_attr if TYPE_CHECKING: from collections.abc import Callable, Generator, Iterable - from typing import Any, TypeVar + from typing import Any from anndata._core.storage import StorageType from anndata._types import ( GroupStorageType, InMemoryElem, - Read, ReadCallback, - ReadDask, Write, WriteCallback, - _ReadInternal, _WriteInternal, ) @@ -80,11 +78,13 @@ def wrapper(g: GroupStorageType, k: str, *args, **kwargs): return decorator -class IORegistry: +_R = TypeVar("_R", _ReadInternal, _ReadDaskInternal) +R = TypeVar("R", Read, ReadDask) + + +class IORegistry(Generic[_R, R]): def __init__(self): - self.read: dict[ - tuple[type, IOSpec, frozenset[str]], _ReadInternal | ReadDask - ] = {} + self.read: dict[tuple[type, IOSpec, frozenset[str]], _R] = {} self.read_partial: dict[tuple[type, IOSpec, frozenset[str]], Callable] = {} self.write: dict[ tuple[type, type | tuple[type, str], frozenset[str]], _WriteInternal @@ -149,7 +149,7 @@ def register_read( src_type: type, spec: IOSpec | Mapping[str, str], modifiers: Iterable[str] = frozenset(), - ) -> Callable[[_ReadInternal[T]], _ReadInternal[T]]: + ) -> Callable[[_R], _R]: spec = proc_spec(spec) modifiers = frozenset(modifiers) @@ -166,7 +166,7 @@ def get_read( modifiers: frozenset[str] = frozenset(), *, reader: Reader, - ) -> Read | ReadDask: + ) -> R: if (src_type, spec, modifiers) not in self.read: raise IORegistryError._from_read_parts("read", self.read, src_type, spec) internal = self.read[(src_type, spec, modifiers)] @@ -212,8 +212,8 @@ def get_spec(self, elem: Any) -> IOSpec: return self.write_specs[type(elem)] -_REGISTRY = IORegistry() -_LAZY_REGISTRY = IORegistry() +_REGISTRY: IORegistry[_ReadInternal, Read] = IORegistry() +_LAZY_REGISTRY: IORegistry[_ReadDaskInternal, ReadDask] = IORegistry() @singledispatch @@ -290,17 +290,15 @@ def read_elem( modifiers: frozenset[str] = frozenset(), chunks: tuple[int, ...] | None = None, ) -> DaskArray: - """Read an element from a store. See exported function for more details.""" + """Read a dask element from a store. See exported function for more details.""" iospec = get_spec(elem) read_func: ReadDask = self.registry.get_read( type(elem), iospec, modifiers, reader=self ) if self.callback is not None: - warnings.warn( - "Dask reading does not use a callback. Ignoring callback.", - stacklevel=2, - ) + msg = "Dask reading does not use a callback. Ignoring callback." 
+ warnings.warn(msg, stacklevel=2) return read_func(elem, chunks=chunks) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index f091b701a..3549152f5 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -34,7 +34,6 @@ from anndata._io.specs.registry import DaskReader from ._io.specs.registry import IOSpec, Reader, Writer - from .compat import H5File __all__ = [ "ArrayStorageType", @@ -82,21 +81,22 @@ ) InvariantInMemoryType = TypeVar("InvariantInMemoryType", bound="InMemoryElem") +SCo = TypeVar("SCo", covariant=True, bound=StorageType) +SCon = TypeVar("SCon", contravariant=True, bound=StorageType) -class _ReadInternal(Protocol[CovariantInMemoryType]): - def __call__( - self, - elem: StorageType | H5File, - *, - _reader: Reader, - ) -> CovariantInMemoryType: ... +class _ReadInternal(Protocol[SCon, CovariantInMemoryType]): + def __call__(self, elem: SCon, *, _reader: Reader) -> CovariantInMemoryType: ... -class Read(Protocol[CovariantInMemoryType]): + +class _ReadDaskInternal(Protocol[SCon]): def __call__( - self, - elem: StorageType | H5File, - ) -> CovariantInMemoryType: + self, elem: SCon, *, _reader: DaskReader, chunks: tuple[int, ...] | None = None + ) -> DaskArray: ... + + +class Read(Protocol[SCon, CovariantInMemoryType]): + def __call__(self, elem: SCon) -> CovariantInMemoryType: """Low-level reading function for an element. Parameters @@ -110,13 +110,9 @@ def __call__( ... -class ReadDask(Protocol): +class ReadDask(Protocol[SCon]): def __call__( - self, - elem: StorageType | H5File, - *, - _reader: DaskReader, - chunks: tuple[int, ...] | None = None, + self, elem: SCon, *, chunks: tuple[int, ...] | None = None ) -> DaskArray: """Low-level reading function for a dask element. @@ -124,10 +120,8 @@ def __call__( ---------- elem The element to read from. - _reader - The parent object that will be used to read the element. chunks - The chunks size to be used. + The chunk size to be used. Returns ------- The dask element read from the store. @@ -172,11 +166,11 @@ def __call__( ... 
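# --- Illustrative note, not part of the patch: with these protocols, `Read`
# is any callable of a single storage element, while `_ReadInternal` is the
# registered function that still receives the registry-bound `_reader`.
# The zarr string reader from earlier in the series shows the shape of the
# pair; the binding step (presumably `functools.partial` inside
# `IORegistry.get_read`) is an assumption, as it is not shown here.
#
#     def read_zarr_string(elem: ZarrArray, *, _reader: Reader) -> str:
#         return str(elem[()])      # a _ReadInternal[ZarrArray, str]
#
#     bound: Read[ZarrArray, str] = partial(read_zarr_string, _reader=reader)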
-class ReadCallback(Protocol[InvariantInMemoryType]): +class ReadCallback(Protocol[SCo, InvariantInMemoryType]): def __call__( self, /, - read_func: Read[InvariantInMemoryType], + read_func: Read[SCo, InvariantInMemoryType], elem_name: str, elem: StorageType, *, From dce9f07a271316a47498c63b3f4c11ca12e2810b Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 16:26:42 +0200 Subject: [PATCH 132/138] add wrapper --- src/anndata/_io/specs/lazy_methods.py | 61 ++++++++++++++------------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 0b9def583..33c7aba6b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,8 +1,9 @@ from __future__ import annotations from contextlib import contextmanager +from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, Union import h5py import numpy as np @@ -15,7 +16,8 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from typing import Literal, Union + from collections.abc import Callable + from typing import Concatenate from anndata.compat import DaskArray @@ -47,6 +49,29 @@ def compute_chunk_layout_for_axis_shape( return chunk +P = ParamSpec("P") +R = TypeVar("R") +BlockInfo = dict[ + Literal[None], + dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], +] + + +def require_block_info( + f: Callable[Concatenate[BlockInfo, P], R], +) -> Callable[Concatenate[BlockInfo | None, P], R]: + @wraps(f) + def wrapper( + block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs + ) -> R: + if block_info is None: + msg = "Block info is required" + raise ValueError(msg) + return f(block_info, *args, **kwargs) + + return wrapper + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -76,19 +101,10 @@ def read_sparse_as_dask( ) stride = chunks[int(is_csc)] - def make_dask_chunk( - block_info: Union[ # noqa: UP007 - dict[ - Literal[None], - dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 - ], - None, - ] = None, - ): + @require_block_info + def make_dask_chunk(block_info: BlockInfo): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 - if block_info is None: - raise ValueError("Block info is required") with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) array_location = block_info[None]["array-location"] @@ -129,17 +145,8 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - def make_dask_chunk( - block_info: Union[ # noqa: UP007 - dict[ - Literal[None], - dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], # noqa: UP007 - ], - None, - ] = None, - ): - if block_info is None: - raise ValueError("Block info is required") + @require_block_info + def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): @@ -152,11 +159,7 @@ def make_dask_chunk( for i in range(len(shape)) ) - return da.map_blocks( - make_dask_chunk, - dtype=dtype, - chunks=chunk_layout, - ) + return da.map_blocks(make_dask_chunk, dtype=dtype, chunks=chunk_layout) 
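# --- Illustrative sketch, not part of the patch: the expected behavior of the
# chunk layout computed above. The body of `compute_chunk_layout_for_axis_shape`
# is not shown in this series, so this reimplementation is an assumption: tile
# the axis with full strides, then append any remainder, which is the form
# dask expects for a per-axis chunks tuple.
def _chunk_layout_sketch(stride: int, shape: int) -> tuple[int, ...]:
    n_full, rest = divmod(shape, stride)
    return (stride,) * n_full + ((rest,) if rest else ())

# A 250-long axis with the default stride of 100 gives two full blocks plus a
# tail; an evenly divisible axis gives full blocks only.
assert _chunk_layout_sketch(100, 250) == (100, 100, 50)
assert _chunk_layout_sketch(100, 200) == (100, 100)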
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) From 2725ef2a1462abe7cb7096bcb4c50486d916618a Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 16:29:58 +0200 Subject: [PATCH 133/138] move into type checking --- src/anndata/_io/specs/lazy_methods.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 33c7aba6b..85e6fc25b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -3,7 +3,7 @@ from contextlib import contextmanager from functools import wraps from pathlib import Path -from typing import TYPE_CHECKING, Literal, ParamSpec, TypeVar, Union +from typing import TYPE_CHECKING import h5py import numpy as np @@ -17,12 +17,20 @@ if TYPE_CHECKING: from collections.abc import Callable - from typing import Concatenate + from typing import Concatenate, Literal, ParamSpec, TypeVar from anndata.compat import DaskArray from .registry import DaskReader + BlockInfo = dict[ + Literal[None], + dict[str, tuple[int, ...] | list[tuple[int, ...]]], + ] + + P = ParamSpec("P") + R = TypeVar("R") + @contextmanager def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): @@ -49,14 +57,6 @@ def compute_chunk_layout_for_axis_shape( return chunk -P = ParamSpec("P") -R = TypeVar("R") -BlockInfo = dict[ - Literal[None], - dict[str, Union[tuple[int, ...], list[tuple[int, ...]]]], -] - - def require_block_info( f: Callable[Concatenate[BlockInfo, P], R], ) -> Callable[Concatenate[BlockInfo | None, P], R]: From ffe89f0b5518a51dd9506e11a2308bff5ec940c7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 22 Jul 2024 16:38:10 +0200 Subject: [PATCH 134/138] (fix): small type fxes --- src/anndata/_io/specs/lazy_methods.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 85e6fc25b..f51c0f684 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -80,7 +80,7 @@ def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, _reader: DaskReader, - chunks: tuple[int, ...] | None = None, + chunks: tuple[int, int] | None = None, ) -> DaskArray: import dask.array as da @@ -91,15 +91,16 @@ def read_sparse_as_dask( is_csc: bool = elem.attrs["encoding-type"] == "csc_matrix" stride: int = _DEFAULT_STRIDE + major_dim, minor_dim = (1, 0) if is_csc else (0, 1) if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[int(not is_csc)] != shape[int(not is_csc)]: + if chunks[minor_dim] != shape[minor_dim]: raise ValueError( "Only the major axis can be chunked. 
" f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[int(is_csc)] + stride = chunks[major_dim] @require_block_info def make_dask_chunk(block_info: BlockInfo): @@ -107,10 +108,12 @@ def make_dask_chunk(block_info: BlockInfo): # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - array_location = block_info[None]["array-location"] + (xxx_start, xxx_end), (yyy_start, yyy_end) = block_info[None][ + "array-location" + ] index = ( - slice(array_location[0][0], array_location[0][1]), - slice(array_location[1][0], array_location[1][1]), + slice(xxx_start, xxx_end), + slice(yyy_start, yyy_end), ) chunk = mtx[index] return chunk @@ -150,8 +153,8 @@ def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): - array_location = block_info[None]["array-location"][i] - idx += (slice(array_location[0], array_location[1]),) + (start, stop) = block_info[None]["array-location"][i] + idx += (slice(start, stop),) return f[idx] chunk_layout = tuple( From 75a64fc3bd573d5a4840b83ec0041c171326dfc3 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:03:27 +0200 Subject: [PATCH 135/138] block info types --- src/anndata/_io/specs/lazy_methods.py | 38 +++++++++++++-------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f51c0f684..f7630bd63 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -16,18 +16,24 @@ from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Mapping, Sequence from typing import Concatenate, Literal, ParamSpec, TypeVar from anndata.compat import DaskArray from .registry import DaskReader - BlockInfo = dict[ + BlockInfo2D = Mapping[ Literal[None], - dict[str, tuple[int, ...] | list[tuple[int, ...]]], + dict[str, tuple[tuple[int, int], tuple[int, int]]], ] + BlockInfoND = Mapping[ + Literal[None], + dict[str, Sequence[tuple[int, int]]], + ] + + B = TypeVar("B", BlockInfo2D, BlockInfoND) P = ParamSpec("P") R = TypeVar("R") @@ -58,12 +64,10 @@ def compute_chunk_layout_for_axis_shape( def require_block_info( - f: Callable[Concatenate[BlockInfo, P], R], -) -> Callable[Concatenate[BlockInfo | None, P], R]: + f: Callable[Concatenate[B, P], R], +) -> Callable[Concatenate[B | None, P], R]: @wraps(f) - def wrapper( - block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs - ) -> R: + def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R: if block_info is None: msg = "Block info is required" raise ValueError(msg) @@ -80,7 +84,7 @@ def read_sparse_as_dask( elem: H5Group | ZarrGroup, *, _reader: DaskReader, - chunks: tuple[int, int] | None = None, + chunks: tuple[int, ...] 
| None = None, # only tuple[int, int] is supported here ) -> DaskArray: import dask.array as da @@ -103,19 +107,13 @@ def read_sparse_as_dask( stride = chunks[major_dim] @require_block_info - def make_dask_chunk(block_info: BlockInfo): + def make_dask_chunk(block_info: BlockInfo2D): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - (xxx_start, xxx_end), (yyy_start, yyy_end) = block_info[None][ - "array-location" - ] - index = ( - slice(xxx_start, xxx_end), - slice(yyy_start, yyy_end), - ) - chunk = mtx[index] + range_i, range_j = block_info[None]["array-location"] + chunk = mtx[slice(*range_i), slice(*range_j)] return chunk shape_minor, shape_major = shape if is_csc else shape[::-1] @@ -141,7 +139,7 @@ def read_h5_array( import dask.array as da path = Path(elem.file.filename) - elem_name = elem.name + elem_name: str = elem.name shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = ( @@ -149,7 +147,7 @@ def read_h5_array( ) @require_block_info - def make_dask_chunk(block_info: BlockInfo): + def make_dask_chunk(block_info: BlockInfoND): with maybe_open_h5(path, elem_name) as f: idx = () for i in range(len(shape)): From 3f734fe24ad30cef67c7401283c224529f94de2f Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:13:27 +0200 Subject: [PATCH 136/138] simplify --- src/anndata/_io/specs/lazy_methods.py | 36 +++++++++++++-------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index f7630bd63..4084fcf41 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -23,17 +23,11 @@ from .registry import DaskReader - BlockInfo2D = Mapping[ - Literal[None], - dict[str, tuple[tuple[int, int], tuple[int, int]]], - ] - - BlockInfoND = Mapping[ + BlockInfo = Mapping[ Literal[None], dict[str, Sequence[tuple[int, int]]], ] - B = TypeVar("B", BlockInfo2D, BlockInfoND) P = ParamSpec("P") R = TypeVar("R") @@ -64,10 +58,12 @@ def compute_chunk_layout_for_axis_shape( def require_block_info( - f: Callable[Concatenate[B, P], R], -) -> Callable[Concatenate[B | None, P], R]: + f: Callable[Concatenate[BlockInfo, P], R], +) -> Callable[Concatenate[BlockInfo | None, P], R]: @wraps(f) - def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R: + def wrapper( + block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs + ) -> R: if block_info is None: msg = "Block info is required" raise ValueError(msg) @@ -76,6 +72,12 @@ def wrapper(block_info: B | None = None, *args: P.args, **kwargs: P.kwargs) -> R return wrapper +def get_chunks_indexer(block_info: BlockInfo) -> tuple[slice, ...]: + return tuple( + slice(start, stop) for start, stop in block_info[None]["array-location"] + ) + + @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csr_matrix", "0.1.0")) @_LAZY_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @@ -107,13 +109,13 @@ def read_sparse_as_dask( stride = chunks[major_dim] @require_block_info - def make_dask_chunk(block_info: BlockInfo2D): + def make_dask_chunk(block_info: BlockInfo): # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` # 
https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - range_i, range_j = block_info[None]["array-location"] - chunk = mtx[slice(*range_i), slice(*range_j)] + xs, ys = get_chunks_indexer(block_info) + chunk = mtx[xs, ys] return chunk shape_minor, shape_major = shape if is_csc else shape[::-1] @@ -147,13 +149,9 @@ def read_h5_array( ) @require_block_info - def make_dask_chunk(block_info: BlockInfoND): + def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: - idx = () - for i in range(len(shape)): - (start, stop) = block_info[None]["array-location"][i] - idx += (slice(start, stop),) - return f[idx] + return f[get_chunks_indexer(block_info)] chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) From c4c2356171e27ee60b4b6ac75ee25965303a4f79 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:15:11 +0200 Subject: [PATCH 137/138] rename --- src/anndata/_io/specs/lazy_methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 4084fcf41..e0153e17b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -72,7 +72,7 @@ def wrapper( return wrapper -def get_chunks_indexer(block_info: BlockInfo) -> tuple[slice, ...]: +def get_array_ranges(block_info: BlockInfo) -> tuple[slice, ...]: return tuple( slice(start, stop) for start, stop in block_info[None]["array-location"] ) @@ -114,7 +114,7 @@ def make_dask_chunk(block_info: BlockInfo): # https://github.com/scverse/anndata/issues/1105 with maybe_open_h5(path_or_group, elem_name) as f: mtx = ad.experimental.sparse_dataset(f) - xs, ys = get_chunks_indexer(block_info) + xs, ys = get_array_ranges(block_info) chunk = mtx[xs, ys] return chunk @@ -151,7 +151,7 @@ def read_h5_array( @require_block_info def make_dask_chunk(block_info: BlockInfo): with maybe_open_h5(path, elem_name) as f: - return f[get_chunks_indexer(block_info)] + return f[get_array_ranges(block_info)] chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) From cc67a9b54a6690847bae3701fd6daf64a7678ab2 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 22 Jul 2024 17:35:34 +0200 Subject: [PATCH 138/138] simplify more --- src/anndata/_io/specs/lazy_methods.py | 81 ++++++++++++--------------- 1 file changed, 37 insertions(+), 44 deletions(-) diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index e0153e17b..8a1b31e6b 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -1,7 +1,7 @@ from __future__ import annotations from contextlib import contextmanager -from functools import wraps +from functools import partial from pathlib import Path from typing import TYPE_CHECKING @@ -10,17 +10,18 @@ from scipy import sparse import anndata as ad -from anndata._core.file_backing import filename, get_elem_name -from anndata.compat import H5Array, H5Group, ZarrArray, ZarrGroup +from ..._core.file_backing import filename, get_elem_name +from ...compat import H5Array, H5Group, ZarrArray, ZarrGroup from .registry import _LAZY_REGISTRY, IOSpec if TYPE_CHECKING: - from collections.abc import Callable, Mapping, Sequence - from typing import Concatenate, Literal, ParamSpec, TypeVar - - from anndata.compat import DaskArray + from collections.abc import Callable, Generator, Mapping, Sequence + from typing import 
Literal, ParamSpec, TypeVar + from ..._core.sparse_dataset import CSCDataset, CSRDataset + from ..._types import ArrayStorageType, StorageType + from ...compat import DaskArray from .registry import DaskReader BlockInfo = Mapping[ @@ -33,7 +34,9 @@ @contextmanager -def maybe_open_h5(path_or_group: Path | ZarrGroup, elem_name: str): +def maybe_open_h5( + path_or_group: Path | ZarrGroup, elem_name: str +) -> Generator[StorageType, None, None]: if not isinstance(path_or_group, Path): yield path_or_group return @@ -57,25 +60,26 @@ def compute_chunk_layout_for_axis_shape( return chunk -def require_block_info( - f: Callable[Concatenate[BlockInfo, P], R], -) -> Callable[Concatenate[BlockInfo | None, P], R]: - @wraps(f) - def wrapper( - block_info: BlockInfo | None = None, *args: P.args, **kwargs: P.kwargs - ) -> R: - if block_info is None: - msg = "Block info is required" - raise ValueError(msg) - return f(block_info, *args, **kwargs) - - return wrapper - - -def get_array_ranges(block_info: BlockInfo) -> tuple[slice, ...]: - return tuple( - slice(start, stop) for start, stop in block_info[None]["array-location"] - ) +def make_dask_chunk( + path_or_group: Path | ZarrGroup, + elem_name: str, + block_info: BlockInfo | None = None, + *, + wrap: Callable[[ArrayStorageType], ArrayStorageType] + | Callable[[H5Group | ZarrGroup], CSRDataset | CSCDataset] = lambda g: g, +): + if block_info is None: + msg = "Block info is required" + raise ValueError(msg) + # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` + # https://github.com/scverse/anndata/issues/1105 + with maybe_open_h5(path_or_group, elem_name) as f: + mtx = wrap(f) + idx = tuple( + slice(start, stop) for start, stop in block_info[None]["array-location"] + ) + chunk = mtx[idx] + return chunk @_LAZY_REGISTRY.register_read(H5Group, IOSpec("csc_matrix", "0.1.0")) @@ -108,16 +112,6 @@ def read_sparse_as_dask( ) stride = chunks[major_dim] - @require_block_info - def make_dask_chunk(block_info: BlockInfo): - # We need to open the file in each task since `dask` cannot share h5py objects when using `dask.distributed` - # https://github.com/scverse/anndata/issues/1105 - with maybe_open_h5(path_or_group, elem_name) as f: - mtx = ad.experimental.sparse_dataset(f) - xs, ys = get_array_ranges(block_info) - chunk = mtx[xs, ys] - return chunk - shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) chunks_minor = (shape_minor,) @@ -125,8 +119,11 @@ def make_dask_chunk(block_info: BlockInfo): (chunks_minor, chunks_major) if is_csc else (chunks_major, chunks_minor) ) memory_format = sparse.csc_matrix if is_csc else sparse.csr_matrix + make_chunk = partial( + make_dask_chunk, path_or_group, elem_name, wrap=ad.experimental.sparse_dataset + ) da_mtx = da.map_blocks( - make_dask_chunk, + make_chunk, dtype=dtype, chunks=chunk_layout, meta=memory_format((0, 0), dtype=dtype), @@ -148,17 +145,13 @@ def read_h5_array( chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) ) - @require_block_info - def make_dask_chunk(block_info: BlockInfo): - with maybe_open_h5(path, elem_name) as f: - return f[get_array_ranges(block_info)] - chunk_layout = tuple( compute_chunk_layout_for_axis_shape(chunks[i], shape[i]) for i in range(len(shape)) ) - return da.map_blocks(make_dask_chunk, dtype=dtype, chunks=chunk_layout) + make_chunk = partial(make_dask_chunk, path, elem_name) + return da.map_blocks(make_chunk, dtype=dtype, chunks=chunk_layout) 
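# --- Illustrative sketch, not part of the patch: why `partial` is used above.
# `da.map_blocks` injects `block_info` into the chunk function, so everything
# else (here: the path and element name) must be pre-bound; binding a path
# rather than an open h5py handle keeps the task graph picklable for
# `dask.distributed` (see scverse/anndata#1105). The toy loader below is
# hypothetical and stands in for `make_dask_chunk`.
from functools import partial

import dask.array as da
import numpy as np

def load_chunk(path, elem_name, block_info=None):
    if block_info is None:  # dask's meta-inference probe; mirrors the patch
        raise ValueError("block_info is required")
    ((start, stop),) = block_info[None]["array-location"]
    # A real loader would open `path` and slice into `elem_name`; fake it here.
    return np.arange(start, stop, dtype="f8")

make_chunk = partial(load_chunk, "data.h5", "X")  # hypothetical path/element
arr = da.map_blocks(make_chunk, dtype="f8", chunks=((3, 3, 2),))
assert (arr.compute() == np.arange(8)).all()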
@_LAZY_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0"))
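A minimal usage sketch for the feature this series lands, assuming the final
state of the branch, where `read_elem_as_dask` is exported from
`anndata.experimental` (see the docs patch above). The file name is
hypothetical; the access pattern mirrors the tests earlier in the series.

    import h5py
    from anndata.experimental import read_elem_as_dask

    with h5py.File("adata.h5ad", "r") as f:
        n_cols = f["X"].attrs["shape"][1]  # sparse groups keep shape in attrs
        # CSR/CSC may only be chunked along the major axis; dense arrays
        # accept a full n-d chunks tuple.
        X = read_elem_as_dask(f["X"], chunks=(100, n_cols))
        head = X[:100, :].compute()  # materializes only the first block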