From 1c1783cc65375f892761c6d33da516129c51f370 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 22 May 2024 14:44:23 +0200 Subject: [PATCH 1/8] (feat): allow `join=outer` for `concat_on_disk` --- src/anndata/experimental/merge.py | 2 -- tests/test_concatenate_disk.py | 12 ++++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/anndata/experimental/merge.py b/src/anndata/experimental/merge.py index 2e0e925a2..d71f5bf09 100644 --- a/src/anndata/experimental/merge.py +++ b/src/anndata/experimental/merge.py @@ -536,8 +536,6 @@ def concat_on_disk( # Argument normalization if pairwise: raise NotImplementedError("pairwise concatenation not yet implemented") - if join != "inner": - raise NotImplementedError("only inner join is currently supported") merge = resolve_merge_strategy(merge) uns_merge = resolve_merge_strategy(uns_merge) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index 1ccecc7b5..cba0901b1 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -39,7 +39,7 @@ def array_type(request): return request.param -@pytest.fixture(params=["inner"]) +@pytest.fixture(params=["inner", "outer"]) def join_type(request): return request.param @@ -208,7 +208,7 @@ def test_concat_ordered_categoricals_retained(tmp_path, file_format): @pytest.fixture -def obsm_adatas(): +def xxxm_adatas(): def gen_index(n): return [f"cell{i}" for i in range(n)] @@ -256,8 +256,12 @@ def gen_index(n): ] -def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format): - assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner") +def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type): + if join_type == "outer": + for i in range(len(xxxm_adatas)): + xxxm_adatas[i] = xxxm_adatas[i].T + xxxm_adatas[i].X = sparse.csr_matrix(xxxm_adatas[i].X) + assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type) def test_output_dir_exists(tmp_path): From 
95e3b0edd32e05ab190451965b450d8c250cace3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 22 May 2024 14:59:33 +0200 Subject: [PATCH 2/8] (refactor): don't need to redeclare types --- tests/test_concatenate_disk.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index cba0901b1..f1a3aab10 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -166,11 +166,9 @@ def test_anndatas_with_reindex( np.ndarray, pd.DataFrame, ), - varm_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), + varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) - a.layers["sparse"] = get_array_type("sparse", axis)(a.layers["sparse"]) - a.varm["sparse"] = get_array_type("sparse", 1 - axis)(a.varm["sparse"]) adatas.append(a) assert_eq_concat_on_disk( From c6ac45561f311fa29cfb932e545dbd12194d077c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 22 May 2024 15:07:18 +0200 Subject: [PATCH 3/8] (chore): release note --- docs/release-notes/0.10.8.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/release-notes/0.10.8.md b/docs/release-notes/0.10.8.md index 52b743866..dd0e7c824 100644 --- a/docs/release-notes/0.10.8.md +++ b/docs/release-notes/0.10.8.md @@ -13,3 +13,5 @@ ```{rubric} Performance ``` + +* Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold` From f504aa797bb793ab2bc6d15718a51fa62ca9ee72 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 22 May 2024 15:53:45 +0200 Subject: [PATCH 4/8] (chore): add more rigorous union/diff check --- tests/test_concatenate_disk.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index f1a3aab10..1afdc1ad3 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -126,10 +126,19
@@ def test_anndatas_without_reindex( sparse_fmt=sparse_fmt, **GEN_ADATA_OOC_CONCAT_ARGS, ) + # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested if axis == 0: a.obs_names = f"{i}-" + a.obs_names + a.var_names = [ + f"{i}-{name}" if var_ind % 2 else name + for var_ind, name in enumerate(a.var_names) + ] else: a.var_names = f"{i}-" + a.var_names + a.obs_names = [ + f"{i}-{name}" if obs_ind % 2 else name + for obs_ind, name in enumerate(a.obs_names) + ] adatas.append(a) assert_eq_concat_on_disk( @@ -153,7 +162,7 @@ def test_anndatas_with_reindex( if axis == 0: sparse_fmt = "csr" - for _ in range(5): + for i in range(5): M = np.random.randint(1, 100) N = np.random.randint(1, 100) @@ -169,6 +178,17 @@ def test_anndatas_with_reindex( varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) + # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested + if axis == 1: + a.obs_names = [ + f"{i}-{name}" if obs_ind % 2 else name + for obs_ind, name in enumerate(a.obs_names) + ] + else: + a.var_names = [ + f"{i}-{name}" if var_ind % 2 else name + for var_ind, name in enumerate(a.var_names) + ] adatas.append(a) assert_eq_concat_on_disk( From 68c0b719444cce6e53f128c40fba10aff08efa37 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Fri, 28 Jun 2024 11:50:36 +0200 Subject: [PATCH 5/8] simplify --- tests/test_concatenate_disk.py | 47 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index df14174e4..f4b5c8d79 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -1,6 +1,7 @@ from __future__ import annotations from collections.abc import Mapping +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -16,6 +17,10 @@ ) from anndata.utils import asarray +if TYPE_CHECKING: + from pathlib import Path + + GEN_ADATA_OOC_CONCAT_ARGS = dict( obsm_types=( sparse.csr_matrix, @@ -77,13 +82,18 @@ def _adatas_to_paths(adatas, tmp_path, file_format): def assert_eq_concat_on_disk( - adatas, tmp_path, file_format, max_loaded_elems=None, *args, **kwargs + adatas, + tmp_path: Path, + file_format: Literal["zarr", "h5ad"], + max_loaded_elems: int | None = None, + *args, + **kwargs, ): # create one from the concat function res1 = concat(adatas, *args, **kwargs) # create one from the on disk concat function paths = _adatas_to_paths(adatas, tmp_path, file_format) - out_name = tmp_path / ("out." 
+ file_format) + out_name = tmp_path / f"out.{file_format}" if max_loaded_elems is not None: kwargs["max_loaded_elems"] = max_loaded_elems concat_on_disk(paths, out_name, *args, **kwargs) @@ -93,32 +103,26 @@ def assert_eq_concat_on_disk( def get_array_type(array_type, axis): if array_type == "sparse": - if axis == 0: - return sparse.csr_matrix - return sparse.csc_matrix + return sparse.csr_matrix if axis == 0 else sparse.csc_matrix if array_type == "sparse_array": - if axis == 0: - return sparse.csr_array - return sparse.csc_array + return sparse.csr_array if axis == 0 else sparse.csc_array if array_type == "array": return asarray - else: - raise NotImplementedError(f"array_type {array_type} not implemented") + raise NotImplementedError(f"array_type {array_type} not implemented") def test_anndatas_without_reindex( axis, array_type, join_type, tmp_path, max_loaded_elems, file_format ): - N = 50 - M = 50 - sparse_fmt = "csr" + M = N = 50 + sparse_fmt = "csr" if axis == 0 else "csc" + adatas = [] for i in range(5): if axis == 0: M = np.random.randint(1, 100) else: N = np.random.randint(1, 100) - sparse_fmt = "csc" a = gen_adata( (M, N), @@ -154,14 +158,9 @@ def test_anndatas_without_reindex( def test_anndatas_with_reindex( axis, array_type, join_type, tmp_path, file_format, max_loaded_elems ): - N = 50 - M = 50 - adatas = [] - - sparse_fmt = "csc" - if axis == 0: - sparse_fmt = "csr" + sparse_fmt = "csr" if axis == 0 else "csc" + adatas = [] for i in range(5): M = np.random.randint(1, 100) N = np.random.randint(1, 100) @@ -170,11 +169,7 @@ def test_anndatas_with_reindex( (M, N), X_type=get_array_type(array_type, axis), sparse_fmt=sparse_fmt, - obsm_types=( - get_array_type("sparse", 1 - axis), - np.ndarray, - pd.DataFrame, - ), + obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) From 
b9aaa3e83e2aece8a9fd1c3fbb0d4f343788f7a8 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 28 Jun 2024 16:17:00 +0200 Subject: [PATCH 6/8] (chore): clean up interleaving naming of axis --- tests/test_concatenate_disk.py | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index f4b5c8d79..ba22c0aac 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -9,6 +9,7 @@ from scipy import sparse from anndata import AnnData, concat +from anndata._core.merge import _resolve_axis from anndata.experimental import read_elem, write_elem from anndata.experimental.merge import as_group, concat_on_disk from anndata.tests.helpers import ( @@ -131,18 +132,10 @@ def test_anndatas_without_reindex( **GEN_ADATA_OOC_CONCAT_ARGS, ) # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested - if axis == 0: - a.obs_names = f"{i}-" + a.obs_names - a.var_names = [ - f"{i}-{name}" if var_ind % 2 else name - for var_ind, name in enumerate(a.var_names) - ] - else: - a.var_names = f"{i}-" + a.var_names - a.obs_names = [ - f"{i}-{name}" if obs_ind % 2 else name - for obs_ind, name in enumerate(a.obs_names) - ] + _, off_axis_name = _resolve_axis(1 - axis) + off_names = getattr(a, f"{off_axis_name}_names").array + off_names[1::2] = f"{i}-" + off_names[1::2] + setattr(a, f"{off_axis_name}_names", off_names) adatas.append(a) assert_eq_concat_on_disk( @@ -174,16 +167,10 @@ def test_anndatas_with_reindex( layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested - if axis == 1: - a.obs_names = [ - f"{i}-{name}" if obs_ind % 2 else name - for obs_ind, name in enumerate(a.obs_names) - ] - else: - a.var_names = [ - f"{i}-{name}" if var_ind % 2 else name - for var_ind, name in enumerate(a.var_names) - ] + _, 
off_axis_name = _resolve_axis(1 - axis) + off_names = getattr(a, f"{off_axis_name}_names").array + off_names[1::2] = f"{i}-" + off_names[1::2] + setattr(a, f"{off_axis_name}_names", off_names) adatas.append(a) assert_eq_concat_on_disk( From 4bf9450bd3a7ffc21444fef57848e9e4f0be14ea Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Mon, 1 Jul 2024 12:10:55 +0200 Subject: [PATCH 7/8] simplify --- tests/test_concatenate_disk.py | 80 +++++++++++----------------------- 1 file changed, 26 insertions(+), 54 deletions(-) diff --git a/tests/test_concatenate_disk.py b/tests/test_concatenate_disk.py index ba22c0aac..216b66e21 100644 --- a/tests/test_concatenate_disk.py +++ b/tests/test_concatenate_disk.py @@ -34,30 +34,28 @@ @pytest.fixture(params=[0, 1]) -def axis(request): +def axis(request) -> Literal[0, 1]: return request.param -@pytest.fixture( - params=["array", "sparse", "sparse_array"], -) -def array_type(request): +@pytest.fixture(params=["array", "sparse", "sparse_array"]) +def array_type(request) -> Literal["array", "sparse", "sparse_array"]: return request.param @pytest.fixture(params=["inner", "outer"]) -def join_type(request): +def join_type(request) -> Literal["inner", "outer"]: return request.param @pytest.fixture(params=["zarr", "h5ad"]) -def file_format(request): +def file_format(request) -> Literal["zarr", "h5ad"]: return request.param # trying with 10 should be slow but will guarantee that the feature is being used @pytest.fixture(params=[10, 100_000_000]) -def max_loaded_elems(request): +def max_loaded_elems(request) -> int: return request.param @@ -112,62 +110,36 @@ def get_array_type(array_type, axis): raise NotImplementedError(f"array_type {array_type} not implemented") -def test_anndatas_without_reindex( - axis, array_type, join_type, tmp_path, max_loaded_elems, file_format +@pytest.mark.parametrize("reindex", [True, False], ids=["reindex", "no_reindex"]) +def test_anndatas( + axis: Literal[0, 1], + array_type: Literal["array", "sparse", "sparse_array"], 
+ join_type: Literal["inner", "outer"], + tmp_path: Path, + max_loaded_elems: int, + file_format: Literal["zarr", "h5ad"], + reindex: bool, ): - M = N = 50 + _, off_axis_name = _resolve_axis(1 - axis) + random_axes = {0, 1} if reindex else {axis} sparse_fmt = "csr" if axis == 0 else "csc" - - adatas = [] - for i in range(5): - if axis == 0: - M = np.random.randint(1, 100) - else: - N = np.random.randint(1, 100) - - a = gen_adata( - (M, N), - X_type=get_array_type(array_type, axis), - sparse_fmt=sparse_fmt, - **GEN_ADATA_OOC_CONCAT_ARGS, + kw = ( + GEN_ADATA_OOC_CONCAT_ARGS + if not reindex + else dict( + obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), + varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), + layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), ) - # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested - _, off_axis_name = _resolve_axis(1 - axis) - off_names = getattr(a, f"{off_axis_name}_names").array - off_names[1::2] = f"{i}-" + off_names[1::2] - setattr(a, f"{off_axis_name}_names", off_names) - adatas.append(a) - - assert_eq_concat_on_disk( - adatas, - tmp_path, - file_format, - max_loaded_elems, - axis=axis, - join=join_type, ) - -def test_anndatas_with_reindex( - axis, array_type, join_type, tmp_path, file_format, max_loaded_elems -): - sparse_fmt = "csr" if axis == 0 else "csc" - adatas = [] for i in range(5): - M = np.random.randint(1, 100) - N = np.random.randint(1, 100) - + M, N = (np.random.randint(1, 100) if a in random_axes else 50 for a in (0, 1)) a = gen_adata( - (M, N), - X_type=get_array_type(array_type, axis), - sparse_fmt=sparse_fmt, - obsm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), - varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame), - layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame), + (M, N), X_type=get_array_type(array_type, axis), 
sparse_fmt=sparse_fmt, **kw ) # ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested - _, off_axis_name = _resolve_axis(1 - axis) off_names = getattr(a, f"{off_axis_name}_names").array off_names[1::2] = f"{i}-" + off_names[1::2] setattr(a, f"{off_axis_name}_names", off_names) From ff68f995f07686c6360f81e34cf208cab5a5432e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 15:07:52 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/release-notes/0.10.9.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md index f61218cde..1fc7a1ca7 100644 --- a/docs/release-notes/0.10.9.md +++ b/docs/release-notes/0.10.9.md @@ -12,4 +12,3 @@ #### Performance * Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold` -