(feat): allow join=outer for concat_on_disk #1504

Merged: 13 commits, Jul 4, 2024
2 changes: 2 additions & 0 deletions docs/release-notes/0.10.9.md
@@ -8,3 +8,5 @@

```{rubric} Performance
```

* Support for `concat_on_disk` outer join {pr}`1504` {user}`ilan-gold`
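Not part of the diff, but for context: a minimal usage sketch of the newly supported option, assuming two on-disk stores whose `var_names` only partially overlap. The file paths below are hypothetical.

```python
from anndata.experimental import concat_on_disk

# Hypothetical input stores; each holds an AnnData whose var_names
# only partially overlap with the other's.
inputs = ["batch1.h5ad", "batch2.h5ad"]

# Before this PR, anything other than join="inner" raised
# NotImplementedError; with it, the union of off-axis names is kept.
concat_on_disk(inputs, "combined.h5ad", join="outer")
```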
2 changes: 0 additions & 2 deletions src/anndata/experimental/merge.py
@@ -538,8 +538,6 @@ def concat_on_disk(
# Argument normalization
if pairwise:
raise NotImplementedError("pairwise concatenation not yet implemented")
if join != "inner":
raise NotImplementedError("only inner join is currently supported")
Comment on lines -541 to -542 (Contributor Author):
I am trying to become more familiar with the code, but it's shocking this was all that was needed. That being said, the tests do pass and I looked at the results so 🤷

Contributor Author:
I mean, looking at the code path, it makes some sense: concat_on_disk constructs sensible-looking reindexers for the alt_dim axis and then passes those on, so there shouldn't be a need for the join argument anywhere else, despite the option to use it.
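(Not from the thread: a small in-memory sketch of what the outer join on the off-axis should produce; `concat_on_disk` with `join="outer"` is expected to match `anndata.concat` here. The toy names are illustrative only.)

```python
import anndata as ad
import numpy as np
import pandas as pd

# Two AnnData objects whose var_names only partially overlap.
a = ad.AnnData(
    X=np.ones((2, 2)),
    obs=pd.DataFrame(index=["c1", "c2"]),
    var=pd.DataFrame(index=["g1", "g2"]),
)
b = ad.AnnData(
    X=np.ones((2, 2)),
    obs=pd.DataFrame(index=["c3", "c4"]),
    var=pd.DataFrame(index=["g2", "g3"]),
)

# join="outer" keeps the union of var_names; positions missing from an
# input are padded with a fill value by the reindexers.
outer = ad.concat([a, b], join="outer")
print(outer.shape)            # (4, 3)
print(list(outer.var_names))  # union of g1, g2, g3
```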


merge = resolve_merge_strategy(merge)
uns_merge = resolve_merge_strategy(uns_merge)
38 changes: 30 additions & 8 deletions tests/test_concatenate_disk.py
@@ -39,7 +39,7 @@ def array_type(request):
return request.param


@pytest.fixture(params=["inner"])
@pytest.fixture(params=["inner", "outer"])
def join_type(request):
return request.param

@@ -126,10 +126,19 @@ def test_anndatas_without_reindex(
sparse_fmt=sparse_fmt,
**GEN_ADATA_OOC_CONCAT_ARGS,
)
# ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested
if axis == 0:
a.obs_names = f"{i}-" + a.obs_names
a.var_names = [
f"{i}-{name}" if var_ind % 2 else name
for var_ind, name in enumerate(a.var_names)
]
else:
a.var_names = f"{i}-" + a.var_names
a.obs_names = [
f"{i}-{name}" if obs_ind % 2 else name
for obs_ind, name in enumerate(a.obs_names)
]
adatas.append(a)

assert_eq_concat_on_disk(
@@ -153,7 +162,7 @@ def test_anndatas_with_reindex(
if axis == 0:
sparse_fmt = "csr"

for _ in range(5):
for i in range(5):
M = np.random.randint(1, 100)
N = np.random.randint(1, 100)

@@ -166,11 +175,20 @@
np.ndarray,
pd.DataFrame,
),
varm_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
varm_types=(get_array_type("sparse", 1 - axis), np.ndarray, pd.DataFrame),
layers_types=(get_array_type("sparse", axis), np.ndarray, pd.DataFrame),
)
a.layers["sparse"] = get_array_type("sparse", axis)(a.layers["sparse"])
a.varm["sparse"] = get_array_type("sparse", 1 - axis)(a.varm["sparse"])
# ensure some names overlap, others do not, for the off-axis so that inner/outer is properly tested
if axis == 1:
a.obs_names = [
f"{i}-{name}" if obs_ind % 2 else name
for obs_ind, name in enumerate(a.obs_names)
]
else:
a.var_names = [
f"{i}-{name}" if var_ind % 2 else name
for var_ind, name in enumerate(a.var_names)
]
adatas.append(a)

assert_eq_concat_on_disk(
@@ -208,7 +226,7 @@ def test_concat_ordered_categoricals_retained(tmp_path, file_format):


@pytest.fixture()
def obsm_adatas():
def xxxm_adatas():
def gen_index(n):
return [f"cell{i}" for i in range(n)]

@@ -256,8 +274,12 @@ def gen_index(n):
]


def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format):
assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner")
def test_concatenate_xxxm(xxxm_adatas, tmp_path, file_format, join_type):
if join_type == "outer":
for i in range(len(xxxm_adatas)):
xxxm_adatas[i] = xxxm_adatas[i].T
xxxm_adatas[i].X = sparse.csr_matrix(xxxm_adatas[i].X)
assert_eq_concat_on_disk(xxxm_adatas, tmp_path, file_format, join=join_type)


def test_output_dir_exists(tmp_path):