From 517db29ca8d1ec75ade1f438163b32f166ecb61d Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 14 Aug 2024 14:22:34 +0100
Subject: [PATCH 1/7] Add a --use-cubed option to pytest

---
 conftest.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/conftest.py b/conftest.py
index bcccb6e8a..9343393f3 100644
--- a/conftest.py
+++ b/conftest.py
@@ -2,6 +2,30 @@
 collect_ignore_glob = ["benchmarks/**", "sgkit/io/vcf/*.py", ".github/scripts/*.py"]
 
 
+def pytest_addoption(parser):
+    parser.addoption(
+        "--use-cubed", action="store_true", default=False, help="run with cubed"
+    )
+
+
+def use_cubed():
+    import dask
+    import xarray as xr
+
+    # set xarray to use cubed by default
+    xr.set_options(chunk_manager="cubed")
+
+    # ensure that dask compute raises if it is ever called
+    class AlwaysRaiseScheduler:
+        def __call__(self, dsk, keys, **kwargs):
+            raise RuntimeError("Dask 'compute' was called")
+
+    dask.config.set(scheduler=AlwaysRaiseScheduler())
+
+
 def pytest_configure(config) -> None:  # type: ignore
     # Add "gpu" marker
     config.addinivalue_line("markers", "gpu:Run tests that run on GPU")
+
+    if config.getoption("--use-cubed"):
+        use_cubed()

From 3637f0342f66eff9bdc453ef97dc13711bb39695 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 3 Sep 2024 13:37:20 +0100
Subject: [PATCH 2/7] Introduce sgkit.distarray to switch between Dask and
 Cubed

---
 sgkit/distarray.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 sgkit/distarray.py

diff --git a/sgkit/distarray.py b/sgkit/distarray.py
new file mode 100644
index 000000000..ddd914ae4
--- /dev/null
+++ b/sgkit/distarray.py
@@ -0,0 +1,10 @@
+from xarray.namedarray.parallelcompat import guess_chunkmanager
+
+# use the xarray chunk manager to determine the distributed array module to use
+cm = guess_chunkmanager(None)
+
+if cm.array_cls.__module__.split(".")[0] == "cubed":
+    from cubed import *  # noqa: F401, F403
+else:
+    # default to dask
+    from dask.array import *  # noqa: F401, F403

From 28f58a49f4ad67b0952e455d99efef1409802f99 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 3 Sep 2024 08:34:59 +0100
Subject: [PATCH 3/7] Use sgkit.distarray for count_call_alleles

---
 sgkit/stats/aggregation.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py
index 41adcc221..fe197c3bf 100644
--- a/sgkit/stats/aggregation.py
+++ b/sgkit/stats/aggregation.py
@@ -1,11 +1,11 @@
 from typing import Hashable
 
-import dask.array as da
 import numpy as np
 import xarray as xr
 from typing_extensions import Literal
 from xarray import Dataset
 
+import sgkit.distarray as da
 from sgkit import variables
 from sgkit.display import genotype_as_bytes
 from sgkit.utils import (
@@ -85,7 +85,13 @@ def count_call_alleles(
             variables.call_allele_count: (
                 ("variants", "samples", "alleles"),
                 da.map_blocks(
-                    count_alleles, G, N, chunks=shape, drop_axis=2, new_axis=2
+                    count_alleles,
+                    G,
+                    N,
+                    chunks=shape,
+                    dtype=np.uint8,
+                    drop_axis=2,
+                    new_axis=2,
                 ),
             )
         }

From 899c7d9a58c72b3b5d29d45ad2a2b7357ed6174a Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 14 Aug 2024 14:23:18 +0100
Subject: [PATCH 4/7] Only rechunk in non-core dims in
 test_count_call_alleles__chunked

---
 sgkit/tests/test_aggregation.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py
index a5e68645e..34df9eddd 100644
--- a/sgkit/tests/test_aggregation.py
+++ b/sgkit/tests/test_aggregation.py
@@ -265,10 +265,12 @@ def test_count_call_alleles__chunked():
     calls = rs.randint(0, 1, size=(50, 10, 2))
     ds = get_dataset(calls)
     ac1 = count_call_alleles(ds)
-    # Coerce from numpy to multiple chunks in all dimensions
-    ds["call_genotype"] = ds["call_genotype"].chunk(chunks=(5, 5, 1))
+    # Coerce from numpy to multiple chunks in all non-core dimensions
+    ds["call_genotype"] = ds["call_genotype"].chunk(
+        chunks={"variants": 5, "samples": 5}
+    )
     ac2 = count_call_alleles(ds)
-    assert isinstance(ac2["call_allele_count"].data, da.Array)
+    assert hasattr(ac2["call_allele_count"].data, "chunks")
     xr.testing.assert_equal(ac1, ac2)
 
 

From 0ec6634fe28bbe00749bcde43fe5022e53f6985a Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Fri, 6 Sep 2024 13:16:24 +0100
Subject: [PATCH 5/7] Fix coverage

---
 sgkit/distarray.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sgkit/distarray.py b/sgkit/distarray.py
index ddd914ae4..fe3bda790 100644
--- a/sgkit/distarray.py
+++ b/sgkit/distarray.py
@@ -4,7 +4,7 @@
 cm = guess_chunkmanager(None)
 
 if cm.array_cls.__module__.split(".")[0] == "cubed":
-    from cubed import *  # noqa: F401, F403
+    from cubed import *  # pragma: no cover # noqa: F401, F403
 else:
     # default to dask
     from dask.array import *  # noqa: F401, F403

From b1dd259b178e5dad1c34be08d62a4d29084df2a3 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Mon, 19 Aug 2024 15:44:12 +0100
Subject: [PATCH 6/7] Add GH actions workflow to test on Cubed

---
 .github/workflows/cubed.yml | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/cubed.yml

diff --git a/.github/workflows/cubed.yml b/.github/workflows/cubed.yml
new file mode 100644
index 000000000..2b9cde4e8
--- /dev/null
+++ b/.github/workflows/cubed.yml
@@ -0,0 +1,33 @@
+name: Cubed
+
+on:
+  push:
+  pull_request:
+  # manual trigger
+  workflow_dispatch:
+
+jobs:
+  build:
+    # This workflow only runs on the origin org
+    # if: github.repository_owner == 'sgkit-dev'
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v5
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Install deps and sgkit
+      run: |
+        python -m pip install --upgrade pip
+        python -m pip install -r requirements.txt -r requirements-dev.txt
+        python -m pip install -U git+https://github.com/cubed-dev/cubed.git -U git+https://github.com/cubed-dev/cubed-xarray.git -U git+https://github.com/pydata/xarray.git
+
+    - name: Test with pytest
+      run: |
+        pytest -v sgkit/tests/test_aggregation.py -k "test_count_call_alleles" --use-cubed

From 471d8378be57f88bb5b31fc3471159268e4b25df Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Mon, 9 Sep 2024 12:14:29 +0100
Subject: [PATCH 7/7] Don't allow chunking in ploidy dimension

---
 sgkit/stats/aggregation.py      |  5 +++++
 sgkit/tests/test_aggregation.py | 14 ++++++++++++--
 sgkit/tests/test_popgen.py      |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py
index fe197c3bf..f76dbe691 100644
--- a/sgkit/stats/aggregation.py
+++ b/sgkit/stats/aggregation.py
@@ -77,6 +77,11 @@ def count_call_alleles(
     variables.validate(ds, {call_genotype: variables.call_genotype_spec})
     n_alleles = ds.sizes["alleles"]
     G = da.asarray(ds[call_genotype])
+    if G.numblocks[2] > 1:
+        raise ValueError(
+            f"Variable {call_genotype} must have only a single chunk in the ploidy dimension. "
+            "Consider rechunking to change the size of chunks."
+        )
     shape = (G.chunks[0], G.chunks[1], n_alleles)
     # use numpy array to avoid dask task dependencies between chunks
     N = np.empty(n_alleles, dtype=np.uint8)
diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py
index 34df9eddd..e2719c1c1 100644
--- a/sgkit/tests/test_aggregation.py
+++ b/sgkit/tests/test_aggregation.py
@@ -139,8 +139,10 @@ def test_count_variant_alleles__chunked(using):
     calls = rs.randint(0, 1, size=(50, 10, 2))
     ds = get_dataset(calls)
     ac1 = count_variant_alleles(ds, using=using)
-    # Coerce from numpy to multiple chunks in all dimensions
-    ds["call_genotype"] = ds["call_genotype"].chunk(chunks=(5, 5, 1))
+    # Coerce from numpy to multiple chunks in all non-core dimensions
+    ds["call_genotype"] = ds["call_genotype"].chunk(
+        chunks={"variants": 5, "samples": 5}
+    )
     ac2 = count_variant_alleles(ds, using=using)
     assert isinstance(ac2["variant_allele_count"].data, da.Array)
     xr.testing.assert_equal(ac1, ac2)
@@ -273,6 +275,14 @@ def test_count_call_alleles__chunked():
     assert hasattr(ac2["call_allele_count"].data, "chunks")
     xr.testing.assert_equal(ac1, ac2)
 
+    # Multiple chunks in core dimension should fail
+    ds["call_genotype"] = ds["call_genotype"].chunk(chunks={"ploidy": 1})
+    with pytest.raises(
+        ValueError,
+        match="Variable call_genotype must have only a single chunk in the ploidy dimension",
+    ):
+        count_call_alleles(ds)
+
 
 def test_count_cohort_alleles__multi_variant_multi_sample():
     ds = get_dataset(
diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py
index 50fc9bb4c..3014b9b31 100644
--- a/sgkit/tests/test_popgen.py
+++ b/sgkit/tests/test_popgen.py
@@ -533,7 +533,7 @@ def test_Garud_h__raise_on_no_windows():
 
 
 @pytest.mark.filterwarnings("ignore::RuntimeWarning")
-@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))])
+@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4))])
 def test_observed_heterozygosity(chunks):
     ds = simulate_genotype_call_dataset(
         n_variant=4,
@@ -599,7 +599,7 @@ def test_observed_heterozygosity(chunks):
 
 
 @pytest.mark.filterwarnings("ignore::RuntimeWarning")
-@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))])
+@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4,))])
 @pytest.mark.parametrize(
     "cohorts,expectation",
     [