From 517db29ca8d1ec75ade1f438163b32f166ecb61d Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 14 Aug 2024 14:22:34 +0100 Subject: [PATCH 1/7] Add a --use-cubed option to pytest --- conftest.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/conftest.py b/conftest.py index bcccb6e8a..9343393f3 100644 --- a/conftest.py +++ b/conftest.py @@ -2,6 +2,30 @@ collect_ignore_glob = ["benchmarks/**", "sgkit/io/vcf/*.py", ".github/scripts/*.py"] +def pytest_addoption(parser): + parser.addoption( + "--use-cubed", action="store_true", default=False, help="run with cubed" + ) + + +def use_cubed(): + import dask + import xarray as xr + + # set xarray to use cubed by default + xr.set_options(chunk_manager="cubed") + + # ensure that dask compute raises if it is ever called + class AlwaysRaiseScheduler: + def __call__(self, dsk, keys, **kwargs): + raise RuntimeError("Dask 'compute' was called") + + dask.config.set(scheduler=AlwaysRaiseScheduler()) + + def pytest_configure(config) -> None: # type: ignore # Add "gpu" marker config.addinivalue_line("markers", "gpu:Run tests that run on GPU") + + if config.getoption("--use-cubed"): + use_cubed() From 3637f0342f66eff9bdc453ef97dc13711bb39695 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 3 Sep 2024 13:37:20 +0100 Subject: [PATCH 2/7] Introduce sgkit.distarray to switch between Dask and Cubed --- sgkit/distarray.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 sgkit/distarray.py diff --git a/sgkit/distarray.py b/sgkit/distarray.py new file mode 100644 index 000000000..ddd914ae4 --- /dev/null +++ b/sgkit/distarray.py @@ -0,0 +1,10 @@ +from xarray.namedarray.parallelcompat import guess_chunkmanager + +# use the xarray chunk manager to determine the distributed array module to use +cm = guess_chunkmanager(None) + +if cm.array_cls.__module__.split(".")[0] == "cubed": + from cubed import * # noqa: F401, F403 +else: + # default to dask + from dask.array import * # noqa: F401, F403 From 28f58a49f4ad67b0952e455d99efef1409802f99 Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 3 Sep 2024 08:34:59 +0100 Subject: [PATCH 3/7] Use sgkit.distarray for count_call_alleles --- sgkit/stats/aggregation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index 41adcc221..fe197c3bf 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -1,11 +1,11 @@ from typing import Hashable -import dask.array as da import numpy as np import xarray as xr from typing_extensions import Literal from xarray import Dataset +import sgkit.distarray as da from sgkit import variables from sgkit.display import genotype_as_bytes from sgkit.utils import ( @@ -85,7 +85,13 @@ def count_call_alleles( variables.call_allele_count: ( ("variants", "samples", "alleles"), da.map_blocks( - count_alleles, G, N, chunks=shape, drop_axis=2, new_axis=2 + count_alleles, + G, + N, + chunks=shape, + dtype=np.uint8, + drop_axis=2, + new_axis=2, ), ) } From 899c7d9a58c72b3b5d29d45ad2a2b7357ed6174a Mon Sep 17 00:00:00 2001 From: Tom White Date: Wed, 14 Aug 2024 14:23:18 +0100 Subject: [PATCH 4/7] Only rechunk in non-core dims in test_count_call_alleles__chunked --- sgkit/tests/test_aggregation.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py index a5e68645e..34df9eddd 100644 --- a/sgkit/tests/test_aggregation.py +++ b/sgkit/tests/test_aggregation.py @@ -265,10 +265,12 @@ def test_count_call_alleles__chunked(): calls = rs.randint(0, 1, size=(50, 10, 2)) ds = get_dataset(calls) ac1 = count_call_alleles(ds) - # Coerce from numpy to multiple chunks in all dimensions - ds["call_genotype"] = ds["call_genotype"].chunk(chunks=(5, 5, 1)) + # Coerce from numpy to multiple chunks in all non-core dimensions + ds["call_genotype"] = ds["call_genotype"].chunk( + chunks={"variants": 5, "samples": 5} + ) ac2 = count_call_alleles(ds) - assert isinstance(ac2["call_allele_count"].data, da.Array) + assert hasattr(ac2["call_allele_count"].data, "chunks") xr.testing.assert_equal(ac1, ac2) From 0ec6634fe28bbe00749bcde43fe5022e53f6985a Mon Sep 17 00:00:00 2001 From: Tom White Date: Fri, 6 Sep 2024 13:16:24 +0100 Subject: [PATCH 5/7] Fix coverage --- sgkit/distarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sgkit/distarray.py b/sgkit/distarray.py index ddd914ae4..fe3bda790 100644 --- a/sgkit/distarray.py +++ b/sgkit/distarray.py @@ -4,7 +4,7 @@ cm = guess_chunkmanager(None) if cm.array_cls.__module__.split(".")[0] == "cubed": - from cubed import * # noqa: F401, F403 + from cubed import * # pragma: no cover # noqa: F401, F403 else: # default to dask from dask.array import * # noqa: F401, F403 From b1dd259b178e5dad1c34be08d62a4d29084df2a3 Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 19 Aug 2024 15:44:12 +0100 Subject: [PATCH 6/7] Add GH actions workflow to test on Cubed --- .github/workflows/cubed.yml | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/cubed.yml diff --git a/.github/workflows/cubed.yml b/.github/workflows/cubed.yml new file mode 100644 index 000000000..2b9cde4e8 --- /dev/null +++ b/.github/workflows/cubed.yml @@ -0,0 +1,33 @@ +name: Cubed + +on: + push: + pull_request: + # manual trigger + workflow_dispatch: + +jobs: + build: + # This workflow only runs on the origin org + # if: github.repository_owner == 'sgkit-dev' + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.11"] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install deps and sgkit + run: | + python -m pip install --upgrade pip + python -m pip install -r requirements.txt -r requirements-dev.txt + python -m pip install -U git+https://github.com/cubed-dev/cubed.git -U git+https://github.com/cubed-dev/cubed-xarray.git -U git+https://github.com/pydata/xarray.git + + - name: Test with pytest + run: | + pytest -v sgkit/tests/test_aggregation.py -k "test_count_call_alleles" --use-cubed From 471d8378be57f88bb5b31fc3471159268e4b25df Mon Sep 17 00:00:00 2001 From: Tom White Date: Mon, 9 Sep 2024 12:14:29 +0100 Subject: [PATCH 7/7] Don't allow chunking in ploidy dimension --- sgkit/stats/aggregation.py | 5 +++++ sgkit/tests/test_aggregation.py | 14 ++++++++++++-- sgkit/tests/test_popgen.py | 4 ++-- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/sgkit/stats/aggregation.py b/sgkit/stats/aggregation.py index fe197c3bf..f76dbe691 100644 --- a/sgkit/stats/aggregation.py +++ b/sgkit/stats/aggregation.py @@ -77,6 +77,11 @@ def count_call_alleles( variables.validate(ds, {call_genotype: variables.call_genotype_spec}) n_alleles = ds.sizes["alleles"] G = da.asarray(ds[call_genotype]) + if G.numblocks[2] > 1: + raise ValueError( + f"Variable {call_genotype} must have only a single chunk in the ploidy dimension. " + "Consider rechunking to change the size of chunks." + ) shape = (G.chunks[0], G.chunks[1], n_alleles) # use numpy array to avoid dask task dependencies between chunks N = np.empty(n_alleles, dtype=np.uint8) diff --git a/sgkit/tests/test_aggregation.py b/sgkit/tests/test_aggregation.py index 34df9eddd..e2719c1c1 100644 --- a/sgkit/tests/test_aggregation.py +++ b/sgkit/tests/test_aggregation.py @@ -139,8 +139,10 @@ def test_count_variant_alleles__chunked(using): calls = rs.randint(0, 1, size=(50, 10, 2)) ds = get_dataset(calls) ac1 = count_variant_alleles(ds, using=using) - # Coerce from numpy to multiple chunks in all dimensions - ds["call_genotype"] = ds["call_genotype"].chunk(chunks=(5, 5, 1)) + # Coerce from numpy to multiple chunks in all non-core dimensions + ds["call_genotype"] = ds["call_genotype"].chunk( + chunks={"variants": 5, "samples": 5} + ) ac2 = count_variant_alleles(ds, using=using) assert isinstance(ac2["variant_allele_count"].data, da.Array) xr.testing.assert_equal(ac1, ac2) @@ -273,6 +275,14 @@ def test_count_call_alleles__chunked(): assert hasattr(ac2["call_allele_count"].data, "chunks") xr.testing.assert_equal(ac1, ac2) + # Multiple chunks in core dimension should fail + ds["call_genotype"] = ds["call_genotype"].chunk(chunks={"ploidy": 1}) + with pytest.raises( + ValueError, + match="Variable call_genotype must have only a single chunk in the ploidy dimension", + ): + count_call_alleles(ds) + def test_count_cohort_alleles__multi_variant_multi_sample(): ds = get_dataset( diff --git a/sgkit/tests/test_popgen.py b/sgkit/tests/test_popgen.py index 50fc9bb4c..3014b9b31 100644 --- a/sgkit/tests/test_popgen.py +++ b/sgkit/tests/test_popgen.py @@ -533,7 +533,7 @@ def test_Garud_h__raise_on_no_windows(): @pytest.mark.filterwarnings("ignore::RuntimeWarning") -@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))]) +@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4))]) def test_observed_heterozygosity(chunks): ds = simulate_genotype_call_dataset( n_variant=4, @@ -599,7 +599,7 @@ def test_observed_heterozygosity(chunks): @pytest.mark.filterwarnings("ignore::RuntimeWarning") -@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (2, 2))]) +@pytest.mark.parametrize("chunks", [((4,), (6,), (4,)), ((2, 2), (3, 3), (4,))]) @pytest.mark.parametrize( "cohorts,expectation", [