From b6f0bcfc1f5def12555e1da2eda2406fda302c30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Ordyna?= Date: Fri, 14 Jul 2023 14:43:55 +0200 Subject: [PATCH 1/3] make it possible to manually set chunks when loading dask arrays --- docs/source/analysis/dask.rst | 3 + src/binding/python/openpmd_api/DaskArray.py | 102 +++++++++++--------- 2 files changed, 57 insertions(+), 48 deletions(-) diff --git a/docs/source/analysis/dask.rst b/docs/source/analysis/dask.rst index e00dce0e98..3eb403368d 100644 --- a/docs/source/analysis/dask.rst +++ b/docs/source/analysis/dask.rst @@ -41,6 +41,9 @@ The central Python API calls to convert to DASK datatypes are the ``ParticleSpec # note: no series.flush() needed +The ``to_dask_array`` method will automaticaly set dask array chunking based on the availible chunks. +The default behaviour can be overriden by passing an additional keyword argument ``chunks``, see ``dask.array.from_array`` documentation for more details. +For example, to chunk only along the outhermost axis in a 3D dataset using the default dask array chunk size call ``to_dask_array(chunks={0: 'auto', 1: -1, 2: -1})``. Example ------- diff --git a/src/binding/python/openpmd_api/DaskArray.py b/src/binding/python/openpmd_api/DaskArray.py index 8c6fc3001b..84bc9adc23 100644 --- a/src/binding/python/openpmd_api/DaskArray.py +++ b/src/binding/python/openpmd_api/DaskArray.py @@ -1,8 +1,8 @@ """ This file is part of the openPMD-api. -Copyright 2021 openPMD contributors -Authors: Axel Huebl +Copyright 2023 openPMD contributors +Authors: Axel Huebl, Pawel Ordyna License: LGPLv3+ """ import math @@ -49,7 +49,7 @@ def __getitem__(self, slices): return data -def record_component_to_daskarray(record_component): +def record_component_to_daskarray(record_component, chunks=None): """ Load a RecordComponent into a Dask.array. @@ -57,6 +57,10 @@ def record_component_to_daskarray(record_component): ---------- record_component : openpmd_api.Record_Component A record component class in openPMD-api. + chunks : chunks parameter to pass to dask.array.from_array. + See dask documentation for more details. + When set to None (default) the chunking will be automaticaly + determined based on record_component.available_chunks(). Returns ------- @@ -84,54 +88,56 @@ def record_component_to_daskarray(record_component): if not found_dask: raise ImportError("dask NOT found. Install dask for Dask DataFrame " "support.") - - # get optimal chunks - chunks = record_component.available_chunks() - - # sort and prepare the chunks for Dask's array API - # https://docs.dask.org/en/latest/array-chunks.html - # https://docs.dask.org/en/latest/array-api.html?highlight=from_array#other-functions - # sorted and unique - offsets_per_dim = list(map(list, zip(*[chunk.offset for chunk in chunks]))) - offsets_sorted_unique_per_dim = [sorted(set(o)) for o in offsets_per_dim] - - # print("offsets_sorted_unique_per_dim=", - # list(offsets_sorted_unique_per_dim)) - - # case 1: PIConGPU static load balancing (works with Dask assumptions, - # chunk option no. 3) - # all chunks in the same column have the same column width although - # individual columns have different widths - # case 2: AMReX boxes - # all chunks are multiple of a common block size, offsets are a multiple - # of a common blocksize - # problem: too limited description in Dask - # https://github.com/dask/dask/issues/7475 - # work-around: create smaller chunks (this incurs a read cost) by forcing - # into case 1 - # (this can lead to larger blocks than using the gcd of the - # extents aka AMReX block size) - common_chunk_widths_per_dim = list() - for d, offsets_in_dim in enumerate(offsets_sorted_unique_per_dim): - # print("d=", d, offsets_in_dim, record_component.shape[d]) - offsets_in_dim_arr = np.array(offsets_in_dim) - # note: this is in the right order of rows/columns, contrary to a - # sorted extent list from chunks - extents_in_dim = np.zeros_like(offsets_in_dim_arr) - extents_in_dim[:-1] = offsets_in_dim_arr[1:] - extents_in_dim[-1] = record_component.shape[d] - if len(extents_in_dim) > 1: - extents_in_dim[:-1] -= offsets_in_dim_arr[:-1] - extents_in_dim[-1] -= offsets_in_dim_arr[-1] - # print("extents_in_dim=", extents_in_dim) - common_chunk_widths_per_dim.append(tuple(extents_in_dim)) - - common_chunk_widths_per_dim = tuple(common_chunk_widths_per_dim) - # print("common_chunk_widths_per_dim=", common_chunk_widths_per_dim) + if chunks is None: + # get optimal chunks + chunks = record_component.available_chunks() + + # sort and prepare the chunks for Dask's array API + # https://docs.dask.org/en/latest/array-chunks.html + # https://docs.dask.org/en/latest/array-api.html?highlight=from_array#other-functions + # sorted and unique + offsets_per_dim = list( + map(list, zip(*[chunk.offset for chunk in chunks]))) + offsets_sorted_unique_per_dim = [ + sorted(set(o)) for o in offsets_per_dim] + + # print("offsets_sorted_unique_per_dim=", + # list(offsets_sorted_unique_per_dim)) + + # case 1: PIConGPU static load balancing (works with Dask assumptions, + # chunk option no. 3) + # all chunks in the same column have the same column width although + # individual columns have different widths + # case 2: AMReX boxes + # all chunks are multiple of a common block size, offsets + # are a multiple of a common blocksize + # problem: too limited description in Dask + # https://github.com/dask/dask/issues/7475 + # work-around: create smaller chunks (this incurs a read cost) + # by forcing into case 1 + # (this can lead to larger blocks than using + # the gcd of the extents aka AMReX block size) + common_chunk_widths_per_dim = list() + for d, offsets_in_dim in enumerate(offsets_sorted_unique_per_dim): + # print("d=", d, offsets_in_dim, record_component.shape[d]) + offsets_in_dim_arr = np.array(offsets_in_dim) + # note: this is in the right order of rows/columns, contrary to a + # sorted extent list from chunks + extents_in_dim = np.zeros_like(offsets_in_dim_arr) + extents_in_dim[:-1] = offsets_in_dim_arr[1:] + extents_in_dim[-1] = record_component.shape[d] + if len(extents_in_dim) > 1: + extents_in_dim[:-1] -= offsets_in_dim_arr[:-1] + extents_in_dim[-1] -= offsets_in_dim_arr[-1] + # print("extents_in_dim=", extents_in_dim) + common_chunk_widths_per_dim.append(tuple(extents_in_dim)) + + chunks = tuple(common_chunk_widths_per_dim) + # print("chunks=", chunks) da = from_array( DaskRecordComponent(record_component), - chunks=common_chunk_widths_per_dim, + chunks=chunks, # name=None, asarray=True, fancy=False, From acedd469ea2539049ae1596e06e0102eddaa186d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 14 Jul 2023 13:08:08 +0000 Subject: [PATCH 2/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/binding/python/openpmd_api/DaskArray.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/binding/python/openpmd_api/DaskArray.py b/src/binding/python/openpmd_api/DaskArray.py index 84bc9adc23..2ef1ad9dd0 100644 --- a/src/binding/python/openpmd_api/DaskArray.py +++ b/src/binding/python/openpmd_api/DaskArray.py @@ -59,7 +59,7 @@ def record_component_to_daskarray(record_component, chunks=None): A record component class in openPMD-api. chunks : chunks parameter to pass to dask.array.from_array. See dask documentation for more details. - When set to None (default) the chunking will be automaticaly + When set to None (default) the chunking will be automaticaly determined based on record_component.available_chunks(). Returns @@ -113,9 +113,9 @@ def record_component_to_daskarray(record_component, chunks=None): # are a multiple of a common blocksize # problem: too limited description in Dask # https://github.com/dask/dask/issues/7475 - # work-around: create smaller chunks (this incurs a read cost) + # work-around: create smaller chunks (this incurs a read cost) # by forcing into case 1 - # (this can lead to larger blocks than using + # (this can lead to larger blocks than using # the gcd of the extents aka AMReX block size) common_chunk_widths_per_dim = list() for d, offsets_in_dim in enumerate(offsets_sorted_unique_per_dim): From 17cc4de8a7764f3aec076f3265d8d4812ca0747d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Ordyna?= Date: Mon, 14 Aug 2023 11:22:09 +0200 Subject: [PATCH 3/3] Apply suggestions from code review Copyright and typos in setting chunks for dask arrays. Co-authored-by: Axel Huebl --- docs/source/analysis/dask.rst | 6 +++--- src/binding/python/openpmd_api/DaskArray.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/analysis/dask.rst b/docs/source/analysis/dask.rst index 3eb403368d..432bdd5790 100644 --- a/docs/source/analysis/dask.rst +++ b/docs/source/analysis/dask.rst @@ -41,9 +41,9 @@ The central Python API calls to convert to DASK datatypes are the ``ParticleSpec # note: no series.flush() needed -The ``to_dask_array`` method will automaticaly set dask array chunking based on the availible chunks. -The default behaviour can be overriden by passing an additional keyword argument ``chunks``, see ``dask.array.from_array`` documentation for more details. -For example, to chunk only along the outhermost axis in a 3D dataset using the default dask array chunk size call ``to_dask_array(chunks={0: 'auto', 1: -1, 2: -1})``. +The ``to_dask_array`` method will automatically set Dask array chunking based on the available chunks in the read data set. +The default behavior can be overridden by passing an additional keyword argument ``chunks``, see the `dask.array.from_array documentation `__ for more details. +For example, to chunk only along the outermost axis in a 3D dataset using the default Dask array chunk size, call ``to_dask_array(chunks={0: 'auto', 1: -1, 2: -1})``. Example ------- diff --git a/src/binding/python/openpmd_api/DaskArray.py b/src/binding/python/openpmd_api/DaskArray.py index 2ef1ad9dd0..709b668be8 100644 --- a/src/binding/python/openpmd_api/DaskArray.py +++ b/src/binding/python/openpmd_api/DaskArray.py @@ -1,7 +1,7 @@ """ This file is part of the openPMD-api. -Copyright 2023 openPMD contributors +Copyright 2021-2023 openPMD contributors Authors: Axel Huebl, Pawel Ordyna License: LGPLv3+ """