From 5cdf6c8bf1129c307578398fd0903fc7f52f7137 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:18:38 -0400 Subject: [PATCH 01/68] added HiddenKeyDict class --- xarray/core/utils.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 89d1462328c..cccefe3d3c6 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -489,3 +489,38 @@ def ensure_us_time_resolution(val): elif np.issubdtype(val.dtype, np.timedelta64): val = val.astype('timedelta64[us]') return val + + +class HiddenKeyDict(MutableMapping): + ''' + Acts like a normal dictionary, but hides certain keys. + ''' + # ``__init__`` method required to create instance from class. + def __init__(self, data, *hidden_keys): + self._data = data + self._hidden_keys = hidden_keys + + def _raise_if_hidden(self, key): + if key in self._hidden_keys: + raise KeyError('Key is hidden.') + + # The next five methods are requirements of the ABC. + def __setitem__(self, key, value): + self._raise_if_hidden(key) + self._data[key] = value + + def __getitem__(self, key): + self._raise_if_hidden(key) + return self._data[key] + + def __delitem__(self, key): + self._raise_if_hidden(key) + del self._data[key] + + def __iter__(self): + for k in self._data: + if k not in self._hidden_keys: + yield k + + def __len__(self): + return len(list(self.__iter__())) From f305c25bfe59390bf4f43ede55001a49a189182b Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:19:11 -0400 Subject: [PATCH 02/68] new zarr backend --- xarray/backends/zarr.py | 172 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 xarray/backends/zarr.py diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py new file mode 100644 index 00000000000..bc3be1306d2 --- /dev/null +++ b/xarray/backends/zarr.py @@ -0,0 +1,172 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import functools +import warnings +from itertools import product +from collections import MutableMapping + +from .. import Variable +from ..core import indexing +from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict +from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict + +from .common import (WritableCFDataStore, AbstractWritableDataStore, + DataStorePickleMixin) + + + + +# most of the other stores have some kind of wrapper class like +# class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): +# class H5NetCDFArrayWrapper(BaseNetCDF4Array): +# class NioArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): +# we problaby need something like this + +# the first question is whether it should be based on BaseNetCDF4Array or +# NdimSizeLenMixing? + +# or maybe we don't need wrappers at all? 
probably not true + + +# also most have a custom opener + +# keyword args for zarr.group +# store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None +# the group name is called "path" in the zarr lexicon + +def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path): + import zarr + zarr_group = zarr.group(store=store, overwrite=overwrite, + chunk_store=chunk_store, synchronizer=synchronizer, path=path) + return zarr_group + + +def _dask_chunks_to_zarr_chunks(chunks): + # zarr chunks needs to be uniform for each array + # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + # dask chunks can be variable sized + # http://dask.pydata.org/en/latest/array-design.html#chunks + # this function dask chunks syntax to zarr chunks + if chunks is None: + return chunks + + all_chunks = product(*chunks) + first_chunk = all_chunks.next() + for this_chunk in all_chunks: + if not (this_chunk == first_chunk): + raise ValueError("zarr requires uniform chunk sizes, found %s" % + repr(chunks)) + return first_chunk + + +def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): + # Zarr arrays do not have dimenions. To get around this problem, we add + # an attribute that specifies the dimension. We have to hide this attribute + # when we send the attributes to the user. + # zarr_obj can be either a zarr group or zarr array + dimensions = zarr_obj.attrs.get(dimension_key) + attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) + return dimensions, attributes + + +class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): + """Store for reading and writing data via zarr + """ + + # need some special secret attributes to tell us the dimensions + _dimension_key = '_XARRAY_DIMENSIONS' + + def __init__(self, store=None, overwrite=False, chunk_store=None, + synchronizer=None, path=None, writer=None, autoclose=False): + opener = functools.partial(_open_zarr_group, store, overwrite, + chunk_store, synchronizer, path) + self.ds = opener() + if autoclose: + raise NotImplementedError('autoclose=True is not implemented ' + 'for the zarr backend') + self._autoclose = False + self._isopen = True + self._opener = opener + + # initialize hidden dimension attribute + self.ds.attrs[self._dimension_key] = {} + + # do we need to define attributes for all of the opener keyword args? 
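+        # Illustrative sketch (invented values): after calls like
+        # ``set_dimension('x', 5)`` and ``set_dimension('t', 100)`` below,
+        # the hidden attribute is meant to read
+        # ``self.ds.attrs['_XARRAY_DIMENSIONS'] == {'x': 5, 't': 100}``,
+        # while ``get_attrs()`` still hides that key from users.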
+ super(ZarrStore, self).__init__(writer) + + def open_store_variable(self, name, zarr_array): + # I don't see why it is necessary to wrap self.ds[name] + # zarr seems to implement the required ndarray interface + # TODO: possibly wrap zarr array in dask with aligned chunks + data = indexing.LazilyIndexedArray(zarr_array) + dimensions, attributes = _get_zarr_dims_and_attrs( + zarr_array, self._dimension_key) + return Variable(dimensions, data, attributes) + + def get_variables(self): + with self.ensure_open(autoclose=False): + return FrozenOrderedDict((k, self.open_store_variable(k, v)) + for k, v in self.ds.arrays()) + + def get_attrs(self): + with self.ensure_open(autoclose=True): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attrs = FrozenOrderedDict(attributes) + return attrs + + def get_dimensions(self): + with self.ensure_open(autoclose=True): + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + return dimensions + + def set_dimension(self, name, length): + with self.ensure_open(autoclose=False): + self.ds.attrs[self._dimension_key][name] = length + + def set_attribute(self, key, value): + with self.ensure_open(autoclose=False): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attributes[key] = value + + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): + + attrs = variable.attrs.copy() + dims = variable.dims + dtype = variable.dtype + shape = variable.shape + chunks = _dask_chunks_to_zarr_chunks(variable.chunks) + + # TODO: figure ouw how zarr should deal with unlimited dimensions + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) + + # let's try keeping this fill value stuff + fill_value = attrs.pop('_FillValue', None) + if fill_value in ['\x00']: + fill_value = None + + # TODO: figure out what encoding is needed for zarr + + ### arguments for zarr.create + # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', + # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, + # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) + + # TODO: figure out how to pass along all those other arguments + + zarr_array = self.ds.create(name, shape=shape, dtype=dtype, + chunks=chunks, fill_value=fill_value) + zarr_array.attrs[self._dimension_key] = dims + _, attributes = _get_zarr_dims_and_attrs(zarr_array, + self._dimension_key) + + for k, v in iteritems(attrs): + attributes[k] = v + + return zarr_array, variable.data + + # sync() and close() methods should not be needed with zarr From 2ea21c562125eb54b54be9c539cb61e8e02c8bfb Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:18:38 -0400 Subject: [PATCH 03/68] added HiddenKeyDict class --- xarray/core/utils.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 89d1462328c..cccefe3d3c6 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -489,3 +489,38 @@ def ensure_us_time_resolution(val): elif np.issubdtype(val.dtype, np.timedelta64): val = val.astype('timedelta64[us]') return val + + +class HiddenKeyDict(MutableMapping): + ''' + Acts like a normal dictionary, but hides certain keys. + ''' + # ``__init__`` method required to create instance from class. 
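+    # Hypothetical usage sketch (values invented for illustration):
+    #
+    #     d = HiddenKeyDict({'a': 1, 'b': 2}, 'b')
+    #     list(d)   # -> ['a']    ('b' is skipped by __iter__)
+    #     d['b']    # raises KeyError('Key is hidden.')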
+ def __init__(self, data, *hidden_keys): + self._data = data + self._hidden_keys = hidden_keys + + def _raise_if_hidden(self, key): + if key in self._hidden_keys: + raise KeyError('Key is hidden.') + + # The next five methods are requirements of the ABC. + def __setitem__(self, key, value): + self._raise_if_hidden(key) + self._data[key] = value + + def __getitem__(self, key): + self._raise_if_hidden(key) + return self._data[key] + + def __delitem__(self, key): + self._raise_if_hidden(key) + del self._data[key] + + def __iter__(self): + for k in self._data: + if k not in self._hidden_keys: + yield k + + def __len__(self): + return len(list(self.__iter__())) From d92bf2ff3c736e8323706e2f4c7b8d6cc1946dc8 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:19:11 -0400 Subject: [PATCH 04/68] new zarr backend --- xarray/backends/zarr.py | 172 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 xarray/backends/zarr.py diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py new file mode 100644 index 00000000000..bc3be1306d2 --- /dev/null +++ b/xarray/backends/zarr.py @@ -0,0 +1,172 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import functools +import warnings +from itertools import product +from collections import MutableMapping + +from .. import Variable +from ..core import indexing +from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict +from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict + +from .common import (WritableCFDataStore, AbstractWritableDataStore, + DataStorePickleMixin) + + + + +# most of the other stores have some kind of wrapper class like +# class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): +# class H5NetCDFArrayWrapper(BaseNetCDF4Array): +# class NioArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): +# we problaby need something like this + +# the first question is whether it should be based on BaseNetCDF4Array or +# NdimSizeLenMixing? + +# or maybe we don't need wrappers at all? probably not true + + +# also most have a custom opener + +# keyword args for zarr.group +# store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None +# the group name is called "path" in the zarr lexicon + +def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path): + import zarr + zarr_group = zarr.group(store=store, overwrite=overwrite, + chunk_store=chunk_store, synchronizer=synchronizer, path=path) + return zarr_group + + +def _dask_chunks_to_zarr_chunks(chunks): + # zarr chunks needs to be uniform for each array + # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + # dask chunks can be variable sized + # http://dask.pydata.org/en/latest/array-design.html#chunks + # this function dask chunks syntax to zarr chunks + if chunks is None: + return chunks + + all_chunks = product(*chunks) + first_chunk = all_chunks.next() + for this_chunk in all_chunks: + if not (this_chunk == first_chunk): + raise ValueError("zarr requires uniform chunk sizes, found %s" % + repr(chunks)) + return first_chunk + + +def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): + # Zarr arrays do not have dimenions. To get around this problem, we add + # an attribute that specifies the dimension. We have to hide this attribute + # when we send the attributes to the user. 
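+    # For example (hypothetical attrs), given
+    #     zarr_obj.attrs == {'_XARRAY_DIMENSIONS': ['x', 'y'], 'units': 'm'}
+    # this returns dimensions == ['x', 'y'] and an attributes mapping that
+    # exposes only 'units'.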
+ # zarr_obj can be either a zarr group or zarr array + dimensions = zarr_obj.attrs.get(dimension_key) + attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) + return dimensions, attributes + + +class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): + """Store for reading and writing data via zarr + """ + + # need some special secret attributes to tell us the dimensions + _dimension_key = '_XARRAY_DIMENSIONS' + + def __init__(self, store=None, overwrite=False, chunk_store=None, + synchronizer=None, path=None, writer=None, autoclose=False): + opener = functools.partial(_open_zarr_group, store, overwrite, + chunk_store, synchronizer, path) + self.ds = opener() + if autoclose: + raise NotImplementedError('autoclose=True is not implemented ' + 'for the zarr backend') + self._autoclose = False + self._isopen = True + self._opener = opener + + # initialize hidden dimension attribute + self.ds.attrs[self._dimension_key] = {} + + # do we need to define attributes for all of the opener keyword args? + super(ZarrStore, self).__init__(writer) + + def open_store_variable(self, name, zarr_array): + # I don't see why it is necessary to wrap self.ds[name] + # zarr seems to implement the required ndarray interface + # TODO: possibly wrap zarr array in dask with aligned chunks + data = indexing.LazilyIndexedArray(zarr_array) + dimensions, attributes = _get_zarr_dims_and_attrs( + zarr_array, self._dimension_key) + return Variable(dimensions, data, attributes) + + def get_variables(self): + with self.ensure_open(autoclose=False): + return FrozenOrderedDict((k, self.open_store_variable(k, v)) + for k, v in self.ds.arrays()) + + def get_attrs(self): + with self.ensure_open(autoclose=True): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attrs = FrozenOrderedDict(attributes) + return attrs + + def get_dimensions(self): + with self.ensure_open(autoclose=True): + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + return dimensions + + def set_dimension(self, name, length): + with self.ensure_open(autoclose=False): + self.ds.attrs[self._dimension_key][name] = length + + def set_attribute(self, key, value): + with self.ensure_open(autoclose=False): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attributes[key] = value + + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): + + attrs = variable.attrs.copy() + dims = variable.dims + dtype = variable.dtype + shape = variable.shape + chunks = _dask_chunks_to_zarr_chunks(variable.chunks) + + # TODO: figure ouw how zarr should deal with unlimited dimensions + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) + + # let's try keeping this fill value stuff + fill_value = attrs.pop('_FillValue', None) + if fill_value in ['\x00']: + fill_value = None + + # TODO: figure out what encoding is needed for zarr + + ### arguments for zarr.create + # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', + # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, + # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) + + # TODO: figure out how to pass along all those other arguments + + zarr_array = self.ds.create(name, shape=shape, dtype=dtype, + chunks=chunks, fill_value=fill_value) + zarr_array.attrs[self._dimension_key] = dims + _, attributes = _get_zarr_dims_and_attrs(zarr_array, + self._dimension_key) + + for k, v in iteritems(attrs): + attributes[k] = v + + return 
zarr_array, variable.data + + # sync() and close() methods should not be needed with zarr From 79da971f1c8b5ae493173670a0ceb16a2eb3402f Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 16:08:35 -0700 Subject: [PATCH 05/68] add zarr to ci reqs --- ci/requirements-py27-cdat+pynio.yml | 1 + ci/requirements-py27-windows.yml | 1 + ci/requirements-py35.yml | 1 + ci/requirements-py36-windows.yml | 1 + ci/requirements-py36.yml | 1 + 5 files changed, 5 insertions(+) diff --git a/ci/requirements-py27-cdat+pynio.yml b/ci/requirements-py27-cdat+pynio.yml index 113714cbfd6..a9a50bf345d 100644 --- a/ci/requirements-py27-cdat+pynio.yml +++ b/ci/requirements-py27-cdat+pynio.yml @@ -19,6 +19,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py27-windows.yml b/ci/requirements-py27-windows.yml index cfd3d4262cc..a5e72c3fc4f 100644 --- a/ci/requirements-py27-windows.yml +++ b/ci/requirements-py27-windows.yml @@ -16,3 +16,4 @@ dependencies: - seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py35.yml b/ci/requirements-py35.yml index 1c7a4558c91..48643b75243 100644 --- a/ci/requirements-py35.yml +++ b/ci/requirements-py35.yml @@ -16,6 +16,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py36-windows.yml b/ci/requirements-py36-windows.yml index 70ff3e50a1b..ea366bd04f7 100644 --- a/ci/requirements-py36-windows.yml +++ b/ci/requirements-py36-windows.yml @@ -16,3 +16,4 @@ dependencies: - seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index 3022c1a0886..4ed629ec9c0 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -16,6 +16,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov From 31e4409c03a001da850de6c140ccb42260346cd9 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 16:15:33 -0700 Subject: [PATCH 06/68] add zarr api to docs --- doc/api.rst | 2 ++ doc/installing.rst | 1 + doc/io.rst | 12 ++++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 433aa93c9de..57ad8a6f86d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -419,7 +419,9 @@ Dataset methods open_dataset open_mfdataset open_rasterio + open_zarr Dataset.to_netcdf + Dataset.to_zarr save_mfdataset Dataset.to_array Dataset.to_dataframe diff --git a/doc/installing.rst b/doc/installing.rst index b3ec68f12cf..5edb0429bd8 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -24,6 +24,7 @@ For netCDF and IO reading and writing netCDF4 files that does not use the netCDF-C libraries - `pynio `__: for reading GRIB and other geoscience specific file formats +- `zarr Date: Thu, 5 Oct 2017 17:47:30 -0700 Subject: [PATCH 07/68] some zarr tests passing --- xarray/__init__.py | 1 + xarray/backends/__init__.py | 1 + xarray/backends/zarr.py | 87 +++++++++++++++++++++++++++++++++-- xarray/tests/test_backends.py | 15 ++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 654ed77b28a..9bfe569995f 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -18,6 +18,7 @@ from .backends.api import (open_dataset, open_dataarray, open_mfdataset, save_mfdataset) from .backends.rasterio_ import open_rasterio +from .backends.zarr import open_zarr from .conventions import decode_cf diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py index a082bd53e5e..a8a4afc359a 100644 --- 
a/xarray/backends/__init__.py +++ b/xarray/backends/__init__.py @@ -10,3 +10,4 @@ from .pynio_ import NioDataStore from .scipy_ import ScipyDataStore from .h5netcdf_ import H5NetCDFStore +from .zarr import ZarrStore diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bc3be1306d2..ebe5a59dc70 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -14,8 +14,7 @@ from .common import (WritableCFDataStore, AbstractWritableDataStore, DataStorePickleMixin) - - +from .. import conventions # most of the other stores have some kind of wrapper class like # class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): @@ -52,7 +51,7 @@ def _dask_chunks_to_zarr_chunks(chunks): return chunks all_chunks = product(*chunks) - first_chunk = all_chunks.next() + first_chunk = next(all_chunks) for this_chunk in all_chunks: if not (this_chunk == first_chunk): raise ValueError("zarr requires uniform chunk sizes, found %s" % @@ -70,7 +69,8 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes -class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): +class ZarrStore(WritableCFDataStore, DataStorePickleMixin): +#class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr """ @@ -170,3 +170,82 @@ def prepare_variable(self, name, variable, check_encoding=False, return zarr_array, variable.data # sync() and close() methods should not be needed with zarr + + +def open_zarr(store, decode_cf=True, + mask_and_scale=True, decode_times=True, autoclose=False, + concat_characters=True, decode_coords=True, + cache=None, drop_variables=None): + """Load and decode a dataset from a file or file-like object. + + Parameters + ---------- + store : MutableMapping or str + Store or path to directory in file system. + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. + decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + autoclose : bool, optional + If True, automatically close files to avoid OS Error of too many files + being open. However, this option doesn't work with streams, e.g., + BytesIO. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + decode_coords : bool, optional + If True, decode the 'coordinates' attribute to identify coordinates in + the resulting dataset. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. 
Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + drop_variables: string or iterable, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + Returns + ------- + dataset : Dataset + The newly created dataset. + + See Also + -------- + open_dataset + """ + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + + def maybe_decode_store(store, lock=False): + ds = conventions.decode_cf( + store, mask_and_scale=mask_and_scale, decode_times=decode_times, + concat_characters=concat_characters, decode_coords=decode_coords, + drop_variables=drop_variables) + + # this is how we would apply caching + # but do we want it for zarr stores? + #_protect_dataset_variables_inplace(ds, cache) + + return ds + + zarr_store = ZarrStore(store=store) + return maybe_decode_store(zarr_store) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c7bf5349c7a..619e13a7735 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -881,6 +881,21 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True +class ZarrDataTest(CFEncodedDataTest, TestCase): + @contextlib.contextmanager + def create_store(self): + with create_tmp_file(suffix='.zarr') as tmp: + yield backends.ZarrStore(store=tmp) + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + with create_tmp_file(suffix='.zarr') as tmp: + zs = backends.ZarrStore(store=tmp) + data.dump_to_store(zs) + yield xr.open_zarr(tmp) + + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, Only32BitTypes, TestCase): @contextlib.contextmanager From 3f013655648607790a898a0af18f503dc1df95e6 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 21:07:22 -0700 Subject: [PATCH 08/68] requires zarr decorator --- xarray/tests/__init__.py | 1 + xarray/tests/test_backends.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 0ef32601b77..a916145bfb6 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -54,6 +54,7 @@ def _importorskip(modname, minversion=None): has_bottleneck, requires_bottleneck = _importorskip('bottleneck') has_rasterio, requires_rasterio = _importorskip('rasterio') has_pathlib, requires_pathlib = _importorskip('pathlib') +has_zarr, requires_zarr = _importorskip('zarr') # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 619e13a7735..cbcd6eeb364 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -26,9 +26,9 @@ from . 
import (TestCase, requires_scipy, requires_netCDF4, requires_pydap, requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf, - requires_pynio, requires_pathlib, has_netCDF4, has_scipy, - assert_allclose, flaky, network, requires_rasterio, - assert_identical) + requires_pynio, requires_pathlib, requires_zarr, + requires_rasterio, has_netCDF4, has_scipy, assert_allclose, + flaky, network, assert_identical) from .test_dataset import create_test_data from xarray.tests import mock @@ -881,6 +881,7 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True +@requires_zarr class ZarrDataTest(CFEncodedDataTest, TestCase): @contextlib.contextmanager def create_store(self): From fd9fd0fbd008647bfcc34d3325c656ae4312a546 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 14:51:29 -0700 Subject: [PATCH 09/68] wip --- xarray/backends/api.py | 27 +++ xarray/backends/common.py | 8 + xarray/backends/zarr.py | 302 +++++++++++++++++++++++++++++----- xarray/core/dataset.py | 28 ++++ xarray/tests/test_backends.py | 10 +- 5 files changed, 332 insertions(+), 43 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e5a3136f0ca..2394abf4370 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -678,3 +678,30 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None, finally: for store in stores: store.close() + + +def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, + encoding=None): + """This function creates an appropriate datastore for writing a dataset to + disk a zarr ztore + + See `Dataset.to_zarr` for full API docs. + """ + if isinstance(store, path_type): + store = str(store) + if encoding is None: + encoding = {} + + # validate Dataset keys, DataArray names, and attr keys/values + _validate_dataset_names(dataset) + _validate_attrs(dataset) + + store = backends.ZarrStore(store=store, mode=mode, + synchronizer=synchronizer, group=group, + writer=None) + + # I think zarr stores should always be sync'd immediately + # TODO: figure out how to properly handle unlimited_dims + print("to_zarr encoding", encoding) + dataset.dump_to_store(store, sync=True, encoding=encoding) + return store diff --git a/xarray/backends/common.py b/xarray/backends/common.py index cec55d22589..185d9ee86e0 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -241,7 +241,15 @@ class WritableCFDataStore(AbstractWritableDataStore): def store(self, variables, attributes, *args, **kwargs): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. + #print('Raw variable encoding:', + # {name: var.encoding for name, var in variables.items()}) + #print('Raw variable attrs:', + # {name: var.attrs for name, var in variables.items()}) cf_variables, cf_attrs = cf_encoder(variables, attributes) + #print('cf_variable encoding:', + # {name: var.encoding for name, var in cf_variables.items()}) + #print('cf_variable attrs:', + # {name: var.attrs for name, var in cf_variables.items()}) AbstractWritableDataStore.store(self, cf_variables, cf_attrs, *args, **kwargs) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 47763d4e1bb..36a54b845ab 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -5,11 +5,14 @@ import warnings from itertools import product from collections import MutableMapping +import operator from .. 
import Variable from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict -from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict +from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, + NdimSizeLenMixin,DunderArrayMixin) +from ..core.pycompat import (iteritems, bytes_type, unicode_type, OrderedDict, + basestring) from .common import (WritableCFDataStore, AbstractWritableDataStore, DataStorePickleMixin) @@ -27,25 +30,67 @@ # or maybe we don't need wrappers at all? probably not true - -# also most have a custom opener +class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): + def __init__(self, variable_name, datastore): + self.datastore = datastore + self.variable_name = variable_name + + array = self.get_array() + self.shape = array.shape + + dtype = array.dtype + if dtype is str: + # use object dtype because that's the only way in numpy to + # represent variable length strings; it also prevents automatic + # string concatenation via conventions.decode_cf_variable + dtype = np.dtype('O') + self.dtype = dtype + + def get_array(self): + return self.datastore.ds[self.variable_name] + + def __getitem__(self, key): + # TODO: do we want to use robust_getitem for certain types of + # zarr store (e.g. S3)? + #if self.datastore.is_remote: # pragma: no cover + # getitem = functools.partial(robust_getitem, catch=RuntimeError) + #else: + getitem = operator.getitem + try: + data = getitem(self.get_array(), key) + except IndexError: + # Catch IndexError in netCDF4 and return a more informative + # error message. This is most often called when an unsorted + # indexer is used before the data is loaded from disk. + msg = ('The indexing operation you are attempting to perform ' + 'is not valid on zarr.core.Array object. 
Try loading '
+                   'your data into memory first by calling .load().')
+            if not PY3:
+                import traceback
+                msg += '\n\nOriginal traceback:\n' + traceback.format_exc()
+            raise IndexError(msg)
+        return data
+
+    # if self.ndim == 0:
+    # could possibly have a work-around for 0d data here
 
 # keyword args for zarr.group
 # store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None
 # the group name is called "path" in the zarr lexicon
 
-def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path):
+# args for zarr.open_group
+# store=None, mode='a', synchronizer=None, path=None
+
+def _open_zarr_group(store, mode, synchronizer, group):
     import zarr
-    zarr_group = zarr.group(store=store, overwrite=overwrite,
-        chunk_store=chunk_store, synchronizer=synchronizer, path=path)
+    # zarr_group = zarr.group(store=store, overwrite=overwrite,
+    #     chunk_store=chunk_store, synchronizer=synchronizer, path=path)
+    zarr_group = zarr.open_group(store=store, mode=mode,
+                                 synchronizer=synchronizer, path=group)
     return zarr_group
 
 
 def _dask_chunks_to_zarr_chunks(chunks):
-    # zarr chunks needs to be uniform for each array
-    # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks
-    # dask chunks can be variable sized
-    # http://dask.pydata.org/en/latest/array-design.html#chunks
     # this function converts dask chunk syntax to zarr chunks
     if chunks is None:
         return chunks
@@ -54,10 +99,85 @@ def _dask_chunks_to_zarr_chunks(chunks):
     first_chunk = next(all_chunks)
     for this_chunk in all_chunks:
         if not (this_chunk == first_chunk):
-            raise ValueError("zarr requires uniform chunk sizes, found %s" %
-                             repr(chunks))
+            raise ValueError("zarr requires uniform chunk sizes, found %r" %
+                             chunks)
     return first_chunk
 
+
+def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
+    """
+    Given encoding chunks (possibly None) and variable chunks (possibly
+    None), determine the chunk tuple to use when creating the zarr array.
+    """
+
+    # zarr chunk spec:
+    # chunks : int or tuple of ints, optional
+    #   Chunk shape. If not provided, will be guessed from shape and dtype.
+
+    # if there are no chunks in encoding and the variable data is a numpy
+    # array, then we let zarr use its own heuristics to pick the chunks
+    if var_chunks is None and enc_chunks is None:
+        return None
+
+    # if there are no chunks in encoding but there are dask chunks, we try to
+    # use the same chunks in zarr
+    # However, zarr chunks need to be uniform for each array
+    # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks
+    # while dask chunks can be variable sized
+    # http://dask.pydata.org/en/latest/array-design.html#chunks
+    if var_chunks and enc_chunks is None:
+        all_var_chunks = product(*var_chunks)
+        first_var_chunk = next(all_var_chunks)
+        for this_chunk in all_var_chunks:
+            if not (this_chunk == first_var_chunk):
+                raise ValueError("zarr requires uniform chunk sizes, but "
+                                 "variable has non-uniform chunks %r. "
+                                 "Consider rechunking the data using "
+                                 "`chunk()`." % var_chunks)
+        return first_var_chunk
+
+    # from here on, we are dealing with user-specified chunks in encoding
+    # zarr allows chunks to be an integer, in which case it uses the same
+    # chunk size on each dimension.
+    # Here we re-implement this expansion ourselves. That makes the logic of
+    # checking chunk compatibility easier
+
+    # this coerces a single int to a tuple but leaves a tuple as is
+    enc_chunks_tuple = tuple(enc_chunks)
+    if len(enc_chunks_tuple) == 1:
+        enc_chunks_tuple = ndim * enc_chunks_tuple
+
+    if not len(enc_chunks_tuple) == ndim:
+        raise ValueError("zarr chunks tuple %r must have same length as "
+                         "variable.ndim %d" % (enc_chunks_tuple, ndim))
+
+    if not all(isinstance(x, int) for x in enc_chunks_tuple):
+        raise ValueError("zarr chunks must be an int or a tuple of ints")
+
+    # if there are chunks in encoding and the variable data is a numpy array,
+    # we use the specified chunks
+    if enc_chunks_tuple and var_chunks is None:
+        return enc_chunks_tuple
+
+    # the hard case
+    # DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk
+    # this avoids the need to get involved in zarr synchronization / locking
+    # From zarr docs:
+    # "If each worker in a parallel computation is writing to a separate
+    # region of the array, and if region boundaries are perfectly aligned
+    # with chunk boundaries, then no synchronization is required."
+    if var_chunks and enc_chunks_tuple:
+        for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks):
+            for dchunk in dchunks:
+                if not dchunk % zchunk == 0:
+                    raise ValueError("Specified zarr chunks %r would "
+                                     "overlap multiple dask chunks %r. "
+                                     "Consider rechunking the data using "
+                                     "`chunk()` or specifying different "
+                                     "chunks in encoding."
+                                     % (enc_chunks_tuple, var_chunks))
+        return enc_chunks_tuple
+
+    raise RuntimeError(
+        "We should never get here. Function logic must be wrong.")
 
 
 def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
     # Zarr arrays do not have dimensions. To get around this problem, we add
     # an attribute that specifies the dimension. We have to hide this
     # attribute when we send the attributes to the user.
     # zarr_obj can be either a zarr group or zarr array
     dimensions = zarr_obj.attrs.get(dimension_key)
     attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key)
     return dimensions, attributes
 
 
+### arguments for zarr.create
+# zarr.creation.create(shape, chunks=None, dtype=None, compressor='default',
+# fill_value=0, order='C', store=None, synchronizer=None, overwrite=False,
+# path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs)
+
+def _extract_zarr_variable_encoding(variable, raise_on_invalid=False):
+    encoding = variable.encoding.copy()
+
+    valid_encodings = set(['chunks', 'compressor', 'filters',
+                           'cache_metadata'])
+
+    if raise_on_invalid:
+        invalid = [k for k in encoding if k not in valid_encodings]
+        if invalid:
+            raise ValueError('unexpected encoding parameters for zarr '
+                             'backend: %r' % invalid)
+    else:
+        for k in list(encoding):
+            if k not in valid_encodings:
+                del encoding[k]
+
+    chunks = _determine_zarr_chunks(encoding.get('chunks'), variable.chunks,
+                                    variable.ndim)
+    encoding['chunks'] = chunks
+
+    # TODO: figure out how to serialize compressor and filters options
+    # in zarr these are python objects, not strings
+
+    return encoding
+
+
 class ZarrStore(WritableCFDataStore, DataStorePickleMixin):
     """Store for reading and writing data via zarr
     """
 
     # need some special secret attributes to tell us the dimensions
-    _dimension_key = '_XARRAY_DIMENSIONS'
+    _DIMENSION_KEY = '_ARRAY_DIMENSIONS'
 
-    def __init__(self, store=None, overwrite=False, chunk_store=None,
-                 synchronizer=None, path=None, writer=None, autoclose=False):
-        opener = functools.partial(_open_zarr_group, store, overwrite,
-                                   chunk_store, synchronizer, path)
+    def __init__(self, store=None, mode='a', synchronizer=None, group=None,
+                 auto_chunk=True, writer=None, autoclose=None):
+        opener = functools.partial(_open_zarr_group, store, mode,
+                                   synchronizer, group)
         self.ds = opener()
+
+        
self._mode = mode + self._synchronizer = synchronizer + self._group = group + self._auto_chunk = auto_chunk + + # zarr stores don't need to be opened, closed, or synced. + # So what do we do with all this logical about openers? if autoclose: raise NotImplementedError('autoclose=True is not implemented ' 'for the zarr backend') self._autoclose = False self._isopen = True - self._opener = opener + self._opener = None # initialize hidden dimension attribute - self.ds.attrs[self._dimension_key] = {} + if self._DIMENSION_KEY not in self.ds.attrs: + self.ds.attrs[self._DIMENSION_KEY] = {} # do we need to define attributes for all of the opener keyword args? super(ZarrStore, self).__init__(writer) @@ -98,10 +257,28 @@ def open_store_variable(self, name, zarr_array): # I don't see why it is necessary to wrap self.ds[name] # zarr seems to implement the required ndarray interface # TODO: possibly wrap zarr array in dask with aligned chunks - data = indexing.LazilyIndexedArray(zarr_array) + data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs( - zarr_array, self._dimension_key) - return Variable(dimensions, data, attributes) + zarr_array, self._DIMENSION_KEY) + encoding = {'chunks': zarr_array.chunks, + 'compressor': zarr_array.compressor, + 'filters': zarr_array.filters, + 'fill_value': zarr_array.fill_value} + + var = Variable(dimensions, data, attributes, encoding) + + if self._auto_chunk: + from dask.base import tokenize + # is this token enough? + token = tokenize(zarr_array) + name = 'zarr_array-%s' % token + # do we need to worry about the zarr synchronizer / dask lock? + lock = self._synchronizer + print("Chunking variable") + var = var.chunk(chunks=zarr_array.chunks, name=name, lock=lock) + + return var + def get_variables(self): with self.ensure_open(autoclose=False): @@ -111,24 +288,24 @@ def get_variables(self): def get_attrs(self): with self.ensure_open(autoclose=True): _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) attrs = FrozenOrderedDict(attributes) return attrs def get_dimensions(self): with self.ensure_open(autoclose=True): dimensions, _ = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) return dimensions def set_dimension(self, name, length): with self.ensure_open(autoclose=False): - self.ds.attrs[self._dimension_key][name] = length + self.ds.attrs[self._DIMENSION_KEY][name] = length def set_attribute(self, key, value): with self.ensure_open(autoclose=False): _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) attributes[key] = value def prepare_variable(self, name, variable, check_encoding=False, @@ -138,30 +315,41 @@ def prepare_variable(self, name, variable, check_encoding=False, dims = variable.dims dtype = variable.dtype shape = variable.shape - chunks = _dask_chunks_to_zarr_chunks(variable.chunks) - # TODO: figure ouw how zarr should deal with unlimited dimensions + # TODO: figure out how zarr should deal with unlimited dimensions self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) - # let's try keeping this fill value stuff - fill_value = attrs.pop('_FillValue', None) + # netcdf uses pop not get...yet it works. Why? + # here we are basically duplicating zarr's own internal fill_value + # in an attribute. This seems redundant and error prone. How can + # we do better? 
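+        # e.g. (illustrative): a variable written with
+        # ``encoding={'_FillValue': -9999.0}`` is expected to arrive here as
+        # ``attrs['_FillValue'] == -9999.0`` after CF encoding, and to be
+        # forwarded to ``self.ds.create(..., fill_value=-9999.0)`` below.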
+ fill_value = attrs.get('_FillValue', None) if fill_value in ['\x00']: fill_value = None # TODO: figure out what encoding is needed for zarr + encoding = _extract_zarr_variable_encoding( + variable, raise_on_invalid=check_encoding) ### arguments for zarr.create # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) - - # TODO: figure out how to pass along all those other arguments - zarr_array = self.ds.create(name, shape=shape, dtype=dtype, - chunks=chunks, fill_value=fill_value) - zarr_array.attrs[self._dimension_key] = dims + fill_value=fill_value, **encoding) + # decided not to explicity enumerate encoding options because we + # risk overriding zarr's defaults (e.g. if we specificy + # cache_metadata=None instead of True). Alternative is to have lots of + # logic in _extract_zarr_variable encoding to duplicate zarr defaults. + # chunks=encoding.get('chunks'), + # compressor=encoding.get('compressor'), + # filters=encodings.get('filters'), + # cache_metadata=encoding.get('cache_metadata')) + + # the magic for storing the hidden dimension data + zarr_array.attrs[self._DIMENSION_KEY] = dims _, attributes = _get_zarr_dims_and_attrs(zarr_array, - self._dimension_key) + self._DIMENSION_KEY) for k, v in iteritems(attrs): attributes[k] = v @@ -171,7 +359,31 @@ def prepare_variable(self, name, variable, check_encoding=False, # sync() and close() methods should not be needed with zarr -def open_zarr(store, decode_cf=True, +# from zarr docs + +# Zarr arrays can be used as either the source or sink for data in parallel +# computations. Both multi-threaded and multi-process parallelism are supported. +# The Python global interpreter lock (GIL) is released for both compression and +# decompression operations, so Zarr will not block other Python threads from running. +# +# A Zarr array can be read concurrently by multiple threads or processes. No +# synchronization (i.e., locking) is required for concurrent reads. +# +# A Zarr array can also be written to concurrently by multiple threads or +# processes. Some synchronization may be required, depending on the way the data +# is being written. + +# If each worker in a parallel computation is writing to a separate region of +# the array, and if region boundaries are perfectly aligned with chunk +# boundaries, then no synchronization is required. However, if region and chunk +# boundaries are not perfectly aligned, then synchronization is required to +# avoid two workers attempting to modify the same chunk at the same time. + + + + +def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, + decode_cf=True, mask_and_scale=True, decode_times=True, autoclose=False, concat_characters=True, decode_coords=True, cache=None, drop_variables=None): @@ -181,6 +393,17 @@ def open_zarr(store, decode_cf=True, ---------- store : MutableMapping or str Store or path to directory in file system. + mode : {‘r’, ‘r+’} + Persistence mode: ‘r’ means read only (must exist); ‘r+’ means + read/write (must exist) + synchronizer : object, optional + Array synchronizer + group : str, obtional + Group path. (a.k.a. `path` in zarr terminology.) + auto_chunk : bool, optional + Whether to automatically create dask chunks corresponding to each + variable's zarr chunks. If False, zarr array data will lazily convert + to numpy arrays upon access. 
decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -246,5 +469,6 @@ def maybe_decode_store(store, lock=False): return ds - zarr_store = ZarrStore(store=store) + zarr_store = ZarrStore(store=store, mode=mode, synchronizer=synchronizer, + group=group, auto_chunk=auto_chunk) return maybe_decode_store(zarr_store) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 5d975ffd281..e89694cbf7c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1007,6 +1007,34 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, engine=engine, encoding=encoding, unlimited_dims=unlimited_dims) + def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, + encoding=None): + """Write dataset contents to a zarr group. + + Parameters + ---------- + store : MutableMapping or str, optional + Store or path to directory in file system. + mode : {‘r’, ‘r+’, ‘a’, ‘w’, ‘w-‘} + Persistence mode: ‘r’ means read only (must exist); ‘r+’ means + read/write (must exist); ‘a’ means read/write (create if doesn’t + exist); ‘w’ means create (overwrite if exists); ‘w-‘ means create + (fail if exists). + synchronizer : object, optional + Array synchronizer + group : str, obtional + Group path. (a.k.a. `path` in zarr terminology.) + encoding : dict, optional + Nested dictionary with variable names as keys and dictionaries of + variable specific encodings as values, e.g., + ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,}, ...}`` + """ + if encoding is None: + encoding = {} + from ..backends.api import to_zarr + return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer, + group=group, encoding=encoding) + def __unicode__(self): return formatting.dataset_repr(self) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cbcd6eeb364..ef23e866251 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -135,6 +135,7 @@ def create_store(self): def roundtrip(self, data, **kwargs): raise NotImplementedError + # note: zero dimensional arrays are not suppoerted by zarr backend def test_zero_dimensional_variable(self): expected = create_test_data() expected['float_var'] = ([], 1.0e9, {'units': 'units of awesome'}) @@ -539,6 +540,7 @@ def test_default_fill_value(self): ds = Dataset({'x': ('y', np.arange(10.0))}) kwargs = dict(encoding={'x': {'dtype': 'f4'}}) with self.roundtrip(ds, save_kwargs=kwargs) as actual: + print("actual.x.encoding", actual.x.encoding) self.assertEqual(actual.x.encoding['_FillValue'], np.nan) self.assertEqual(ds.x.encoding, {}) @@ -891,10 +893,10 @@ def create_store(self): @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, allow_cleanup_failure=False): - with create_tmp_file(suffix='.zarr') as tmp: - zs = backends.ZarrStore(store=tmp) - data.dump_to_store(zs) - yield xr.open_zarr(tmp) + with create_tmp_file(suffix='.zarr', + allow_cleanup_failure=allow_cleanup_failure) as tmp_file: + data.to_zarr(store=tmp_file, **save_kwargs) + yield xr.open_zarr(tmp_file, **open_kwargs) @requires_scipy From 9f16e8f3a3f7320b0b2d5f1676ac2c27e66a25a3 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 22:43:27 -0400 Subject: [PATCH 10/68] added chunking test --- xarray/tests/test_backends.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ef23e866251..2af6ef105a3 100644 --- a/xarray/tests/test_backends.py 
+++ b/xarray/tests/test_backends.py @@ -898,6 +898,25 @@ def roundtrip(self, data, save_kwargs={}, open_kwargs={}, data.to_zarr(store=tmp_file, **save_kwargs) yield xr.open_zarr(tmp_file, **open_kwargs) + def test_auto_chunk(self): + original = create_test_data ().chunk() + + with self.roundtrip(original, + open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # there should be no chunks + self.assertEqual(v.chunks, None) + + with self.roundtrip(original, + open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # chunk size should be the same as original + self.assertEqual(v.chunks, original[k].chunks) + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, Only32BitTypes, TestCase): From fe9ebe7c2081711567d76d38cb51837eca7cbfc2 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 23:27:08 -0400 Subject: [PATCH 11/68] remove debuggin statements --- xarray/backends/api.py | 1 - xarray/backends/common.py | 8 -------- 2 files changed, 9 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 2394abf4370..82b96caf13c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -702,6 +702,5 @@ def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, # I think zarr stores should always be sync'd immediately # TODO: figure out how to properly handle unlimited_dims - print("to_zarr encoding", encoding) dataset.dump_to_store(store, sync=True, encoding=encoding) return store diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 185d9ee86e0..cec55d22589 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -241,15 +241,7 @@ class WritableCFDataStore(AbstractWritableDataStore): def store(self, variables, attributes, *args, **kwargs): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. - #print('Raw variable encoding:', - # {name: var.encoding for name, var in variables.items()}) - #print('Raw variable attrs:', - # {name: var.attrs for name, var in variables.items()}) cf_variables, cf_attrs = cf_encoder(variables, attributes) - #print('cf_variable encoding:', - # {name: var.encoding for name, var in cf_variables.items()}) - #print('cf_variable attrs:', - # {name: var.attrs for name, var in cf_variables.items()}) AbstractWritableDataStore.store(self, cf_variables, cf_attrs, *args, **kwargs) From c01cd09d296c8be42703d60da007453b78500040 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 23:41:57 -0400 Subject: [PATCH 12/68] fixed HiddenKeyDict --- xarray/core/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index cccefe3d3c6..774ae3fe5c1 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -496,8 +496,10 @@ class HiddenKeyDict(MutableMapping): Acts like a normal dictionary, but hides certain keys. ''' # ``__init__`` method required to create instance from class. 
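+    # Hypothetical sketch of the intended behavior (invented values):
+    #
+    #     d = HiddenKeyDict({'a': 1, 'b': 2}, 'b')   # 'b' coerced to ['b']
+    #     d['c'] = 3   # visible keys write through to the wrapped dict
+    #     len(d)       # -> 2 (hidden 'b' is not counted)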
- def __init__(self, data, *hidden_keys): + def __init__(self, data, hidden_keys): self._data = data + if type(hidden_keys) is not list: + hidden_keys = [ hidden_keys ] self._hidden_keys = hidden_keys def _raise_if_hidden(self, key): @@ -523,4 +525,5 @@ def __iter__(self): yield k def __len__(self): - return len(list(self.__iter__())) + num_hidden = sum([k in self._hidden_keys for k in self._data]) + return len(self._data) - num_hidden From b3e5d7697c5d2cffaab9b8d484f834485ec56d64 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:18:38 -0400 Subject: [PATCH 13/68] added HiddenKeyDict class --- xarray/core/utils.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 89d1462328c..cccefe3d3c6 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -489,3 +489,38 @@ def ensure_us_time_resolution(val): elif np.issubdtype(val.dtype, np.timedelta64): val = val.astype('timedelta64[us]') return val + + +class HiddenKeyDict(MutableMapping): + ''' + Acts like a normal dictionary, but hides certain keys. + ''' + # ``__init__`` method required to create instance from class. + def __init__(self, data, *hidden_keys): + self._data = data + self._hidden_keys = hidden_keys + + def _raise_if_hidden(self, key): + if key in self._hidden_keys: + raise KeyError('Key is hidden.') + + # The next five methods are requirements of the ABC. + def __setitem__(self, key, value): + self._raise_if_hidden(key) + self._data[key] = value + + def __getitem__(self, key): + self._raise_if_hidden(key) + return self._data[key] + + def __delitem__(self, key): + self._raise_if_hidden(key) + del self._data[key] + + def __iter__(self): + for k in self._data: + if k not in self._hidden_keys: + yield k + + def __len__(self): + return len(list(self.__iter__())) From 45375b20d7afeb44657466aa2fb6540f270fe111 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 26 Aug 2017 22:19:11 -0400 Subject: [PATCH 14/68] new zarr backend --- xarray/backends/zarr.py | 172 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) create mode 100644 xarray/backends/zarr.py diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py new file mode 100644 index 00000000000..bc3be1306d2 --- /dev/null +++ b/xarray/backends/zarr.py @@ -0,0 +1,172 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import functools +import warnings +from itertools import product +from collections import MutableMapping + +from .. import Variable +from ..core import indexing +from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict +from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict + +from .common import (WritableCFDataStore, AbstractWritableDataStore, + DataStorePickleMixin) + + + + +# most of the other stores have some kind of wrapper class like +# class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): +# class H5NetCDFArrayWrapper(BaseNetCDF4Array): +# class NioArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): +# we problaby need something like this + +# the first question is whether it should be based on BaseNetCDF4Array or +# NdimSizeLenMixing? + +# or maybe we don't need wrappers at all? 
probably not true + + +# also most have a custom opener + +# keyword args for zarr.group +# store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None +# the group name is called "path" in the zarr lexicon + +def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path): + import zarr + zarr_group = zarr.group(store=store, overwrite=overwrite, + chunk_store=chunk_store, synchronizer=synchronizer, path=path) + return zarr_group + + +def _dask_chunks_to_zarr_chunks(chunks): + # zarr chunks needs to be uniform for each array + # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + # dask chunks can be variable sized + # http://dask.pydata.org/en/latest/array-design.html#chunks + # this function dask chunks syntax to zarr chunks + if chunks is None: + return chunks + + all_chunks = product(*chunks) + first_chunk = all_chunks.next() + for this_chunk in all_chunks: + if not (this_chunk == first_chunk): + raise ValueError("zarr requires uniform chunk sizes, found %s" % + repr(chunks)) + return first_chunk + + +def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): + # Zarr arrays do not have dimenions. To get around this problem, we add + # an attribute that specifies the dimension. We have to hide this attribute + # when we send the attributes to the user. + # zarr_obj can be either a zarr group or zarr array + dimensions = zarr_obj.attrs.get(dimension_key) + attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) + return dimensions, attributes + + +class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): + """Store for reading and writing data via zarr + """ + + # need some special secret attributes to tell us the dimensions + _dimension_key = '_XARRAY_DIMENSIONS' + + def __init__(self, store=None, overwrite=False, chunk_store=None, + synchronizer=None, path=None, writer=None, autoclose=False): + opener = functools.partial(_open_zarr_group, store, overwrite, + chunk_store, synchronizer, path) + self.ds = opener() + if autoclose: + raise NotImplementedError('autoclose=True is not implemented ' + 'for the zarr backend') + self._autoclose = False + self._isopen = True + self._opener = opener + + # initialize hidden dimension attribute + self.ds.attrs[self._dimension_key] = {} + + # do we need to define attributes for all of the opener keyword args? 
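+        # (sketch) ``opener()`` here is shorthand for
+        #     zarr.group(store=store, overwrite=overwrite,
+        #                chunk_store=chunk_store,
+        #                synchronizer=synchronizer, path=path)
+        # presumably kept as a partial so that DataStorePickleMixin can
+        # re-open the group when unpickling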
+ super(ZarrStore, self).__init__(writer) + + def open_store_variable(self, name, zarr_array): + # I don't see why it is necessary to wrap self.ds[name] + # zarr seems to implement the required ndarray interface + # TODO: possibly wrap zarr array in dask with aligned chunks + data = indexing.LazilyIndexedArray(zarr_array) + dimensions, attributes = _get_zarr_dims_and_attrs( + zarr_array, self._dimension_key) + return Variable(dimensions, data, attributes) + + def get_variables(self): + with self.ensure_open(autoclose=False): + return FrozenOrderedDict((k, self.open_store_variable(k, v)) + for k, v in self.ds.arrays()) + + def get_attrs(self): + with self.ensure_open(autoclose=True): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attrs = FrozenOrderedDict(attributes) + return attrs + + def get_dimensions(self): + with self.ensure_open(autoclose=True): + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + return dimensions + + def set_dimension(self, name, length): + with self.ensure_open(autoclose=False): + self.ds.attrs[self._dimension_key][name] = length + + def set_attribute(self, key, value): + with self.ensure_open(autoclose=False): + _, attributes = _get_zarr_dims_and_attrs(self.ds, + self._dimension_key) + attributes[key] = value + + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): + + attrs = variable.attrs.copy() + dims = variable.dims + dtype = variable.dtype + shape = variable.shape + chunks = _dask_chunks_to_zarr_chunks(variable.chunks) + + # TODO: figure ouw how zarr should deal with unlimited dimensions + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) + + # let's try keeping this fill value stuff + fill_value = attrs.pop('_FillValue', None) + if fill_value in ['\x00']: + fill_value = None + + # TODO: figure out what encoding is needed for zarr + + ### arguments for zarr.create + # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', + # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, + # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) + + # TODO: figure out how to pass along all those other arguments + + zarr_array = self.ds.create(name, shape=shape, dtype=dtype, + chunks=chunks, fill_value=fill_value) + zarr_array.attrs[self._dimension_key] = dims + _, attributes = _get_zarr_dims_and_attrs(zarr_array, + self._dimension_key) + + for k, v in iteritems(attrs): + attributes[k] = v + + return zarr_array, variable.data + + # sync() and close() methods should not be needed with zarr From 0e79718be73ca8c9542affb7c2712408613c7a17 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 16:08:35 -0700 Subject: [PATCH 15/68] add zarr to ci reqs --- ci/requirements-py27-cdat+pynio.yml | 1 + ci/requirements-py27-windows.yml | 1 + ci/requirements-py35.yml | 1 + ci/requirements-py36-windows.yml | 1 + ci/requirements-py36.yml | 1 + 5 files changed, 5 insertions(+) diff --git a/ci/requirements-py27-cdat+pynio.yml b/ci/requirements-py27-cdat+pynio.yml index ccd3fbf9cb4..120a6c0bf06 100644 --- a/ci/requirements-py27-cdat+pynio.yml +++ b/ci/requirements-py27-cdat+pynio.yml @@ -21,6 +21,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py27-windows.yml b/ci/requirements-py27-windows.yml index 73baca68dfa..0b7ea897f5a 100644 --- a/ci/requirements-py27-windows.yml +++ b/ci/requirements-py27-windows.yml @@ -18,3 +18,4 @@ dependencies: - 
seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py35.yml b/ci/requirements-py35.yml index 1c7a4558c91..48643b75243 100644 --- a/ci/requirements-py35.yml +++ b/ci/requirements-py35.yml @@ -16,6 +16,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py36-windows.yml b/ci/requirements-py36-windows.yml index 70ff3e50a1b..ea366bd04f7 100644 --- a/ci/requirements-py36-windows.yml +++ b/ci/requirements-py36-windows.yml @@ -16,3 +16,4 @@ dependencies: - seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index 0d22fb26b79..168a72b0f58 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -17,6 +17,7 @@ dependencies: - toolz - rasterio - bottleneck + - zarr - pip: - coveralls - pytest-cov From 3d39ade88fcfeb88151caed9d6d0e28c08e33267 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 16:15:33 -0700 Subject: [PATCH 16/68] add zarr api to docs --- doc/api.rst | 2 ++ doc/installing.rst | 1 + doc/io.rst | 12 ++++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 433aa93c9de..57ad8a6f86d 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -419,7 +419,9 @@ Dataset methods open_dataset open_mfdataset open_rasterio + open_zarr Dataset.to_netcdf + Dataset.to_zarr save_mfdataset Dataset.to_array Dataset.to_dataframe diff --git a/doc/installing.rst b/doc/installing.rst index 62d2a673e22..75d79c296bc 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -24,6 +24,7 @@ For netCDF and IO reading and writing netCDF4 files that does not use the netCDF-C libraries - `pynio `__: for reading GRIB and other geoscience specific file formats +- `zarr Date: Thu, 5 Oct 2017 17:47:30 -0700 Subject: [PATCH 17/68] some zarr tests passing --- xarray/__init__.py | 1 + xarray/backends/__init__.py | 1 + xarray/backends/zarr.py | 87 +++++++++++++++++++++++++++++++++-- xarray/tests/test_backends.py | 15 ++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 654ed77b28a..9bfe569995f 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -18,6 +18,7 @@ from .backends.api import (open_dataset, open_dataarray, open_mfdataset, save_mfdataset) from .backends.rasterio_ import open_rasterio +from .backends.zarr import open_zarr from .conventions import decode_cf diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py index a082bd53e5e..a8a4afc359a 100644 --- a/xarray/backends/__init__.py +++ b/xarray/backends/__init__.py @@ -10,3 +10,4 @@ from .pynio_ import NioDataStore from .scipy_ import ScipyDataStore from .h5netcdf_ import H5NetCDFStore +from .zarr import ZarrStore diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index bc3be1306d2..ebe5a59dc70 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -14,8 +14,7 @@ from .common import (WritableCFDataStore, AbstractWritableDataStore, DataStorePickleMixin) - - +from .. 
import conventions # most of the other stores have some kind of wrapper class like # class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): @@ -52,7 +51,7 @@ def _dask_chunks_to_zarr_chunks(chunks): return chunks all_chunks = product(*chunks) - first_chunk = all_chunks.next() + first_chunk = next(all_chunks) for this_chunk in all_chunks: if not (this_chunk == first_chunk): raise ValueError("zarr requires uniform chunk sizes, found %s" % @@ -70,7 +69,8 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes -class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): +class ZarrStore(WritableCFDataStore, DataStorePickleMixin): +#class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr """ @@ -170,3 +170,82 @@ def prepare_variable(self, name, variable, check_encoding=False, return zarr_array, variable.data # sync() and close() methods should not be needed with zarr + + +def open_zarr(store, decode_cf=True, + mask_and_scale=True, decode_times=True, autoclose=False, + concat_characters=True, decode_coords=True, + cache=None, drop_variables=None): + """Load and decode a dataset from a file or file-like object. + + Parameters + ---------- + store : MutableMapping or str + Store or path to directory in file system. + decode_cf : bool, optional + Whether to decode these variables, assuming they were saved according + to CF conventions. + mask_and_scale : bool, optional + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. + decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + autoclose : bool, optional + If True, automatically close files to avoid OS Error of too many files + being open. However, this option doesn't work with streams, e.g., + BytesIO. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + decode_coords : bool, optional + If True, decode the 'coordinates' attribute to identify coordinates in + the resulting dataset. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. + drop_variables: string or iterable, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + Returns + ------- + dataset : Dataset + The newly created dataset. 
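A minimal usage sketch (``example.zarr`` is a hypothetical placeholder, not a path from this patch):

    import xarray as xr

    ds = xr.open_zarr('example.zarr')                    # decode CF conventions
    raw = xr.open_zarr('example.zarr', decode_cf=False)  # raw, undecoded variables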
+ + See Also + -------- + open_dataset + """ + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + + def maybe_decode_store(store, lock=False): + ds = conventions.decode_cf( + store, mask_and_scale=mask_and_scale, decode_times=decode_times, + concat_characters=concat_characters, decode_coords=decode_coords, + drop_variables=drop_variables) + + # this is how we would apply caching + # but do we want it for zarr stores? + #_protect_dataset_variables_inplace(ds, cache) + + return ds + + zarr_store = ZarrStore(store=store) + return maybe_decode_store(zarr_store) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index c7bf5349c7a..619e13a7735 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -881,6 +881,21 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True +class ZarrDataTest(CFEncodedDataTest, TestCase): + @contextlib.contextmanager + def create_store(self): + with create_tmp_file(suffix='.zarr') as tmp: + yield backends.ZarrStore(store=tmp) + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + with create_tmp_file(suffix='.zarr') as tmp: + zs = backends.ZarrStore(store=tmp) + data.dump_to_store(zs) + yield xr.open_zarr(tmp) + + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, Only32BitTypes, TestCase): @contextlib.contextmanager From 0b4a27afe6e5fc5e561792aee08ccd7915263b34 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 5 Oct 2017 21:07:22 -0700 Subject: [PATCH 18/68] requires zarr decorator --- xarray/tests/__init__.py | 1 + xarray/tests/test_backends.py | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 0ef32601b77..a916145bfb6 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -54,6 +54,7 @@ def _importorskip(modname, minversion=None): has_bottleneck, requires_bottleneck = _importorskip('bottleneck') has_rasterio, requires_rasterio = _importorskip('rasterio') has_pathlib, requires_pathlib = _importorskip('pathlib') +has_zarr, requires_zarr = _importorskip('zarr') # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 619e13a7735..cbcd6eeb364 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -26,9 +26,9 @@ from . 
import (TestCase, requires_scipy, requires_netCDF4, requires_pydap, requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf, - requires_pynio, requires_pathlib, has_netCDF4, has_scipy, - assert_allclose, flaky, network, requires_rasterio, - assert_identical) + requires_pynio, requires_pathlib, requires_zarr, + requires_rasterio, has_netCDF4, has_scipy, assert_allclose, + flaky, network, assert_identical) from .test_dataset import create_test_data from xarray.tests import mock @@ -881,6 +881,7 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True +@requires_zarr class ZarrDataTest(CFEncodedDataTest, TestCase): @contextlib.contextmanager def create_store(self): From f39035cc4ff6eadc0fee1735d4e77939bd278624 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 14:51:29 -0700 Subject: [PATCH 19/68] wip --- xarray/backends/api.py | 27 +++ xarray/backends/common.py | 8 + xarray/backends/zarr.py | 302 +++++++++++++++++++++++++++++----- xarray/core/dataset.py | 28 ++++ xarray/tests/test_backends.py | 10 +- 5 files changed, 332 insertions(+), 43 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index e5a3136f0ca..2394abf4370 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -678,3 +678,30 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None, finally: for store in stores: store.close() + + +def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, + encoding=None): + """This function creates an appropriate datastore for writing a dataset to + disk a zarr ztore + + See `Dataset.to_zarr` for full API docs. + """ + if isinstance(store, path_type): + store = str(store) + if encoding is None: + encoding = {} + + # validate Dataset keys, DataArray names, and attr keys/values + _validate_dataset_names(dataset) + _validate_attrs(dataset) + + store = backends.ZarrStore(store=store, mode=mode, + synchronizer=synchronizer, group=group, + writer=None) + + # I think zarr stores should always be sync'd immediately + # TODO: figure out how to properly handle unlimited_dims + print("to_zarr encoding", encoding) + dataset.dump_to_store(store, sync=True, encoding=encoding) + return store diff --git a/xarray/backends/common.py b/xarray/backends/common.py index cec55d22589..185d9ee86e0 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -241,7 +241,15 @@ class WritableCFDataStore(AbstractWritableDataStore): def store(self, variables, attributes, *args, **kwargs): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. + #print('Raw variable encoding:', + # {name: var.encoding for name, var in variables.items()}) + #print('Raw variable attrs:', + # {name: var.attrs for name, var in variables.items()}) cf_variables, cf_attrs = cf_encoder(variables, attributes) + #print('cf_variable encoding:', + # {name: var.encoding for name, var in cf_variables.items()}) + #print('cf_variable attrs:', + # {name: var.attrs for name, var in cf_variables.items()}) AbstractWritableDataStore.store(self, cf_variables, cf_attrs, *args, **kwargs) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index ebe5a59dc70..a79a39c8db0 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -5,11 +5,14 @@ import warnings from itertools import product from collections import MutableMapping +import operator from .. 
import Variable from ..core import indexing -from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict -from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict +from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, + NdimSizeLenMixin,DunderArrayMixin) +from ..core.pycompat import (iteritems, bytes_type, unicode_type, OrderedDict, + basestring) from .common import (WritableCFDataStore, AbstractWritableDataStore, DataStorePickleMixin) @@ -27,25 +30,67 @@ # or maybe we don't need wrappers at all? probably not true - -# also most have a custom opener +class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): + def __init__(self, variable_name, datastore): + self.datastore = datastore + self.variable_name = variable_name + + array = self.get_array() + self.shape = array.shape + + dtype = array.dtype + if dtype is str: + # use object dtype because that's the only way in numpy to + # represent variable length strings; it also prevents automatic + # string concatenation via conventions.decode_cf_variable + dtype = np.dtype('O') + self.dtype = dtype + + def get_array(self): + return self.datastore.ds[self.variable_name] + + def __getitem__(self, key): + # TODO: do we want to use robust_getitem for certain types of + # zarr store (e.g. S3)? + #if self.datastore.is_remote: # pragma: no cover + # getitem = functools.partial(robust_getitem, catch=RuntimeError) + #else: + getitem = operator.getitem + try: + data = getitem(self.get_array(), key) + except IndexError: + # Catch IndexError in netCDF4 and return a more informative + # error message. This is most often called when an unsorted + # indexer is used before the data is loaded from disk. + msg = ('The indexing operation you are attempting to perform ' + 'is not valid on zarr.core.Array object. 
Try loading ' + 'your data into memory first by calling .load().') + if not PY3: + import traceback + msg += '\n\nOriginal traceback:\n' + traceback.format_exc() + raise IndexError(msg) + return data + + # if self.ndim == 0: + # could possibly have a work-around for 0d data here # keyword args for zarr.group # store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None # the group name is called "path" in the zarr lexicon -def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path): +# args for zarr.open_group +# store=None, mode='a', synchronizer=None, path=None + +def _open_zarr_group(store, mode, synchronizer, group): import zarr - zarr_group = zarr.group(store=store, overwrite=overwrite, - chunk_store=chunk_store, synchronizer=synchronizer, path=path) + #zarr_group = zarr.group(store=store, overwrite=overwrite, + # chunk_store=chunk_store, synchronizer=synchronizer, path=path) + zarr_group = zarr.open_group(store=store, mode=mode, + synchronizer=synchronizer, path=group) return zarr_group def _dask_chunks_to_zarr_chunks(chunks): - # zarr chunks needs to be uniform for each array - # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks - # dask chunks can be variable sized - # http://dask.pydata.org/en/latest/array-design.html#chunks # this function dask chunks syntax to zarr chunks if chunks is None: return chunks @@ -54,10 +99,85 @@ def _dask_chunks_to_zarr_chunks(chunks): first_chunk = next(all_chunks) for this_chunk in all_chunks: if not (this_chunk == first_chunk): - raise ValueError("zarr requires uniform chunk sizes, found %s" % - repr(chunks)) + raise ValueError("zarr requires uniform chunk sizes, found %r" % + chunks) return first_chunk +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): + """ + Given encoding chunks (possibly None) and variable chunks (possibly None) + """ + + # zarr chunk spec: + # chunks : int or tuple of ints, optional + # Chunk shape. If not provided, will be guessed from shape and dtype. + + # if there are no chunks in encoding and the variable data is a numpy array, + # then we let zarr use its own heuristics to pick the chunks + if var_chunks is None and enc_chunks is None: + return None + + # if there are no chunks in encoding but there are dask chunks, we try to + # use the same chunks in zarr + # However, zarr chunks needs to be uniform for each array + # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks + # while dask chunks can be variable sized + # http://dask.pydata.org/en/latest/array-design.html#chunks + if var_chunks and enc_chunks is None: + all_var_chunks = product(*var_chunks) + first_var_chunk = next(all_var_chunks) + for this_chunk in all_var_chunks: + if not (this_chunk == first_var_chunk): + raise ValueError("zarr requires uniform chunk sizes, but " + "variable has non-uniform chunks %r. " + "Consider rechunking the data using `chunk()`." % + var_chunks) + return first_var_chunk + + # from here on, we are dealing with user-specified chunks in encoding + # zarr allows chunks to be an integer, in which case it uses the same chunk + # size on each dimension. + # Here we re-implement this expansion ourselves. 
That makes the logic of + # checking chunk compatibility easier + + # this coerces a single int to a tuple but leaves a tuple as is + enc_chunks_tuple = tuple(enc_chunks) + if len(enc_chunks_tuple)==1: + enc_chunks_tuple = ndim * enc_chunks_tuple + + if not len(enc_chunks_tuple) == ndim: + raise ValueError("zarr chunks tuple %r must have same length as " + "variable.ndim %g" % + (enc_chunks_tuple, _DIMENSION_KEY)) + + if not all(x is int for x in enc_chunks_tuple): + raise ValueError("zarr chunks much be an int or a tuple of ints") + + # if there are chunks in encoding and the variabile data is a numpy array, + # we use the specified chunks + if enc_chunks_tuple and var_chunks is None: + return enc_chunks_tuple + + # the hard case + # DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk + # this avoids the need to get involved in zarr synchronization / locking + # From zarr docs: + # "If each worker in a parallel computation is writing to a separate region + # of the array, and if region boundaries are perfectly aligned with chunk + # boundaries, then no synchronization is required." + if var_chunks and enc_chunks_tuple: + for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks): + for dchunk in dchunks: + if not dchunk % zchunk == 0: + raise ValueError("Specified zarr chunks %r would" + "overlap multiple dask chunks %r." + "Consider rechunking the data using `chunk()` " + "or specifying different chunks in encoding." + % (enc_chunks_tuple, var_chunks)) + return enc_chunks_tuple + + raise RuntimeError("We should never get here. Function logic must be wrong.") + def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): # Zarr arrays do not have dimenions. To get around this problem, we add @@ -69,28 +189,67 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes +### arguments for zarr.create +# zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', +# fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, +# path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) + +def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): + encoding = variable.encoding.copy() + + valid_encodings = set(['chunks', 'compressor', 'filters', 'cache_metadata']) + + if raise_on_invalid: + invalid = [k for k in encoding if k not in valid_encodings] + if invalid: + raise ValueError('unexpected encoding parameters for zarr backend: ' + ' %r' % invalid) + else: + for k in list(encoding): + if k not in valid_encodings: + del encoding[k] + + chunks = _determine_zarr_chunks(encoding.get('chunks'), variable.chunks, + variable.ndim) + encoding['chunks'] = chunks + + # TODO: figure out how to serialize compressor and filters options + # in zarr these are python objects, not strings + + return encoding + + class ZarrStore(WritableCFDataStore, DataStorePickleMixin): #class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr """ # need some special secret attributes to tell us the dimensions - _dimension_key = '_XARRAY_DIMENSIONS' + _DIMENSION_KEY = '_ARRAY_DIMENSIONS' - def __init__(self, store=None, overwrite=False, chunk_store=None, - synchronizer=None, path=None, writer=None, autoclose=False): - opener = functools.partial(_open_zarr_group, store, overwrite, - chunk_store, synchronizer, path) + def __init__(self, store=None, mode='a', synchronizer=None, group=None, + auto_chunk=True, writer=None, autoclose=None): + opener = 
functools.partial(_open_zarr_group, store, mode, + synchronizer, group) self.ds = opener() + + self._mode = mode + self._synchronizer = synchronizer + self._group = group + self._auto_chunk = auto_chunk + + # zarr stores don't need to be opened, closed, or synced. + # So what do we do with all this logical about openers? if autoclose: raise NotImplementedError('autoclose=True is not implemented ' 'for the zarr backend') self._autoclose = False self._isopen = True - self._opener = opener + self._opener = None # initialize hidden dimension attribute - self.ds.attrs[self._dimension_key] = {} + if self._DIMENSION_KEY not in self.ds.attrs: + self.ds.attrs[self._DIMENSION_KEY] = {} # do we need to define attributes for all of the opener keyword args? super(ZarrStore, self).__init__(writer) @@ -99,10 +258,28 @@ def open_store_variable(self, name, zarr_array): # I don't see why it is necessary to wrap self.ds[name] # zarr seems to implement the required ndarray interface # TODO: possibly wrap zarr array in dask with aligned chunks - data = indexing.LazilyIndexedArray(zarr_array) + data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs( - zarr_array, self._dimension_key) - return Variable(dimensions, data, attributes) + zarr_array, self._DIMENSION_KEY) + encoding = {'chunks': zarr_array.chunks, + 'compressor': zarr_array.compressor, + 'filters': zarr_array.filters, + 'fill_value': zarr_array.fill_value} + + var = Variable(dimensions, data, attributes, encoding) + + if self._auto_chunk: + from dask.base import tokenize + # is this token enough? + token = tokenize(zarr_array) + name = 'zarr_array-%s' % token + # do we need to worry about the zarr synchronizer / dask lock? + lock = self._synchronizer + print("Chunking variable") + var = var.chunk(chunks=zarr_array.chunks, name=name, lock=lock) + + return var + def get_variables(self): with self.ensure_open(autoclose=False): @@ -112,24 +289,24 @@ def get_variables(self): def get_attrs(self): with self.ensure_open(autoclose=True): _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) attrs = FrozenOrderedDict(attributes) return attrs def get_dimensions(self): with self.ensure_open(autoclose=True): dimensions, _ = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) return dimensions def set_dimension(self, name, length): with self.ensure_open(autoclose=False): - self.ds.attrs[self._dimension_key][name] = length + self.ds.attrs[self._DIMENSION_KEY][name] = length def set_attribute(self, key, value): with self.ensure_open(autoclose=False): _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._dimension_key) + self._DIMENSION_KEY) attributes[key] = value def prepare_variable(self, name, variable, check_encoding=False, @@ -139,30 +316,41 @@ def prepare_variable(self, name, variable, check_encoding=False, dims = variable.dims dtype = variable.dtype shape = variable.shape - chunks = _dask_chunks_to_zarr_chunks(variable.chunks) - # TODO: figure ouw how zarr should deal with unlimited dimensions + # TODO: figure out how zarr should deal with unlimited dimensions self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) - # let's try keeping this fill value stuff - fill_value = attrs.pop('_FillValue', None) + # netcdf uses pop not get...yet it works. Why? + # here we are basically duplicating zarr's own internal fill_value + # in an attribute. This seems redundant and error prone. How can + # we do better? 
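# A minimal sketch (not from this patch) of zarr's own fill value
# handling -- the redundancy the comment above is pointing at:
import zarr
z = zarr.create(shape=(4,), chunks=(2,), dtype='f8', fill_value=-9999.)
assert z.fill_value == -9999.0  # zarr already records this in array metadata
# so mirroring it in an '_FillValue' attribute stores the same fact twice.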
+ fill_value = attrs.get('_FillValue', None) if fill_value in ['\x00']: fill_value = None # TODO: figure out what encoding is needed for zarr + encoding = _extract_zarr_variable_encoding( + variable, raise_on_invalid=check_encoding) ### arguments for zarr.create # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) - - # TODO: figure out how to pass along all those other arguments - zarr_array = self.ds.create(name, shape=shape, dtype=dtype, - chunks=chunks, fill_value=fill_value) - zarr_array.attrs[self._dimension_key] = dims + fill_value=fill_value, **encoding) + # decided not to explicity enumerate encoding options because we + # risk overriding zarr's defaults (e.g. if we specificy + # cache_metadata=None instead of True). Alternative is to have lots of + # logic in _extract_zarr_variable encoding to duplicate zarr defaults. + # chunks=encoding.get('chunks'), + # compressor=encoding.get('compressor'), + # filters=encodings.get('filters'), + # cache_metadata=encoding.get('cache_metadata')) + + # the magic for storing the hidden dimension data + zarr_array.attrs[self._DIMENSION_KEY] = dims _, attributes = _get_zarr_dims_and_attrs(zarr_array, - self._dimension_key) + self._DIMENSION_KEY) for k, v in iteritems(attrs): attributes[k] = v @@ -172,7 +360,31 @@ def prepare_variable(self, name, variable, check_encoding=False, # sync() and close() methods should not be needed with zarr -def open_zarr(store, decode_cf=True, +# from zarr docs + +# Zarr arrays can be used as either the source or sink for data in parallel +# computations. Both multi-threaded and multi-process parallelism are supported. +# The Python global interpreter lock (GIL) is released for both compression and +# decompression operations, so Zarr will not block other Python threads from running. +# +# A Zarr array can be read concurrently by multiple threads or processes. No +# synchronization (i.e., locking) is required for concurrent reads. +# +# A Zarr array can also be written to concurrently by multiple threads or +# processes. Some synchronization may be required, depending on the way the data +# is being written. + +# If each worker in a parallel computation is writing to a separate region of +# the array, and if region boundaries are perfectly aligned with chunk +# boundaries, then no synchronization is required. However, if region and chunk +# boundaries are not perfectly aligned, then synchronization is required to +# avoid two workers attempting to modify the same chunk at the same time. + + + + +def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, + decode_cf=True, mask_and_scale=True, decode_times=True, autoclose=False, concat_characters=True, decode_coords=True, cache=None, drop_variables=None): @@ -182,6 +394,17 @@ def open_zarr(store, decode_cf=True, ---------- store : MutableMapping or str Store or path to directory in file system. + mode : {‘r’, ‘r+’} + Persistence mode: ‘r’ means read only (must exist); ‘r+’ means + read/write (must exist) + synchronizer : object, optional + Array synchronizer + group : str, obtional + Group path. (a.k.a. `path` in zarr terminology.) + auto_chunk : bool, optional + Whether to automatically create dask chunks corresponding to each + variable's zarr chunks. If False, zarr array data will lazily convert + to numpy arrays upon access. 
decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -247,5 +470,6 @@ def maybe_decode_store(store, lock=False): return ds - zarr_store = ZarrStore(store=store) + zarr_store = ZarrStore(store=store, mode=mode, synchronizer=synchronizer, + group=group, auto_chunk=auto_chunk) return maybe_decode_store(zarr_store) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 8eaecdde548..1dff88893a5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1008,6 +1008,34 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, engine=engine, encoding=encoding, unlimited_dims=unlimited_dims) + def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, + encoding=None): + """Write dataset contents to a zarr group. + + Parameters + ---------- + store : MutableMapping or str, optional + Store or path to directory in file system. + mode : {‘r’, ‘r+’, ‘a’, ‘w’, ‘w-‘} + Persistence mode: ‘r’ means read only (must exist); ‘r+’ means + read/write (must exist); ‘a’ means read/write (create if doesn’t + exist); ‘w’ means create (overwrite if exists); ‘w-‘ means create + (fail if exists). + synchronizer : object, optional + Array synchronizer + group : str, obtional + Group path. (a.k.a. `path` in zarr terminology.) + encoding : dict, optional + Nested dictionary with variable names as keys and dictionaries of + variable specific encodings as values, e.g., + ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,}, ...}`` + """ + if encoding is None: + encoding = {} + from ..backends.api import to_zarr + return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer, + group=group, encoding=encoding) + def __unicode__(self): return formatting.dataset_repr(self) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cbcd6eeb364..ef23e866251 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -135,6 +135,7 @@ def create_store(self): def roundtrip(self, data, **kwargs): raise NotImplementedError + # note: zero dimensional arrays are not suppoerted by zarr backend def test_zero_dimensional_variable(self): expected = create_test_data() expected['float_var'] = ([], 1.0e9, {'units': 'units of awesome'}) @@ -539,6 +540,7 @@ def test_default_fill_value(self): ds = Dataset({'x': ('y', np.arange(10.0))}) kwargs = dict(encoding={'x': {'dtype': 'f4'}}) with self.roundtrip(ds, save_kwargs=kwargs) as actual: + print("actual.x.encoding", actual.x.encoding) self.assertEqual(actual.x.encoding['_FillValue'], np.nan) self.assertEqual(ds.x.encoding, {}) @@ -891,10 +893,10 @@ def create_store(self): @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, allow_cleanup_failure=False): - with create_tmp_file(suffix='.zarr') as tmp: - zs = backends.ZarrStore(store=tmp) - data.dump_to_store(zs) - yield xr.open_zarr(tmp) + with create_tmp_file(suffix='.zarr', + allow_cleanup_failure=allow_cleanup_failure) as tmp_file: + data.to_zarr(store=tmp_file, **save_kwargs) + yield xr.open_zarr(tmp_file, **open_kwargs) @requires_scipy From 6446ea28c8f4b83f70f43186f490e04c999ff742 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 22:43:27 -0400 Subject: [PATCH 20/68] added chunking test --- xarray/tests/test_backends.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ef23e866251..2af6ef105a3 100644 --- a/xarray/tests/test_backends.py 
+++ b/xarray/tests/test_backends.py @@ -898,6 +898,25 @@ def roundtrip(self, data, save_kwargs={}, open_kwargs={}, data.to_zarr(store=tmp_file, **save_kwargs) yield xr.open_zarr(tmp_file, **open_kwargs) + def test_auto_chunk(self): + original = create_test_data ().chunk() + + with self.roundtrip(original, + open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # there should be no chunks + self.assertEqual(v.chunks, None) + + with self.roundtrip(original, + open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # chunk size should be the same as original + self.assertEqual(v.chunks, original[k].chunks) + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, Only32BitTypes, TestCase): From 9136064a580ef9c4b1e2023656ee83a80e22f07a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 23:27:08 -0400 Subject: [PATCH 21/68] remove debuggin statements --- xarray/backends/api.py | 1 - xarray/backends/common.py | 8 -------- 2 files changed, 9 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 2394abf4370..82b96caf13c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -702,6 +702,5 @@ def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, # I think zarr stores should always be sync'd immediately # TODO: figure out how to properly handle unlimited_dims - print("to_zarr encoding", encoding) dataset.dump_to_store(store, sync=True, encoding=encoding) return store diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 185d9ee86e0..cec55d22589 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -241,15 +241,7 @@ class WritableCFDataStore(AbstractWritableDataStore): def store(self, variables, attributes, *args, **kwargs): # All NetCDF files get CF encoded by default, without this attempting # to write times, for example, would fail. - #print('Raw variable encoding:', - # {name: var.encoding for name, var in variables.items()}) - #print('Raw variable attrs:', - # {name: var.attrs for name, var in variables.items()}) cf_variables, cf_attrs = cf_encoder(variables, attributes) - #print('cf_variable encoding:', - # {name: var.encoding for name, var in cf_variables.items()}) - #print('cf_variable attrs:', - # {name: var.attrs for name, var in cf_variables.items()}) AbstractWritableDataStore.store(self, cf_variables, cf_attrs, *args, **kwargs) From 2966100fa8c9b4e3d5c57a8315543d5168d8d766 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 7 Oct 2017 23:41:57 -0400 Subject: [PATCH 22/68] fixed HiddenKeyDict --- xarray/core/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index cccefe3d3c6..774ae3fe5c1 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -496,8 +496,10 @@ class HiddenKeyDict(MutableMapping): Acts like a normal dictionary, but hides certain keys. ''' # ``__init__`` method required to create instance from class. 
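# A quick illustration (not part of the patch) of the behavior the fix
# below is after -- a bare key is coerced to a list, and __len__ counts
# without materializing the filtered iterator:
from xarray.core.utils import HiddenKeyDict
d = HiddenKeyDict({'a': 1, '_secret': 2}, '_secret')
assert '_secret' not in d and len(d) == 1
# d['_secret'] would raise KeyError('Key is hidden.')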
- def __init__(self, data, *hidden_keys): + def __init__(self, data, hidden_keys): self._data = data + if type(hidden_keys) is not list: + hidden_keys = [ hidden_keys ] self._hidden_keys = hidden_keys def _raise_if_hidden(self, key): @@ -523,4 +525,5 @@ def __iter__(self): yield k def __len__(self): - return len(list(self.__iter__())) + num_hidden = sum([k in self._hidden_keys for k in self._data]) + return len(self._data) - num_hidden From 6bedf22c90551372fe7ceb9abfc2ecff00f08e53 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Sat, 14 Oct 2017 14:46:09 -0400 Subject: [PATCH 23/68] wip --- xarray/backends/zarr.py | 52 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 36a54b845ab..45ad49f926d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -7,6 +7,8 @@ from collections import MutableMapping import operator +import numpy as np + from .. import Variable from ..core import indexing from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, @@ -30,6 +32,42 @@ # or maybe we don't need wrappers at all? probably not true +# zarr attributes have to be serializable as json +# many xarray datasets / variables have numpy arrays and values +# these functions handle encoding / decoding of such items +def _encode_zarr_attr_value(value): + # what is the most duck-type friendly way to do this check + print("encoding zarr attr value %r" % value) + if isinstance(value, np.ndarray): + return value.tolist() + # I don't know how to check generically if something is a numpy scalar + # i.e. np.float32 or np.int8, etc. without checking against each dtype + # manually. This was the best I could come up with + try: + # np.string_('X').item() returns a type `bytes` + # zarr still doesn't like that + # causes some fill_value encoding to fail + return value.item() + except AttributeError: + return value + + +def _encode_zarr_attrs(attrs): + return OrderedDict([(k, _encode_zarr_attr_value(v)) + for k, v in attrs.items()]) + + +def _decode_zarr_attr_value(value): + # what happens if we just don't decode anything? + # does it matter that we don't convert back to numpy types? 
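# A quick check (not part of the patch) of what the encode path above
# achieves -- zarr attributes must be JSON-serializable, while xarray
# attrs frequently hold numpy scalars and arrays:
import numpy as np
assert _encode_zarr_attr_value(np.float32(0.5)) == 0.5     # numpy scalar -> float
assert _encode_zarr_attr_value(np.arange(3)) == [0, 1, 2]  # ndarray -> list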
+ return value + + +def _decode_zarr_attrs(attrs): + return OrderedDict([(k, _decode_zarr_attr_value(v)) + for k, v in attrs.items()]) + + class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): def __init__(self, variable_name, datastore): self.datastore = datastore @@ -260,6 +298,7 @@ def open_store_variable(self, name, zarr_array): data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs( zarr_array, self._DIMENSION_KEY) + attributes = _decode_zarr_attrs(attributes) encoding = {'chunks': zarr_array.chunks, 'compressor': zarr_array.compressor, 'filters': zarr_array.filters, @@ -289,8 +328,7 @@ def get_attrs(self): with self.ensure_open(autoclose=True): _, attributes = _get_zarr_dims_and_attrs(self.ds, self._DIMENSION_KEY) - attrs = FrozenOrderedDict(attributes) - return attrs + return _decode_zarr_attrs(attributes) def get_dimensions(self): with self.ensure_open(autoclose=True): @@ -306,7 +344,7 @@ def set_attribute(self, key, value): with self.ensure_open(autoclose=False): _, attributes = _get_zarr_dims_and_attrs(self.ds, self._DIMENSION_KEY) - attributes[key] = value + attributes[key] = _encode_zarr_attr_value(value) def prepare_variable(self, name, variable, check_encoding=False, unlimited_dims=None): @@ -323,7 +361,8 @@ def prepare_variable(self, name, variable, check_encoding=False, # here we are basically duplicating zarr's own internal fill_value # in an attribute. This seems redundant and error prone. How can # we do better? - fill_value = attrs.get('_FillValue', None) + # Also, this needs to be encoded as a zarr attr + fill_value = _encode_zarr_attr_value(attrs.get('_FillValue', None)) if fill_value in ['\x00']: fill_value = None @@ -331,6 +370,9 @@ def prepare_variable(self, name, variable, check_encoding=False, encoding = _extract_zarr_variable_encoding( variable, raise_on_invalid=check_encoding) + + print('preparing variable with attributes %r' % attrs) + print('preparing variable with encoding %r' % encoding) ### arguments for zarr.create # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, @@ -352,7 +394,7 @@ def prepare_variable(self, name, variable, check_encoding=False, self._DIMENSION_KEY) for k, v in iteritems(attrs): - attributes[k] = v + attributes[k] = _encode_zarr_attr_value(v) return zarr_array, variable.data From e461cdb7d8e87fa27f8099bc2a491317a0eeebfc Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 16 Oct 2017 12:37:47 -0400 Subject: [PATCH 24/68] finished merge --- xarray/backends/zarr.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index c1d4d3bf5d5..45ad49f926d 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -7,11 +7,8 @@ from collections import MutableMapping import operator -<<<<<<< HEAD -======= import numpy as np ->>>>>>> origin/zarr_backend from .. import Variable from ..core import indexing from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, @@ -35,8 +32,6 @@ # or maybe we don't need wrappers at all? 
probably not true -<<<<<<< HEAD -======= # zarr attributes have to be serializable as json # many xarray datasets / variables have numpy arrays and values # these functions handle encoding / decoding of such items @@ -73,7 +68,6 @@ def _decode_zarr_attrs(attrs): for k, v in attrs.items()]) ->>>>>>> origin/zarr_backend class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): def __init__(self, variable_name, datastore): self.datastore = datastore @@ -264,10 +258,6 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): class ZarrStore(WritableCFDataStore, DataStorePickleMixin): -<<<<<<< HEAD -#class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): -======= ->>>>>>> origin/zarr_backend """Store for reading and writing data via zarr """ @@ -308,10 +298,7 @@ def open_store_variable(self, name, zarr_array): data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs( zarr_array, self._DIMENSION_KEY) -<<<<<<< HEAD -======= attributes = _decode_zarr_attrs(attributes) ->>>>>>> origin/zarr_backend encoding = {'chunks': zarr_array.chunks, 'compressor': zarr_array.compressor, 'filters': zarr_array.filters, @@ -341,12 +328,7 @@ def get_attrs(self): with self.ensure_open(autoclose=True): _, attributes = _get_zarr_dims_and_attrs(self.ds, self._DIMENSION_KEY) -<<<<<<< HEAD - attrs = FrozenOrderedDict(attributes) - return attrs -======= return _decode_zarr_attrs(attributes) ->>>>>>> origin/zarr_backend def get_dimensions(self): with self.ensure_open(autoclose=True): @@ -362,11 +344,7 @@ def set_attribute(self, key, value): with self.ensure_open(autoclose=False): _, attributes = _get_zarr_dims_and_attrs(self.ds, self._DIMENSION_KEY) -<<<<<<< HEAD - attributes[key] = value -======= attributes[key] = _encode_zarr_attr_value(value) ->>>>>>> origin/zarr_backend def prepare_variable(self, name, variable, check_encoding=False, unlimited_dims=None): @@ -383,12 +361,8 @@ def prepare_variable(self, name, variable, check_encoding=False, # here we are basically duplicating zarr's own internal fill_value # in an attribute. This seems redundant and error prone. How can # we do better? 
-<<<<<<< HEAD - fill_value = attrs.get('_FillValue', None) -======= # Also, this needs to be encoded as a zarr attr fill_value = _encode_zarr_attr_value(attrs.get('_FillValue', None)) ->>>>>>> origin/zarr_backend if fill_value in ['\x00']: fill_value = None @@ -396,12 +370,9 @@ def prepare_variable(self, name, variable, check_encoding=False, encoding = _extract_zarr_variable_encoding( variable, raise_on_invalid=check_encoding) -<<<<<<< HEAD -======= print('preparing variable with attributes %r' % attrs) print('preparing variable with encoding %r' % encoding) ->>>>>>> origin/zarr_backend ### arguments for zarr.create # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, @@ -423,11 +394,7 @@ def prepare_variable(self, name, variable, check_encoding=False, self._DIMENSION_KEY) for k, v in iteritems(attrs): -<<<<<<< HEAD - attributes[k] = v -======= attributes[k] = _encode_zarr_attr_value(v) ->>>>>>> origin/zarr_backend return zarr_array, variable.data From 049bf9e8afb50155f65b167c7daa4326c681d95a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 16 Oct 2017 13:24:11 -0400 Subject: [PATCH 25/68] create opener object --- xarray/backends/zarr.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 45ad49f926d..89accd61f06 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -119,12 +119,13 @@ def __getitem__(self, key): # args for zarr.open_group # store=None, mode='a', synchronizer=None, path=None -def _open_zarr_group(store, mode, synchronizer, group): +def _open_zarr_group(store=None, overwrite=None, synchronizer=None, + group=None, mode=None): import zarr #zarr_group = zarr.group(store=store, overwrite=overwrite, # chunk_store=chunk_store, synchronizer=synchronizer, path=path) - zarr_group = zarr.open_group(store=store, mode=mode, - synchronizer=synchronizer, path=group) + zarr_group = zarr.open_group(store=store, mode=mode, + synchronizer=synchronizer, path=group) return zarr_group @@ -266,10 +267,6 @@ class ZarrStore(WritableCFDataStore, DataStorePickleMixin): def __init__(self, store=None, mode='a', synchronizer=None, group=None, auto_chunk=True, writer=None, autoclose=None): - opener = functools.partial(_open_zarr_group, store, mode, - synchronizer, group) - self.ds = opener() - self._mode = mode self._synchronizer = synchronizer self._group = group @@ -282,7 +279,12 @@ def __init__(self, store=None, mode='a', synchronizer=None, group=None, 'for the zarr backend') self._autoclose = False self._isopen = True - self._opener = None + + opener = functools.partial(_open_zarr_group, store=store, + synchronizer=synchronizer, group=group) + self._opener = opener + self.ds = self._opener(mode=mode) + # initialize hidden dimension attribute if self._DIMENSION_KEY not in self.ds.attrs: From c169128116ccba70b4b905a92f19cd59f27ccae2 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 16 Oct 2017 13:30:57 -0400 Subject: [PATCH 26/68] trying to get caching working --- xarray/backends/zarr.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 89accd61f06..9ed92b1b9cd 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -21,6 +21,9 @@ from .. 
import conventions +# this is a private method but we need it for open_zar +from .api import _protect_dataset_variables_inplace + # most of the other stores have some kind of wrapper class like # class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): # class H5NetCDFArrayWrapper(BaseNetCDF4Array): @@ -430,7 +433,7 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_cf=True, mask_and_scale=True, decode_times=True, autoclose=False, concat_characters=True, decode_coords=True, - cache=None, drop_variables=None): + cache=False, drop_variables=None): """Load and decode a dataset from a file or file-like object. Parameters @@ -509,7 +512,7 @@ def maybe_decode_store(store, lock=False): # this is how we would apply caching # but do we want it for zarr stores? - #_protect_dataset_variables_inplace(ds, cache) + _protect_dataset_variables_inplace(ds, cache) return ds From 82ef45613332d72f52a24dce7ac924aa73af3e7a Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 16 Oct 2017 16:46:49 -0400 Subject: [PATCH 27/68] caching still not working --- xarray/backends/zarr.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9ed92b1b9cd..1d69136d7e2 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -46,12 +46,21 @@ def _encode_zarr_attr_value(value): # I don't know how to check generically if something is a numpy scalar # i.e. np.float32 or np.int8, etc. without checking against each dtype # manually. This was the best I could come up with - try: + elif isinstance(value, np.generic): # np.string_('X').item() returns a type `bytes` # zarr still doesn't like that # causes some fill_value encoding to fail return value.item() - except AttributeError: + else: + return value + + +def _ensure_valid_fill_value(value, dtype): + print('ensure_valid_fill_value (%r, %r)' % (value, dtype)) + if dtype.type == np.string_ and type(value) == bytes: + print('decoding ascii') + return value.decode('ascii') + else: return value @@ -367,10 +376,16 @@ def prepare_variable(self, name, variable, check_encoding=False, # in an attribute. This seems redundant and error prone. How can # we do better? # Also, this needs to be encoded as a zarr attr - fill_value = _encode_zarr_attr_value(attrs.get('_FillValue', None)) + fill_value = _ensure_valid_fill_value( + _encode_zarr_attr_value(attrs.get('_FillValue', None)), + dtype) if fill_value in ['\x00']: fill_value = None + # messy! 
fix + if fill_value is not None: + attrs['_FillValue'] = fill_value + # TODO: figure out what encoding is needed for zarr encoding = _extract_zarr_variable_encoding( variable, raise_on_invalid=check_encoding) From e20c29f6c48ede025f75ceeddda0f4609a91a8b1 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 13 Nov 2017 14:55:17 -0500 Subject: [PATCH 28/68] updating zarr backend with new indexing mixins --- xarray/backends/zarr.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1d69136d7e2..a5919483822 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -40,7 +40,6 @@ # these functions handle encoding / decoding of such items def _encode_zarr_attr_value(value): # what is the most duck-type friendly way to do this check - print("encoding zarr attr value %r" % value) if isinstance(value, np.ndarray): return value.tolist() # I don't know how to check generically if something is a numpy scalar @@ -56,9 +55,7 @@ def _encode_zarr_attr_value(value): def _ensure_valid_fill_value(value, dtype): - print('ensure_valid_fill_value (%r, %r)' % (value, dtype)) if dtype.type == np.string_ and type(value) == bytes: - print('decoding ascii') return value.decode('ascii') else: return value @@ -80,7 +77,8 @@ def _decode_zarr_attrs(attrs): for k, v in attrs.items()]) -class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): +class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin, + indexing.NDArrayIndexable): def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -105,6 +103,10 @@ def __getitem__(self, key): #if self.datastore.is_remote: # pragma: no cover # getitem = functools.partial(robust_getitem, catch=RuntimeError) #else: + if isinstance(key, indexing.VectorizedIndexer): + raise NotImplementedError( + 'Vectorized indexing for {} is not implemented. Load your ' + 'data first with .load() or .compute().'.format(type(self))) getitem = operator.getitem try: data = getitem(self.get_array(), key) @@ -327,7 +329,6 @@ def open_store_variable(self, name, zarr_array): name = 'zarr_array-%s' % token # do we need to worry about the zarr synchronizer / dask lock? 
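# A sanity check (not part of the patch): tokenize is deterministic for
# equal inputs, so reopening the same zarr array reuses the same dask key
# ('example.zarr' and 'foo' below are hypothetical labels):
from dask.base import tokenize
assert tokenize('example.zarr', 'foo') == tokenize('example.zarr', 'foo')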
lock = self._synchronizer - print("Chunking variable") var = var.chunk(chunks=zarr_array.chunks, name=name, lock=lock) return var @@ -350,7 +351,10 @@ def get_dimensions(self): self._DIMENSION_KEY) return dimensions - def set_dimension(self, name, length): + def set_dimension(self, name, length, is_unlimited=False): + if is_unlimited: + raise NotImplementedError("Zarr backend doesn't know how to " + "handle unlimited dimensions.") with self.ensure_open(autoclose=False): self.ds.attrs[self._DIMENSION_KEY][name] = length @@ -390,9 +394,6 @@ def prepare_variable(self, name, variable, check_encoding=False, encoding = _extract_zarr_variable_encoding( variable, raise_on_invalid=check_encoding) - - print('preparing variable with attributes %r' % attrs) - print('preparing variable with encoding %r' % encoding) ### arguments for zarr.create # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, From f82c8c1f48b7d19cd1addd87391c8ff9904b239f Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 13 Nov 2017 15:20:05 -0500 Subject: [PATCH 29/68] added new zarr dev test env --- ci/requirements-py36-zarr-dev.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 ci/requirements-py36-zarr-dev.yml diff --git a/ci/requirements-py36-zarr-dev.yml b/ci/requirements-py36-zarr-dev.yml new file mode 100644 index 00000000000..43c2a84617c --- /dev/null +++ b/ci/requirements-py36-zarr-dev.yml @@ -0,0 +1,20 @@ +name: test_env +channels: + - conda-forge +dependencies: + - python=3.6 + - dask + - distributed + - matplotlib + - pytest + - flake8 + - numpy + - pandas + - scipy + - seaborn + - toolz + - bottleneck + - pip: + - coveralls + - pytest-cov + - git@github.com:alimanfoo/zarr.git From 43e539f33c05db7746ab32710513c45ec23ef660 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 13 Nov 2017 15:29:45 -0500 Subject: [PATCH 30/68] update travis --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index d797e9844bc..3ebd497997d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,8 @@ matrix: env: CONDA_ENV=py36-pynio-dev - python: 3.6 env: CONDA_ENV=py36-rasterio1.0alpha + - python: 3.6 + env: CONDA_ENV=py36-zarr-dev allow_failures: - python: 3.6 env: From 66299f0e2b379e3c33026b077ee2d0a2edddf312 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 13 Nov 2017 15:33:21 -0500 Subject: [PATCH 31/68] move zarr-dev to travis allowed failures --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 3ebd497997d..2c6ad370f26 100644 --- a/.travis.yml +++ b/.travis.yml @@ -69,6 +69,8 @@ matrix: env: CONDA_ENV=py36-pynio-dev - python: 3.6 env: CONDA_ENV=py36-rasterio1.0alpha + - python: 3.6 + env: CONDA_ENV=py36-zarr-dev before_install: - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then From 2fce362da566aa93ee56f60ebd596c8940133bd5 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 13 Nov 2017 15:36:58 -0500 Subject: [PATCH 32/68] fix typo in env file --- ci/requirements-py36-zarr-dev.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/requirements-py36-zarr-dev.yml b/ci/requirements-py36-zarr-dev.yml index 43c2a84617c..9be522882c5 100644 --- a/ci/requirements-py36-zarr-dev.yml +++ b/ci/requirements-py36-zarr-dev.yml @@ -17,4 +17,4 @@ dependencies: - pip: - coveralls - pytest-cov - - git@github.com:alimanfoo/zarr.git + - git+https://github.com/alimanfoo/zarr.git From 
c19b81a1a183548f2156527a3073e1b34c393219 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 16 Nov 2017 20:29:38 -0500 Subject: [PATCH 33/68] wip --- xarray/backends/zarr.py | 41 ++++++++++++++++++++--------------- xarray/tests/test_backends.py | 5 ++++- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index a5919483822..b9061b990af 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -9,7 +9,7 @@ import numpy as np -from .. import Variable +from .. import Variable, Dataset from ..core import indexing from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, NdimSizeLenMixin,DunderArrayMixin) @@ -280,11 +280,10 @@ class ZarrStore(WritableCFDataStore, DataStorePickleMixin): _DIMENSION_KEY = '_ARRAY_DIMENSIONS' def __init__(self, store=None, mode='a', synchronizer=None, group=None, - auto_chunk=True, writer=None, autoclose=None): + writer=None, autoclose=None): self._mode = mode self._synchronizer = synchronizer self._group = group - self._auto_chunk = auto_chunk # zarr stores don't need to be opened, closed, or synced. # So what do we do with all this logical about openers? @@ -320,18 +319,7 @@ def open_store_variable(self, name, zarr_array): 'filters': zarr_array.filters, 'fill_value': zarr_array.fill_value} - var = Variable(dimensions, data, attributes, encoding) - - if self._auto_chunk: - from dask.base import tokenize - # is this token enough? - token = tokenize(zarr_array) - name = 'zarr_array-%s' % token - # do we need to worry about the zarr synchronizer / dask lock? - lock = self._synchronizer - var = var.chunk(chunks=zarr_array.chunks, name=name, lock=lock) - - return var + return Variable(dimensions, data, attributes, encoding) def get_variables(self): @@ -533,5 +521,24 @@ def maybe_decode_store(store, lock=False): return ds zarr_store = ZarrStore(store=store, mode=mode, synchronizer=synchronizer, - group=group, auto_chunk=auto_chunk) - return maybe_decode_store(zarr_store) + group=group) + ds = maybe_decode_store(zarr_store) + + # auto chunking needs to be here and not in ZarrStore because variable + # chunks do not survive decode_cf + if auto_chunk: + # adapted from Dataset.Chunk() + def maybe_chunk(name, var): + chunks = var.encoding.get('chunks') + if (var.ndim > 0) and (chunks is not None): + token2 = tokenize(name, token if token else var._data) + name2 = 'zarr-%s-%s' % (name, token2) + return var.chunk(chunks, name=name2, lock=None) + else: + return var + + variables = OrderedDict([(k, maybe_chunk(k, v)) + for k, v in ds.variables.items()]) + return ds._replace_vars_and_dims(variables) + else: + return ds diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a3e809057b6..b6111774acd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -30,7 +30,7 @@ requires_pynio, requires_pathlib, requires_zarr, requires_rasterio, has_netCDF4, has_scipy, assert_allclose, flaky, network, assert_identical, raises_regex) - + from .test_dataset import create_test_data from xarray.tests import mock, assert_identical @@ -1088,7 +1088,9 @@ def roundtrip(self, data, save_kwargs={}, open_kwargs={}, allow_cleanup_failure=False): with create_tmp_file(suffix='.zarr', allow_cleanup_failure=allow_cleanup_failure) as tmp_file: + print("__to_zarr__") data.to_zarr(store=tmp_file, **save_kwargs) + print ("__open_zarr__") yield xr.open_zarr(tmp_file, **open_kwargs) def test_auto_chunk(self): @@ -1108,6 +1110,7 @@ def test_auto_chunk(self): # only 
index variables should be in memory self.assertEqual(v._in_memory, k in actual.dims) # chunk size should be the same as original + print('%s v.variable.chunks' % k, v.variable.chunks) self.assertEqual(v.chunks, original[k].chunks) From 68b8f07be4a5ebd921bca1b94a2d525329a82404 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 16 Nov 2017 20:36:29 -0500 Subject: [PATCH 34/68] fixed zarr auto_chunk --- xarray/backends/zarr.py | 3 ++- xarray/tests/test_backends.py | 5 +---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b9061b990af..4f5fb66a541 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -529,9 +529,10 @@ def maybe_decode_store(store, lock=False): if auto_chunk: # adapted from Dataset.Chunk() def maybe_chunk(name, var): + from dask.base import tokenize chunks = var.encoding.get('chunks') if (var.ndim > 0) and (chunks is not None): - token2 = tokenize(name, token if token else var._data) + token2 = tokenize(name, var._data) name2 = 'zarr-%s-%s' % (name, token2) return var.chunk(chunks, name=name2, lock=None) else: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b6111774acd..ec1203dc902 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1088,13 +1088,11 @@ def roundtrip(self, data, save_kwargs={}, open_kwargs={}, allow_cleanup_failure=False): with create_tmp_file(suffix='.zarr', allow_cleanup_failure=allow_cleanup_failure) as tmp_file: - print("__to_zarr__") data.to_zarr(store=tmp_file, **save_kwargs) - print ("__open_zarr__") yield xr.open_zarr(tmp_file, **open_kwargs) def test_auto_chunk(self): - original = create_test_data ().chunk() + original = create_test_data().chunk() with self.roundtrip(original, open_kwargs={'auto_chunk': False}) as actual: @@ -1110,7 +1108,6 @@ def test_auto_chunk(self): # only index variables should be in memory self.assertEqual(v._in_memory, k in actual.dims) # chunk size should be the same as original - print('%s v.variable.chunks' % k, v.variable.chunks) self.assertEqual(v.chunks, original[k].chunks) From 0ea0dadb83dd5332bf3445a2553530e6baae6124 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 16 Nov 2017 21:06:53 -0500 Subject: [PATCH 35/68] refactored zarr tests --- xarray/tests/test_backends.py | 46 ++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ec1203dc902..7a35e201650 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1077,20 +1077,7 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): @requires_zarr -class ZarrDataTest(CFEncodedDataTest, TestCase): - @contextlib.contextmanager - def create_store(self): - with create_tmp_file(suffix='.zarr') as tmp: - yield backends.ZarrStore(store=tmp) - - @contextlib.contextmanager - def roundtrip(self, data, save_kwargs={}, open_kwargs={}, - allow_cleanup_failure=False): - with create_tmp_file(suffix='.zarr', - allow_cleanup_failure=allow_cleanup_failure) as tmp_file: - data.to_zarr(store=tmp_file, **save_kwargs) - yield xr.open_zarr(tmp_file, **open_kwargs) - +class BaseZarrTest(CFEncodedDataTest): def test_auto_chunk(self): original = create_test_data().chunk() @@ -1111,6 +1098,37 @@ def test_auto_chunk(self): self.assertEqual(v.chunks, original[k].chunks) +@requires_zarr +class ZarrDictStoreTest(BaseZarrTest, TestCase): + @contextlib.contextmanager + def create_store(self): + 
yield backends.ZarrStore(store={}) + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + dict_store = {} + data.to_zarr(store=dict_store, **save_kwargs) + yield xr.open_zarr(dict_store, **open_kwargs) + + +@requires_zarr +class ZarrDirectoryStoreTest(BaseZarrTest, TestCase): + @contextlib.contextmanager + def create_store(self): + with create_tmp_file(suffix='.zarr') as tmp: + yield backends.ZarrStore(store=tmp) + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + with create_tmp_file(suffix='.zarr', + allow_cleanup_failure=allow_cleanup_failure) as tmp_file: + data.to_zarr(store=tmp_file, **save_kwargs) + yield xr.open_zarr(tmp_file, **open_kwargs) + + + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' From 58b3bf0ef69ad5db979a19c37905f76712bd6757 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 16 Nov 2017 22:21:25 -0500 Subject: [PATCH 36/68] new encoding test --- xarray/backends/zarr.py | 6 ++++-- xarray/tests/test_backends.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 4f5fb66a541..506a2b4c4c8 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -203,8 +203,10 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): "variable.ndim %g" % (enc_chunks_tuple, _DIMENSION_KEY)) - if not all(x is int for x in enc_chunks_tuple): - raise ValueError("zarr chunks much be an int or a tuple of ints") + for x in enc_chunks_tuple: + if not isinstance(x, int): + raise ValueError("zarr chunks must be an int or a tuple of ints. " + "Instead found %r" % (enc_chunks_tuple,)) # if there are chunks in encoding and the variabile data is a numpy array, # we use the specified chunks diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7a35e201650..b0e0ef1473c 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1097,6 +1097,17 @@ def test_auto_chunk(self): # chunk size should be the same as original self.assertEqual(v.chunks, original[k].chunks) + def test_chunk_encoding(self): + data = create_test_data() + chunks = (5, 5) + data['var2'].encoding.update({'chunks': chunks}) + with self.roundtrip(data) as actual: + self.assertEqual(chunks, actual['var2'].encoding['chunks']) + data['var2'].encoding.update({'chunks': (5, 4.5)}) + with pytest.raises(ValueError): + with self.roundtrip(data) as actual: + pass + @requires_zarr class ZarrDictStoreTest(BaseZarrTest, TestCase): From a8b478543a978bd98c37711609c610432fdc7d07 Mon Sep 17 00:00:00 2001 From: Joe Hamman Date: Thu, 16 Nov 2017 21:03:12 -0800 Subject: [PATCH 37/68] cleanup and buildout ZarrArrayWrapper, vectorized indexing --- xarray/backends/zarr.py | 216 +++++++++++++++------------------- xarray/tests/test_backends.py | 15 ++- 2 files changed, 102 insertions(+), 129 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 506a2b4c4c8..23ade7eb4fa 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -2,38 +2,26 @@ from __future__ import division from __future__ import print_function import functools -import warnings from itertools import product -from collections import MutableMapping -import operator +# import operator import numpy as np -from .. import Variable, Dataset +from .. 
import Variable from ..core import indexing -from ..core.utils import (FrozenOrderedDict, close_on_error, HiddenKeyDict, - NdimSizeLenMixin,DunderArrayMixin) -from ..core.pycompat import (iteritems, bytes_type, unicode_type, OrderedDict, - basestring) +from ..core.utils import FrozenOrderedDict, HiddenKeyDict +from ..core.pycompat import iteritems, OrderedDict -from .common import (WritableCFDataStore, AbstractWritableDataStore, - DataStorePickleMixin) +from .common import WritableCFDataStore, DataStorePickleMixin, BackendArray from .. import conventions # this is a private method but we need it for open_zar from .api import _protect_dataset_variables_inplace -# most of the other stores have some kind of wrapper class like -# class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): -# class H5NetCDFArrayWrapper(BaseNetCDF4Array): -# class NioArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): -# we problaby need something like this +# need some special secret attributes to tell us the dimensions +_DIMENSION_KEY = '_ARRAY_DIMENSIONS' -# the first question is whether it should be based on BaseNetCDF4Array or -# NdimSizeLenMixing? - -# or maybe we don't need wrappers at all? probably not true # zarr attributes have to be serializable as json # many xarray datasets / variables have numpy arrays and values @@ -61,11 +49,7 @@ def _ensure_valid_fill_value(value, dtype): return value -def _encode_zarr_attrs(attrs): - return OrderedDict([(k, _encode_zarr_attr_value(v)) - for k, v in attrs.items()]) - - +# TODO: cleanup/combine these next two functions def _decode_zarr_attr_value(value): # what happens if we just don't decode anything? # does it matter that we don't convert back to numpy types? @@ -77,8 +61,28 @@ def _decode_zarr_attrs(attrs): for k, v in attrs.items()]) -class ZarrArrayWrapper(NdimSizeLenMixin, DunderArrayMixin, - indexing.NDArrayIndexable): +# untested, but I think this does the appropriate shape munging to make slices +# appear as the last axes of the result array +def _replace_slices_with_arrays(key, shape): + num_slices = sum(1 for k in key if isinstance(k, slice)) + num_arrays = len(shape) - num_slices + new_key = [] + slice_count = 0 + for k, size in zip(key, shape): + if isinstance(k, slice): + array = np.arange(*k.indices(size)) + sl = [np.newaxis] * len(shape) + sl[num_arrays + slice_count] = np.newaxis + k = array[sl] + slice_count += 1 + else: + assert isinstance(k, np.ndarray) + k = k[(slice(None),) * num_arrays + (np.newaxis,) * num_slices] + new_key.append(k) + return tuple(new_key) + + +class ZarrArrayWrapper(BackendArray): def __init__(self, variable_name, datastore): self.datastore = datastore self.variable_name = variable_name @@ -98,53 +102,31 @@ def get_array(self): return self.datastore.ds[self.variable_name] def __getitem__(self, key): - # TODO: do we want to use robust_getitem for certain types of - # zarr store (e.g. S3)? - #if self.datastore.is_remote: # pragma: no cover - # getitem = functools.partial(robust_getitem, catch=RuntimeError) - #else: - if isinstance(key, indexing.VectorizedIndexer): - raise NotImplementedError( - 'Vectorized indexing for {} is not implemented. Load your ' - 'data first with .load() or .compute().'.format(type(self))) - getitem = operator.getitem - try: - data = getitem(self.get_array(), key) - except IndexError: - # Catch IndexError in netCDF4 and return a more informative - # error message. This is most often called when an unsorted - # indexer is used before the data is loaded from disk. 
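
A note on _replace_slices_with_arrays above, which its own comment flags as
untested: the slice branch assigns np.newaxis at the kept axis, where
slice(None) is required, so every arange collapses onto the last axis and two
slices would broadcast elementwise rather than orthogonally. A corrected
sketch, with illustrative indexer values and the expected shapes spelled out:

    import numpy as np

    def replace_slices_with_arrays(key, shape):
        # arrays are broadcast against the leading axes of the result;
        # slices become aranges along the trailing axes
        num_slices = sum(1 for k in key if isinstance(k, slice))
        num_arrays = len(shape) - num_slices
        new_key, slice_count = [], 0
        for k, size in zip(key, shape):
            if isinstance(k, slice):
                array = np.arange(*k.indices(size))
                sl = [np.newaxis] * len(shape)
                sl[num_arrays + slice_count] = slice(None)  # keep this axis
                k = array[tuple(sl)]
                slice_count += 1
            else:
                assert isinstance(k, np.ndarray)
                k = k[(slice(None),) * num_arrays +
                      (np.newaxis,) * num_slices]
            new_key.append(k)
        return tuple(new_key)

    key = replace_slices_with_arrays((np.array([0, 2]), slice(3)), (5, 4))
    assert key[0].shape == (2, 1)  # array indexer on a leading axis
    assert key[1].shape == (1, 3)  # slice became an arange, trailing axis
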
- msg = ('The indexing operation you are attempting to perform ' - 'is not valid on zarr.core.Array object. Try loading ' - 'your data into memory first by calling .load().') - if not PY3: - import traceback - msg += '\n\nOriginal traceback:\n' + traceback.format_exc() - raise IndexError(msg) - return data - + array = self.get_array() + if isinstance(key, indexing.BasicIndexer): + return array[key.tuple] + elif isinstance(key, indexing.VectorizedIndexer): + return array.vindex[_replace_slices_with_arrays(key.tuple, + self.shape)] + else: + assert isinstance(key, indexing.OuterIndexer) + return array.oindex[key.tuple] # if self.ndim == 0: # could possibly have a work-around for 0d data here -# keyword args for zarr.group -# store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None -# the group name is called "path" in the zarr lexicon - -# args for zarr.open_group -# store=None, mode='a', synchronizer=None, path=None def _open_zarr_group(store=None, overwrite=None, synchronizer=None, group=None, mode=None): + '''Wrap zarr.open_group''' + import zarr - #zarr_group = zarr.group(store=store, overwrite=overwrite, - # chunk_store=chunk_store, synchronizer=synchronizer, path=path) - zarr_group = zarr.open_group(store=store, mode=mode, - synchronizer=synchronizer, path=group) + zarr_group = zarr.open_group(store=store, mode=mode, + synchronizer=synchronizer, path=group) return zarr_group def _dask_chunks_to_zarr_chunks(chunks): - # this function dask chunks syntax to zarr chunks + '''this function dask chunks syntax to zarr chunks''' if chunks is None: return chunks @@ -156,6 +138,7 @@ def _dask_chunks_to_zarr_chunks(chunks): chunks) return first_chunk + def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): """ Given encoding chunks (possibly None) and variable chunks (possibly None) @@ -165,8 +148,8 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # chunks : int or tuple of ints, optional # Chunk shape. If not provided, will be guessed from shape and dtype. - # if there are no chunks in encoding and the variable data is a numpy array, - # then we let zarr use its own heuristics to pick the chunks + # if there are no chunks in encoding and the variable data is a numpy + # array, then we let zarr use its own heuristics to pick the chunks if var_chunks is None and enc_chunks is None: return None @@ -181,10 +164,10 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): first_var_chunk = next(all_var_chunks) for this_chunk in all_var_chunks: if not (this_chunk == first_var_chunk): - raise ValueError("zarr requires uniform chunk sizes, but " - "variable has non-uniform chunks %r. " - "Consider rechunking the data using `chunk()`." % - var_chunks) + raise ValueError( + "zarr requires uniform chunk sizes, but variable has " + "non-uniform chunks %r. Consider rechunking the data " + "using `chunk()`." 
% var_chunks) return first_var_chunk # from here on, we are dealing with user-specified chunks in encoding @@ -195,7 +178,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # this coerces a single int to a tuple but leaves a tuple as is enc_chunks_tuple = tuple(enc_chunks) - if len(enc_chunks_tuple)==1: + if len(enc_chunks_tuple) == 1: enc_chunks_tuple = ndim * enc_chunks_tuple if not len(enc_chunks_tuple) == ndim: @@ -217,21 +200,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk # this avoids the need to get involved in zarr synchronization / locking # From zarr docs: - # "If each worker in a parallel computation is writing to a separate region - # of the array, and if region boundaries are perfectly aligned with chunk - # boundaries, then no synchronization is required." + # "If each worker in a parallel computation is writing to a separate + # region of the array, and if region boundaries are perfectly aligned + # with chunk boundaries, then no synchronization is required." if var_chunks and enc_chunks_tuple: for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks): for dchunk in dchunks: if not dchunk % zchunk == 0: - raise ValueError("Specified zarr chunks %r would" - "overlap multiple dask chunks %r." - "Consider rechunking the data using `chunk()` " - "or specifying different chunks in encoding." - % (enc_chunks_tuple, var_chunks)) + raise ValueError( + "Specified zarr chunks %r would overlap multiple dask " + "chunks %r. Consider rechunking the data using " + "`chunk()` or specifying different chunks in encoding." + % (enc_chunks_tuple, var_chunks)) return enc_chunks_tuple - raise RuntimeError("We should never get here. Function logic must be wrong.") + raise RuntimeError( + "We should never get here. 
Function logic must be wrong.") def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): @@ -244,21 +228,17 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): return dimensions, attributes -### arguments for zarr.create -# zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', -# fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, -# path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) - def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): encoding = variable.encoding.copy() - valid_encodings = set(['chunks', 'compressor', 'filters', 'cache_metadata']) + valid_encodings = set(['chunks', 'compressor', 'filters', + 'cache_metadata']) if raise_on_invalid: invalid = [k for k in encoding if k not in valid_encodings] if invalid: - raise ValueError('unexpected encoding parameters for zarr backend: ' - ' %r' % invalid) + raise ValueError('unexpected encoding parameters for zarr ' + 'backend: %r' % invalid) else: for k in list(encoding): if k not in valid_encodings: @@ -278,9 +258,6 @@ class ZarrStore(WritableCFDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr """ - # need some special secret attributes to tell us the dimensions - _DIMENSION_KEY = '_ARRAY_DIMENSIONS' - def __init__(self, store=None, mode='a', synchronizer=None, group=None, writer=None, autoclose=None): self._mode = mode @@ -300,10 +277,9 @@ def __init__(self, store=None, mode='a', synchronizer=None, group=None, self._opener = opener self.ds = self._opener(mode=mode) - # initialize hidden dimension attribute - if self._DIMENSION_KEY not in self.ds.attrs: - self.ds.attrs[self._DIMENSION_KEY] = {} + if _DIMENSION_KEY not in self.ds.attrs: + self.ds.attrs[_DIMENSION_KEY] = {} # do we need to define attributes for all of the opener keyword args? 
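        # To make the dimension convention concrete (a sketch with made-up
        # names, using only public zarr calls):
        #
        #   import zarr
        #   g = zarr.group()
        #   a = g.zeros('var1', shape=(3, 4))
        #   a.attrs['_ARRAY_DIMENSIONS'] = ['y', 'x']
        #   a.attrs['units'] = 'm'
        #
        # HiddenKeyDict then filters '_ARRAY_DIMENSIONS' out of attrs, so
        # the user only ever sees {'units': 'm'}.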
super(ZarrStore, self).__init__(writer) @@ -313,8 +289,8 @@ def open_store_variable(self, name, zarr_array): # zarr seems to implement the required ndarray interface # TODO: possibly wrap zarr array in dask with aligned chunks data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) - dimensions, attributes = _get_zarr_dims_and_attrs( - zarr_array, self._DIMENSION_KEY) + dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, + _DIMENSION_KEY) attributes = _decode_zarr_attrs(attributes) encoding = {'chunks': zarr_array.chunks, 'compressor': zarr_array.compressor, @@ -323,7 +299,6 @@ def open_store_variable(self, name, zarr_array): return Variable(dimensions, data, attributes, encoding) - def get_variables(self): with self.ensure_open(autoclose=False): return FrozenOrderedDict((k, self.open_store_variable(k, v)) @@ -331,28 +306,25 @@ def get_variables(self): def get_attrs(self): with self.ensure_open(autoclose=True): - _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._DIMENSION_KEY) + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) return _decode_zarr_attrs(attributes) def get_dimensions(self): with self.ensure_open(autoclose=True): - dimensions, _ = _get_zarr_dims_and_attrs(self.ds, - self._DIMENSION_KEY) + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) return dimensions def set_dimension(self, name, length, is_unlimited=False): if is_unlimited: - raise NotImplementedError("Zarr backend doesn't know how to " - "handle unlimited dimensions.") + raise NotImplementedError( + "Zarr backend doesn't know how to handle unlimited dimensions") with self.ensure_open(autoclose=False): - self.ds.attrs[self._DIMENSION_KEY][name] = length + self.ds.attrs[_DIMENSION_KEY][name] = length def set_attribute(self, key, value): with self.ensure_open(autoclose=False): - _, attributes = _get_zarr_dims_and_attrs(self.ds, - self._DIMENSION_KEY) - attributes[key] = _encode_zarr_attr_value(value) + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + attributes[key] = _encode_zarr_attr_value(value) def prepare_variable(self, name, variable, check_encoding=False, unlimited_dims=None): @@ -384,10 +356,11 @@ def prepare_variable(self, name, variable, check_encoding=False, encoding = _extract_zarr_variable_encoding( variable, raise_on_invalid=check_encoding) - ### arguments for zarr.create - # zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', - # fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, - # path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) + # arguments for zarr.create: + # zarr.creation.create(shape, chunks=None, dtype=None, + # compressor='default', fill_value=0, order='C', store=None, + # synchronizer=None, overwrite=False, path=None, chunk_store=None, + # filters=None, cache_metadata=True, **kwargs) zarr_array = self.ds.create(name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding) # decided not to explicity enumerate encoding options because we @@ -400,9 +373,8 @@ def prepare_variable(self, name, variable, check_encoding=False, # cache_metadata=encoding.get('cache_metadata')) # the magic for storing the hidden dimension data - zarr_array.attrs[self._DIMENSION_KEY] = dims - _, attributes = _get_zarr_dims_and_attrs(zarr_array, - self._DIMENSION_KEY) + zarr_array.attrs[_DIMENSION_KEY] = dims + _, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY) for k, v in iteritems(attrs): attributes[k] = _encode_zarr_attr_value(v) @@ -415,16 +387,17 @@ 
def prepare_variable(self, name, variable, check_encoding=False, # from zarr docs # Zarr arrays can be used as either the source or sink for data in parallel -# computations. Both multi-threaded and multi-process parallelism are supported. -# The Python global interpreter lock (GIL) is released for both compression and -# decompression operations, so Zarr will not block other Python threads from running. +# computations. Both multi-threaded and multi-process parallelism are +# supported. The Python global interpreter lock (GIL) is released for both +# compression and decompression operations, so Zarr will not block other Python +# threads from running. # # A Zarr array can be read concurrently by multiple threads or processes. No # synchronization (i.e., locking) is required for concurrent reads. # # A Zarr array can also be written to concurrently by multiple threads or -# processes. Some synchronization may be required, depending on the way the data -# is being written. +# processes. Some synchronization may be required, depending on the way the +# data is being written. # If each worker in a parallel computation is writing to a separate region of # the array, and if region boundaries are perfectly aligned with chunk @@ -433,13 +406,10 @@ def prepare_variable(self, name, variable, check_encoding=False, # avoid two workers attempting to modify the same chunk at the same time. - - def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, - decode_cf=True, - mask_and_scale=True, decode_times=True, autoclose=False, - concat_characters=True, decode_coords=True, - cache=False, drop_variables=None): + decode_cf=True, mask_and_scale=True, decode_times=True, + autoclose=False, concat_characters=True, decode_coords=True, + cache=False, drop_variables=None): """Load and decode a dataset from a file or file-like object. 
Parameters diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5b3acd83bb7..66c7e3539be 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1081,16 +1081,16 @@ class BaseZarrTest(CFEncodedDataTest): def test_auto_chunk(self): original = create_test_data().chunk() - with self.roundtrip(original, - open_kwargs={'auto_chunk': False}) as actual: + with self.roundtrip( + original, open_kwargs={'auto_chunk': False}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory self.assertEqual(v._in_memory, k in actual.dims) # there should be no chunks self.assertEqual(v.chunks, None) - with self.roundtrip(original, - open_kwargs={'auto_chunk': True}) as actual: + with self.roundtrip( + original, open_kwargs={'auto_chunk': True}) as actual: for k, v in actual.variables.items(): # only index variables should be in memory self.assertEqual(v._in_memory, k in actual.dims) @@ -1108,6 +1108,9 @@ def test_chunk_encoding(self): with self.roundtrip(data) as actual: pass + def test_vectorized_indexing(self): + self._test_vectorized_indexing(vindex_support=True) + @requires_zarr class ZarrDictStoreTest(BaseZarrTest, TestCase): @@ -1133,13 +1136,13 @@ def create_store(self): @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, allow_cleanup_failure=False): - with create_tmp_file(suffix='.zarr', + with create_tmp_file( + suffix='.zarr', allow_cleanup_failure=allow_cleanup_failure) as tmp_file: data.to_zarr(store=tmp_file, **save_kwargs) yield xr.open_zarr(tmp_file, **open_kwargs) - @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' From 021d3bab623c6562b61415df01d3c0e88085ad23 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 27 Nov 2017 13:36:30 -0500 Subject: [PATCH 38/68] more wip --- xarray/backends/zarr.py | 25 ++++++++++++++++--------- xarray/tests/test_backends.py | 1 - 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 23ade7eb4fa..dced7386dc1 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -44,9 +44,10 @@ def _encode_zarr_attr_value(value): def _ensure_valid_fill_value(value, dtype): if dtype.type == np.string_ and type(value) == bytes: - return value.decode('ascii') + valid = value.decode('ascii') else: - return value + valid = value + return _encode_zarr_attr_value(value) # TODO: cleanup/combine these next two functions @@ -294,8 +295,14 @@ def open_store_variable(self, name, zarr_array): attributes = _decode_zarr_attrs(attributes) encoding = {'chunks': zarr_array.chunks, 'compressor': zarr_array.compressor, - 'filters': zarr_array.filters, - 'fill_value': zarr_array.fill_value} + 'filters': zarr_array.filters} + # I use _FillValue as the encoding key to be consistent + # with CF / netCDF. Is this right? Could also just call it + # 'fill_value'. Does it matter? + # Am I making things harder by tring to make ZarrStore + # CFDataStore when it is not? + if getattr(zarr_array, 'fill_value') is not None: + encoding['_FillValue'] = zarr_array.fill_value return Variable(dimensions, data, attributes, encoding) @@ -342,15 +349,15 @@ def prepare_variable(self, name, variable, check_encoding=False, # in an attribute. This seems redundant and error prone. How can # we do better? 
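        # (Why this is delicate: zarr keeps attributes and fill_value in
        # JSON metadata, and raw bytes are not JSON-serializable, e.g.:
        #
        #   import json
        #   json.dumps({'fill_value': 0.0})   # fine
        #   json.dumps({'fill_value': b'X'})  # TypeError
        #
        # hence the ascii-decoding in _ensure_valid_fill_value above.)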
# Also, this needs to be encoded as a zarr attr - fill_value = _ensure_valid_fill_value( - _encode_zarr_attr_value(attrs.get('_FillValue', None)), - dtype) + fill_value = _ensure_valid_fill_value(attrs.pop('_FillValue', None), + dtype) + # TODO: figure out what this is for (it's copied from netCDF4) if fill_value in ['\x00']: fill_value = None # messy! fix - if fill_value is not None: - attrs['_FillValue'] = fill_value + #if fill_value is not None: + # attrs['_FillValue'] = fill_value # TODO: figure out what encoding is needed for zarr encoding = _extract_zarr_variable_encoding( diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 66c7e3539be..4a898623934 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -671,7 +671,6 @@ def test_default_fill_value(self): ds = Dataset({'x': ('y', np.arange(10.0))}) kwargs = dict(encoding={'x': {'dtype': 'f4'}}) with self.roundtrip(ds, save_kwargs=kwargs) as actual: - print("actual.x.encoding", actual.x.encoding) self.assertEqual(actual.x.encoding['_FillValue'], np.nan) self.assertEqual(ds.x.encoding, {}) From a4b024e77f5ff58e0ad8ff5c4bc8e283743d9be6 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 27 Nov 2017 15:13:38 -0500 Subject: [PATCH 39/68] very close to passing all tests --- xarray/backends/zarr.py | 26 +++++++------------------- xarray/tests/test_backends.py | 1 + 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index dced7386dc1..65d988acc28 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -47,7 +47,7 @@ def _ensure_valid_fill_value(value, dtype): valid = value.decode('ascii') else: valid = value - return _encode_zarr_attr_value(value) + return _encode_zarr_attr_value(valid) # TODO: cleanup/combine these next two functions @@ -296,13 +296,10 @@ def open_store_variable(self, name, zarr_array): encoding = {'chunks': zarr_array.chunks, 'compressor': zarr_array.compressor, 'filters': zarr_array.filters} - # I use _FillValue as the encoding key to be consistent - # with CF / netCDF. Is this right? Could also just call it - # 'fill_value'. Does it matter? - # Am I making things harder by tring to make ZarrStore - # CFDataStore when it is not? + # _FillValue needs to be in attributes, not encoding, so it will get + # picked up by decode_cf if getattr(zarr_array, 'fill_value') is not None: - encoding['_FillValue'] = zarr_array.fill_value + attributes['_FillValue'] = zarr_array.fill_value return Variable(dimensions, data, attributes, encoding) @@ -344,20 +341,8 @@ def prepare_variable(self, name, variable, check_encoding=False, # TODO: figure out how zarr should deal with unlimited dimensions self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) - # netcdf uses pop not get...yet it works. Why? - # here we are basically duplicating zarr's own internal fill_value - # in an attribute. This seems redundant and error prone. How can - # we do better? - # Also, this needs to be encoded as a zarr attr fill_value = _ensure_valid_fill_value(attrs.pop('_FillValue', None), dtype) - # TODO: figure out what this is for (it's copied from netCDF4) - if fill_value in ['\x00']: - fill_value = None - - # messy! 
fix - #if fill_value is not None: - # attrs['_FillValue'] = fill_value # TODO: figure out what encoding is needed for zarr encoding = _extract_zarr_variable_encoding( @@ -368,6 +353,7 @@ def prepare_variable(self, name, variable, check_encoding=False, # compressor='default', fill_value=0, order='C', store=None, # synchronizer=None, overwrite=False, path=None, chunk_store=None, # filters=None, cache_metadata=True, **kwargs) + print('creating', name, shape, dtype) zarr_array = self.ds.create(name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding) # decided not to explicity enumerate encoding options because we @@ -510,7 +496,9 @@ def maybe_decode_store(store, lock=False): def maybe_chunk(name, var): from dask.base import tokenize chunks = var.encoding.get('chunks') + print('chunks', chunks) if (var.ndim > 0) and (chunks is not None): + # does this cause any data to be read? token2 = tokenize(name, var._data) name2 = 'zarr-%s-%s' % (name, token2) return var.chunk(chunks, name=name2, lock=None) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9870d6f6b17..b1e7a306848 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -501,6 +501,7 @@ def test_roundtrip_bytes_with_fill_value(self): encoding = {'_FillValue': b'X', 'dtype': 'S1'} original = Dataset({'x': ('t', values, {}, encoding)}) expected = original.copy(deep=True) + print(original) with self.roundtrip(original) as actual: self.assertDatasetIdentical(expected, actual) From 54d116d243739bc5ee92259ca4ef7c4453ca2e2c Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 29 Nov 2017 15:34:55 -0500 Subject: [PATCH 40/68] modified inheritance --- xarray/backends/zarr.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 65d988acc28..4043f8f2bde 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -255,7 +255,7 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): return encoding -class ZarrStore(WritableCFDataStore, DataStorePickleMixin): +class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr """ @@ -373,7 +373,14 @@ def prepare_variable(self, name, variable, check_encoding=False, attributes[k] = _encode_zarr_attr_value(v) return zarr_array, variable.data - + + + def store(self, variables, attributes, *args, **kwargs): + # All NetCDF files get CF encoded by default, without this attempting + # to write times, for example, would fail. + cf_variables, cf_attrs = cf_encoder(variables, attributes) + AbstractWritableDataStore.store(self, cf_variables, cf_attrs, + *args, **kwargs) # sync() and close() methods should not be needed with zarr From 94678f49884b0284bd9638cba51a1df586ef8b7f Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 29 Nov 2017 16:21:31 -0500 Subject: [PATCH 41/68] subclass AbstractWriteableDataStore --- xarray/backends/zarr.py | 47 +++++++++++++++++++++++++++++++++++------ 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 4043f8f2bde..23561b30f5f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -12,7 +12,8 @@ from ..core.utils import FrozenOrderedDict, HiddenKeyDict from ..core.pycompat import iteritems, OrderedDict -from .common import WritableCFDataStore, DataStorePickleMixin, BackendArray +from .common import (AbstractWritableDataStore, DataStorePickleMixin, + BackendArray) from .. 
import conventions @@ -254,6 +255,41 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): return encoding +# copied from conventions.encode_cf_variable + +def encode_zarr_variable(var, needs_copy=True, name=None): + """ + Converts an Variable into an Variable which follows some + of the CF conventions: + + - Nans are masked using _FillValue (or the deprecated missing_value) + - Rescaling via: scale_factor and add_offset + - datetimes are converted to the CF 'units since time' format + - dtype encodings are enforced. + + Parameters + ---------- + var : xarray.Variable + A variable holding un-encoded data. + + Returns + ------- + out : xarray.Variable + A variable which has been encoded as described above. + """ + var = conventions.maybe_encode_datetime(var, name=name) + var = conventions.maybe_encode_timedelta(var, name=name) + var, needs_copy = conventions.maybe_encode_offset_and_scale(var, + needs_copy, name=name) + var, needs_copy = conventions.maybe_encode_fill_value(var, needs_copy, + name=name) + var = conventions.maybe_encode_nonstring_dtype(var, name=name) + var = conventions.maybe_default_fill_value(var) + var = conventions.maybe_encode_bools(var) + var = conventions.ensure_dtype_not_object(var, name=name) + var = conventions.maybe_encode_string_dtype(var, name=name) + return var + class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): """Store for reading and writing data via zarr @@ -373,13 +409,12 @@ def prepare_variable(self, name, variable, check_encoding=False, attributes[k] = _encode_zarr_attr_value(v) return zarr_array, variable.data - + def store(self, variables, attributes, *args, **kwargs): - # All NetCDF files get CF encoded by default, without this attempting - # to write times, for example, would fail. - cf_variables, cf_attrs = cf_encoder(variables, attributes) - AbstractWritableDataStore.store(self, cf_variables, cf_attrs, + new_vars = OrderedDict((k, encode_zarr_variable(v, name=k)) + for k, v in iteritems(variables)) + AbstractWritableDataStore.store(self, new_vars, attributes, *args, **kwargs) # sync() and close() methods should not be needed with zarr From f5844563e99ee59620bac1840c43f774670f4a7e Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 1 Dec 2017 13:42:21 -0500 Subject: [PATCH 42/68] xfailed certain tests --- xarray/backends/zarr.py | 5 +++++ xarray/tests/test_backends.py | 28 +++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 23561b30f5f..b9b692c237c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -277,6 +277,11 @@ def encode_zarr_variable(var, needs_copy=True, name=None): out : xarray.Variable A variable which has been encoded as described above. """ + + if var.dtype.kind == 'O': + raise NotImplementedError("Variable `%s` is an object. " + "Zarr store can't yet encode objects." 
% name) + var = conventions.maybe_encode_datetime(var, name=name) var = conventions.maybe_encode_timedelta(var, name=name) var, needs_copy = conventions.maybe_encode_offset_and_scale(var, diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8ff105b165d..8126be6e355 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1103,6 +1103,8 @@ def test_auto_chunk(self): # chunk size should be the same as original self.assertEqual(v.chunks, original[k].chunks) + + def test_chunk_encoding(self): data = create_test_data() chunks = (5, 5) @@ -1114,14 +1116,34 @@ def test_chunk_encoding(self): with self.roundtrip(data) as actual: pass + def test_vectorized_indexing(self): self._test_vectorized_indexing(vindex_support=True) + # TODO: implement zarr object encoding and make these tests pass + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_multiindex_not_implemented(self): + super(CFEncodedDataTest, self).test_multiindex_not_implemented() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_bytes_with_fill_value(self): + super(CFEncodedDataTest, self).test_roundtrip_bytes_with_fill_value() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_object_dtype(self): + super(CFEncodedDataTest, self).test_roundtrip_object_dtype() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_string_encoded_characters(self): + super(CFEncodedDataTest, + self).test_roundtrip_string_encoded_characters() + + # TODO: someone who understand caching figure out whether chaching + # makes sense for Zarr backend + @pytest.mark.xfail(reason="Zarr caching not implemented") def test_dataset_caching(self): - # TODO: someone who understand caching figure out whether chaching - # makes sense for Zarr backend - pass + super(CFEncodedDataTest, self).test_dataset_caching() @requires_zarr From c43284e3dee32c27546073d9129823a289d7ae8c Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 4 Dec 2017 11:43:32 -0500 Subject: [PATCH 43/68] pr comments wip --- xarray/backends/zarr.py | 109 +++++++++++++++++++--------------- xarray/tests/test_backends.py | 5 +- 2 files changed, 64 insertions(+), 50 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b9b692c237c..1ea96ca3bdf 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -3,14 +3,14 @@ from __future__ import print_function import functools from itertools import product -# import operator +from base64 import b64encode import numpy as np from .. import Variable from ..core import indexing from ..core.utils import FrozenOrderedDict, HiddenKeyDict -from ..core.pycompat import iteritems, OrderedDict +from ..core.pycompat import iteritems, OrderedDict, integer_types from .common import (AbstractWritableDataStore, DataStorePickleMixin, BackendArray) @@ -28,33 +28,29 @@ # many xarray datasets / variables have numpy arrays and values # these functions handle encoding / decoding of such items def _encode_zarr_attr_value(value): - # what is the most duck-type friendly way to do this check if isinstance(value, np.ndarray): - return value.tolist() - # I don't know how to check generically if something is a numpy scalar - # i.e. np.float32 or np.int8, etc. without checking against each dtype - # manually. 
This was the best I could come up with + encoded = value.tolist() + # this checks if it's a scalar number elif isinstance(value, np.generic): + encoded = value.item() # np.string_('X').item() returns a type `bytes` # zarr still doesn't like that - # causes some fill_value encoding to fail - return value.item() + if type(encoded) is bytes: + encoded = b64encode(encoded) else: - return value + encoded = value + return encoded def _ensure_valid_fill_value(value, dtype): if dtype.type == np.string_ and type(value) == bytes: - valid = value.decode('ascii') + valid = b64encode(value) else: valid = value return _encode_zarr_attr_value(valid) -# TODO: cleanup/combine these next two functions def _decode_zarr_attr_value(value): - # what happens if we just don't decode anything? - # does it matter that we don't convert back to numpy types? return value @@ -63,8 +59,9 @@ def _decode_zarr_attrs(attrs): for k, v in attrs.items()]) -# untested, but I think this does the appropriate shape munging to make slices +# Do the appropriate shape munging to make slices # appear as the last axes of the result array +# TODO: write tests for this def _replace_slices_with_arrays(key, shape): num_slices = sum(1 for k in key if isinstance(k, slice)) num_arrays = len(shape) - num_slices @@ -75,7 +72,7 @@ def _replace_slices_with_arrays(key, shape): array = np.arange(*k.indices(size)) sl = [np.newaxis] * len(shape) sl[num_arrays + slice_count] = np.newaxis - k = array[sl] + k = array[tuple(sl)] slice_count += 1 else: assert isinstance(k, np.ndarray) @@ -93,11 +90,6 @@ def __init__(self, variable_name, datastore): self.shape = array.shape dtype = array.dtype - if dtype is str: - # use object dtype because that's the only way in numpy to - # represent variable length strings; it also prevents automatic - # string concatenation via conventions.decode_cf_variable - dtype = np.dtype('O') self.dtype = dtype def get_array(self): @@ -128,14 +120,14 @@ def _open_zarr_group(store=None, overwrite=None, synchronizer=None, def _dask_chunks_to_zarr_chunks(chunks): - '''this function dask chunks syntax to zarr chunks''' + '''this function translates dask chunk syntax to zarr chunk syntax''' if chunks is None: return chunks all_chunks = product(*chunks) first_chunk = next(all_chunks) for this_chunk in all_chunks: - if not (this_chunk == first_chunk): + if this_chunk != first_chunk: raise ValueError("zarr requires uniform chunk sizes, found %r" % chunks) return first_chunk @@ -162,13 +154,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # while dask chunks can be variable sized # http://dask.pydata.org/en/latest/array-design.html#chunks if var_chunks and enc_chunks is None: - all_var_chunks = product(*var_chunks) - first_var_chunk = next(all_var_chunks) - for this_chunk in all_var_chunks: - if not (this_chunk == first_var_chunk): + all_var_chunks = list(product(*var_chunks)) + first_var_chunk = all_var_chunks[0] + # all but the last chunk have to match exactly + for this_chunk in all_var_chunks[:-1]: + if this_chunk != first_var_chunk: raise ValueError( - "zarr requires uniform chunk sizes, but variable has " - "non-uniform chunks %r. Consider rechunking the data " + "Zarr requires uniform chunk sizes excpet for final chunk. " + "Variable %r has incompatible chunks. Consider rechunking " + "using `chunk()`." 
% var_chunks) + # last chunk is allowed to be smaller + last_var_chunk = all_var_chunks[-1] + for len_first, len_last in zip(first_var_chunk, last_var_chunk): + if len_last > len_first: + raise ValueError( + "Final chunk of Zarr array must be smaller than first. " + "Variable %r has incompatible chunks. Consider rechunking " "using `chunk()`." % var_chunks) return first_var_chunk @@ -178,24 +179,24 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # Here we re-implement this expansion ourselves. That makes the logic of # checking chunk compatibility easier - # this coerces a single int to a tuple but leaves a tuple as is - enc_chunks_tuple = tuple(enc_chunks) - if len(enc_chunks_tuple) == 1: - enc_chunks_tuple = ndim * enc_chunks_tuple + if type(enc_chunks) in integer_types: + enc_chunks_tuple = ndim * (enc_chunks,) + else: + enc_chunks_tuple = tuple(enc_chunks) - if not len(enc_chunks_tuple) == ndim: + if len(enc_chunks_tuple) != ndim: raise ValueError("zarr chunks tuple %r must have same length as " "variable.ndim %g" % - (enc_chunks_tuple, _DIMENSION_KEY)) + (enc_chunks_tuple, ndim)) for x in enc_chunks_tuple: if not isinstance(x, int): - raise ValueError("zarr chunks must be an int or a tuple of ints. " + raise TypeError("zarr chunks must be an int or a tuple of ints. " "Instead found %r" % (enc_chunks_tuple,)) - # if there are chunks in encoding and the variabile data is a numpy array, + # if there are chunks in encoding and the variable data is a numpy array, # we use the specified chunks - if enc_chunks_tuple and var_chunks is None: + if var_chunks is None: return enc_chunks_tuple # the hard case @@ -208,7 +209,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): if var_chunks and enc_chunks_tuple: for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks): for dchunk in dchunks: - if not dchunk % zchunk == 0: + if dchunk % zchunk: raise ValueError( "Specified zarr chunks %r would overlap multiple dask " "chunks %r. Consider rechunking the data using " @@ -216,7 +217,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): % (enc_chunks_tuple, var_chunks)) return enc_chunks_tuple - raise RuntimeError( + raise AssertionError( "We should never get here. Function logic must be wrong.") @@ -225,7 +226,12 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): # an attribute that specifies the dimension. We have to hide this attribute # when we send the attributes to the user. # zarr_obj can be either a zarr group or zarr array - dimensions = zarr_obj.attrs.get(dimension_key) + try: + dimensions = zarr_obj.attrs[dimension_key] + except KeyError: + raise KeyError("Zarr object is missing the attribute `%s`, which is" + "required for xarray to determine variable dimensions." + % (dimension_key)) attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) return dimensions, attributes @@ -255,8 +261,8 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): return encoding -# copied from conventions.encode_cf_variable - +# Function below is copied from conventions.encode_cf_variable. +# The only change is to raise an error for object dtypes. 
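# For example (an illustrative check, not part of the function): a variable
# holding Python strings carries dtype('O'), which zarr cannot serialize yet:
#
#   import numpy as np
#   from xarray import Variable
#   v = Variable(('x',), np.array(['a', 'bc'], dtype=object))
#   v.dtype.kind  # -> 'O', tripping the NotImplementedError below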
def encode_zarr_variable(var, needs_copy=True, name=None): """ Converts an Variable into an Variable which follows some @@ -327,9 +333,6 @@ def __init__(self, store=None, mode='a', synchronizer=None, group=None, super(ZarrStore, self).__init__(writer) def open_store_variable(self, name, zarr_array): - # I don't see why it is necessary to wrap self.ds[name] - # zarr seems to implement the required ndarray interface - # TODO: possibly wrap zarr array in dask with aligned chunks data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY) @@ -364,6 +367,13 @@ def set_dimension(self, name, length, is_unlimited=False): raise NotImplementedError( "Zarr backend doesn't know how to handle unlimited dimensions") with self.ensure_open(autoclose=False): + # consistency check + if name in self.ds.attrs[_DIMENSION_KEY]: + if self.ds.attrs[_DIMENSION_KEY][name] != length: + raise ValueError("Prexisting array dimensions %r " + "encoded in Zarr attributes are incompatible " + "with newly specified dimension `%s`: %g" % + (self.ds.attrs[_DIMENSION_KEY], name, length)) self.ds.attrs[_DIMENSION_KEY][name] = length def set_attribute(self, key, value): @@ -394,7 +404,6 @@ def prepare_variable(self, name, variable, check_encoding=False, # compressor='default', fill_value=0, order='C', store=None, # synchronizer=None, overwrite=False, path=None, chunk_store=None, # filters=None, cache_metadata=True, **kwargs) - print('creating', name, shape, dtype) zarr_array = self.ds.create(name, shape=shape, dtype=dtype, fill_value=fill_value, **encoding) # decided not to explicity enumerate encoding options because we @@ -448,7 +457,7 @@ def store(self, variables, attributes, *args, **kwargs): def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_cf=True, mask_and_scale=True, decode_times=True, - autoclose=False, concat_characters=True, decode_coords=True, + concat_characters=True, decode_coords=True, cache=False, drop_variables=None): """Load and decode a dataset from a file or file-like object. @@ -514,6 +523,9 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, -------- open_dataset """ + if mode not in ['r', 'r+']: + raise ValueError("Mode must be 'r' or 'r+'.") + if not decode_cf: mask_and_scale = False decode_times = False @@ -543,7 +555,6 @@ def maybe_decode_store(store, lock=False): def maybe_chunk(name, var): from dask.base import tokenize chunks = var.encoding.get('chunks') - print('chunks', chunks) if (var.ndim > 0) and (chunks is not None): # does this cause any data to be read? 
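                # (tokenize hashes metadata for the types it knows and does
                # not trigger any dask computation; for plain inputs it is
                # deterministic, e.g.
                #
                #   from dask.base import tokenize
                #   assert tokenize('var1', (4,)) == tokenize('var1', (4,))
                #
                # so reopening the same store yields stable graph keys.)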
token2 = tokenize(name, var._data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8126be6e355..4b2ccdf77f9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1104,6 +1104,9 @@ def test_auto_chunk(self): self.assertEqual(v.chunks, original[k].chunks) + def test_hidden_attrs(self): + pass + def test_chunk_encoding(self): data = create_test_data() @@ -1112,7 +1115,7 @@ def test_chunk_encoding(self): with self.roundtrip(data) as actual: self.assertEqual(chunks, actual['var2'].encoding['chunks']) data['var2'].encoding.update({'chunks': (5, 4.5)}) - with pytest.raises(ValueError): + with pytest.raises(TypeError): with self.roundtrip(data) as actual: pass From 9df6e50b32aa13e59c0e194d1ffe3928b8324b4e Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 4 Dec 2017 11:44:20 -0500 Subject: [PATCH 44/68] removed autoclose --- xarray/backends/zarr.py | 54 +++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 35 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 1ea96ca3bdf..906f799615f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -302,28 +302,17 @@ def encode_zarr_variable(var, needs_copy=True, name=None): return var -class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): +class ZarrStore(AbstractWritableDataStore): """Store for reading and writing data via zarr """ def __init__(self, store=None, mode='a', synchronizer=None, group=None, - writer=None, autoclose=None): + writer=None): self._mode = mode self._synchronizer = synchronizer self._group = group - - # zarr stores don't need to be opened, closed, or synced. - # So what do we do with all this logical about openers? - if autoclose: - raise NotImplementedError('autoclose=True is not implemented ' - 'for the zarr backend') - self._autoclose = False - self._isopen = True - - opener = functools.partial(_open_zarr_group, store=store, + self.ds = _open_zarr_group(store=store, mode=mode, synchronizer=synchronizer, group=group) - self._opener = opener - self.ds = self._opener(mode=mode) # initialize hidden dimension attribute if _DIMENSION_KEY not in self.ds.attrs: @@ -348,38 +337,33 @@ def open_store_variable(self, name, zarr_array): return Variable(dimensions, data, attributes, encoding) def get_variables(self): - with self.ensure_open(autoclose=False): - return FrozenOrderedDict((k, self.open_store_variable(k, v)) - for k, v in self.ds.arrays()) + return FrozenOrderedDict((k, self.open_store_variable(k, v)) + for k, v in self.ds.arrays()) def get_attrs(self): - with self.ensure_open(autoclose=True): - _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) - return _decode_zarr_attrs(attributes) + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + return _decode_zarr_attrs(attributes) def get_dimensions(self): - with self.ensure_open(autoclose=True): - dimensions, _ = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) - return dimensions + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + return dimensions def set_dimension(self, name, length, is_unlimited=False): if is_unlimited: raise NotImplementedError( "Zarr backend doesn't know how to handle unlimited dimensions") - with self.ensure_open(autoclose=False): - # consistency check - if name in self.ds.attrs[_DIMENSION_KEY]: - if self.ds.attrs[_DIMENSION_KEY][name] != length: - raise ValueError("Prexisting array dimensions %r " - "encoded in Zarr attributes are incompatible " - "with newly specified 
dimension `%s`: %g" % - (self.ds.attrs[_DIMENSION_KEY], name, length)) - self.ds.attrs[_DIMENSION_KEY][name] = length + # consistency check + if name in self.ds.attrs[_DIMENSION_KEY]: + if self.ds.attrs[_DIMENSION_KEY][name] != length: + raise ValueError("Prexisting array dimensions %r " + "encoded in Zarr attributes are incompatible " + "with newly specified dimension `%s`: %g" % + (self.ds.attrs[_DIMENSION_KEY], name, length)) + self.ds.attrs[_DIMENSION_KEY][name] = length def set_attribute(self, key, value): - with self.ensure_open(autoclose=False): - _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) - attributes[key] = _encode_zarr_attr_value(value) + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + attributes[key] = _encode_zarr_attr_value(value) def prepare_variable(self, name, variable, check_encoding=False, unlimited_dims=None): From 012e85880a4d0a2dd7ca084cb1b8ef458e1e9de0 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Mon, 4 Dec 2017 12:45:56 -0500 Subject: [PATCH 45/68] new test for chunk encoding --- xarray/backends/zarr.py | 28 +++++++++++++------ xarray/core/dataset.py | 4 +++ xarray/tests/test_backends.py | 52 +++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 906f799615f..7da197437a1 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -206,13 +206,16 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # "If each worker in a parallel computation is writing to a separate # region of the array, and if region boundaries are perfectly aligned # with chunk boundaries, then no synchronization is required." + # TODO: incorporate synchronizer to allow writes from multiple dask + # threads if var_chunks and enc_chunks_tuple: for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks): for dchunk in dchunks: if dchunk % zchunk: - raise ValueError( + raise NotImplementedError( "Specified zarr chunks %r would overlap multiple dask " - "chunks %r. Consider rechunking the data using " + "chunks %r. This is not implemented in xarray yet. " + " Consider rechunking the data using " "`chunk()` or specifying different chunks in encoding." % (enc_chunks_tuple, var_chunks)) return enc_chunks_tuple @@ -443,12 +446,21 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, cache=False, drop_variables=None): - """Load and decode a dataset from a file or file-like object. + """Load and decode a dataset from a Zarr store. + + .. note:: Experimental + The Zarr backend is new and experimental. Please report any + unexpected behavior via github issues. + + The `store` object should be a valid store for a Zarr group. `store` + variables must contain dimension metadata encoded in the + `_ARRAY_DIMENSIONS` attribute. Parameters ---------- store : MutableMapping or str - Store or path to directory in file system. + A MutableMapping where a Zarr Group has been stored or a path to a + directory in file system where a Zarr DirectoryStore has been stored. mode : {‘r’, ‘r+’} Persistence mode: ‘r’ means read only (must exist); ‘r+’ means read/write (must exist) @@ -474,10 +486,6 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. 
- autoclose : bool, optional - If True, automatically close files to avoid OS Error of too many files - being open. However, this option doesn't work with streams, e.g., - BytesIO. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -506,6 +514,10 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, See Also -------- open_dataset + + References + ---------- + http://zarr.readthedocs.io/ """ if mode not in ['r', 'r+']: raise ValueError("Mode must be 'r' or 'r+'.") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 593bf9c7ee2..cd7d0003e4a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1135,6 +1135,10 @@ def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, encoding=None): """Write dataset contents to a zarr group. + .. note:: Experimental + The Zarr backend is new and experimental. Please report any + unexpected behavior via github issues. + Parameters ---------- store : MutableMapping or str, optional diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4b2ccdf77f9..6fbce766344 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1109,17 +1109,69 @@ def test_hidden_attrs(self): def test_chunk_encoding(self): + # These datasets have no dask chunks. All chunking specified in + # encoding data = create_test_data() chunks = (5, 5) data['var2'].encoding.update({'chunks': chunks}) + with self.roundtrip(data) as actual: self.assertEqual(chunks, actual['var2'].encoding['chunks']) + + # expect an error with non-integer chunks data['var2'].encoding.update({'chunks': (5, 4.5)}) with pytest.raises(TypeError): with self.roundtrip(data) as actual: pass + def test_chunk_encoding_with_dask(self): + # These datasets DO have dask chunks. Need to check for various + # interactions between dask and zarr chunks + ds = xr.DataArray((np.arange(12)), dims='x', name='var1').to_dataset() + + ## no encoding specified ## + + # zarr automatically gets chunk information from dask chunks + ds_chunk4 = ds.chunk({'x': 4}) + with self.roundtrip(ds_chunk4) as actual: + self.assertEqual((4,), actual['var1'].encoding['chunks']) + + # should fail if dask_chunks are irregular... + ds_chunk_irreg = ds.chunk({'x': (5, 4, 3)}) + with pytest.raises(ValueError): + with self.roundtrip(ds_chunk_irreg) as actual: + pass + + # ... 
except if the last chunk is smaller than the first + ds_chunk_irreg = ds.chunk({'x': (5, 5, 2)}) + with self.roundtrip(ds_chunk_irreg) as actual: + self.assertEqual((5,), actual['var1'].encoding['chunks']) + + ## encoding specified ## + + # specify compatible encodings + for chunk_enc in 4, (4, ): + ds_chunk4['var1'].encoding.update({'chunks': chunk_enc}) + with self.roundtrip(ds_chunk4) as actual: + self.assertEqual((4,), actual['var1'].encoding['chunks']) + + # specify incompatible encoding + ds_chunk4['var1'].encoding.update({'chunks': (5,5)}) + with pytest.raises(ValueError): + with self.roundtrip(ds_chunk4) as actual: + pass + + # TODO: remove this failure once synchronized overlapping writes are + # supported by xarray + ds_chunk4['var1'].encoding.update({'chunks': 5}) + with pytest.raises(NotImplementedError): + with self.roundtrip(ds_chunk4) as actual: + pass + + + + def test_vectorized_indexing(self): self._test_vectorized_indexing(vindex_support=True) From b1819f4e0327aa2c609b085329f4b4cf1d67a13b Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 15:40:11 -0500 Subject: [PATCH 46/68] added another test --- xarray/backends/zarr.py | 4 +-- xarray/tests/test_backends.py | 52 ++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 7da197437a1..9c1fee2bc13 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -232,7 +232,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): try: dimensions = zarr_obj.attrs[dimension_key] except KeyError: - raise KeyError("Zarr object is missing the attribute `%s`, which is" + raise KeyError("Zarr object is missing the attribute `%s`, which is " "required for xarray to determine variable dimensions." % (dimension_key)) attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) @@ -358,7 +358,7 @@ def set_dimension(self, name, length, is_unlimited=False): # consistency check if name in self.ds.attrs[_DIMENSION_KEY]: if self.ds.attrs[_DIMENSION_KEY][name] != length: - raise ValueError("Prexisting array dimensions %r " + raise ValueError("Pre-existing array dimensions %r " "encoded in Zarr attributes are incompatible " "with newly specified dimension `%s`: %g" % (self.ds.attrs[_DIMENSION_KEY], name, length)) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6fbce766344..dcb06e9d718 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1084,6 +1084,9 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): @requires_zarr class BaseZarrTest(CFEncodedDataTest): + + DIMENSION_KEY = '_ARRAY_DIMENSIONS' + def test_auto_chunk(self): original = create_test_data().chunk() @@ -1104,10 +1107,6 @@ def test_auto_chunk(self): self.assertEqual(v.chunks, original[k].chunks) - def test_hidden_attrs(self): - pass - - def test_chunk_encoding(self): # These datasets have no dask chunks. 
All chunking specified in # encoding @@ -1170,12 +1169,45 @@ def test_chunk_encoding_with_dask(self): pass - - def test_vectorized_indexing(self): self._test_vectorized_indexing(vindex_support=True) + def test_hidden_zarr_keys(self): + expected = create_test_data() + with self.create_store(mode='w') as store: + expected.dump_to_store(store) + zarr_group = store.ds + + # check that the global hidden attribute is present + assert self.DIMENSION_KEY in zarr_group.attrs + + # check that a variable hidden attribute is present and correct + # for some reason, one is a list and the other a tuple + for var in expected.variables.keys(): + assert (zarr_group[var].attrs[self.DIMENSION_KEY] + == list(expected[var].dims)) + + with xr.decode_cf(store) as actual: + # make sure it is hidden + assert self.DIMENSION_KEY not in actual.attrs + for var in expected.variables.keys(): + assert self.DIMENSION_KEY not in expected[var].attrs + + # verify that the dataset fails to open if dimension key is missing + del zarr_group.attrs[self.DIMENSION_KEY] + with pytest.raises(KeyError): + with xr.decode_cf(store) as actual: + pass + + # put it back and try something else + zarr_group.attrs[self.DIMENSION_KEY] = {} + del zarr_group.var2.attrs[self.DIMENSION_KEY] + with xr.decode_cf(store) as actual: + pass + + + # TODO: implement zarr object encoding and make these tests pass @pytest.mark.xfail(reason="Zarr object encoding not implemented") def test_multiindex_not_implemented(self): @@ -1204,8 +1236,8 @@ def test_dataset_caching(self): @requires_zarr class ZarrDictStoreTest(BaseZarrTest, TestCase): @contextlib.contextmanager - def create_store(self): - yield backends.ZarrStore(store={}) + def create_store(self, **open_kwargs): + yield backends.ZarrStore(store={}, **open_kwargs) @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, @@ -1218,9 +1250,9 @@ def roundtrip(self, data, save_kwargs={}, open_kwargs={}, @requires_zarr class ZarrDirectoryStoreTest(BaseZarrTest, TestCase): @contextlib.contextmanager - def create_store(self): + def create_store(self, **open_kwargs): with create_tmp_file(suffix='.zarr') as tmp: - yield backends.ZarrStore(store=tmp) + yield backends.ZarrStore(store=tmp, **open_kwargs) @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, From 8eb98c91b52bc9cae6c78ce2f8d514283878fca1 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 20:40:57 -0500 Subject: [PATCH 47/68] tests for HiddenKeyDict --- xarray/backends/zarr.py | 2 +- xarray/core/utils.py | 6 +++--- xarray/tests/test_backends.py | 7 ++++--- xarray/tests/test_utils.py | 15 +++++++++++++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 9c1fee2bc13..4ca36fefa78 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -235,7 +235,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): raise KeyError("Zarr object is missing the attribute `%s`, which is " "required for xarray to determine variable dimensions." % (dimension_key)) - attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) + attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key]) return dimensions, attributes diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 0a988d7a473..7c41c633a52 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -542,13 +542,13 @@ class HiddenKeyDict(MutableMapping): # ``__init__`` method required to create instance from class. 
def __init__(self, data, hidden_keys): self._data = data - if type(hidden_keys) is not list: - hidden_keys = [ hidden_keys ] + if type(hidden_keys) not in (list, tuple): + raise ValueError("hidden_keys must be a list or tuple") self._hidden_keys = hidden_keys def _raise_if_hidden(self, key): if key in self._hidden_keys: - raise KeyError('Key is hidden.') + raise KeyError('Key `%r` is hidden.' % key) # The next five methods are requirements of the ABC. def __setitem__(self, key, value): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index dcb06e9d718..5e5b8bfe86f 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1200,11 +1200,12 @@ def test_hidden_zarr_keys(self): with xr.decode_cf(store) as actual: pass - # put it back and try something else + # put it back and try removing from a variable zarr_group.attrs[self.DIMENSION_KEY] = {} del zarr_group.var2.attrs[self.DIMENSION_KEY] - with xr.decode_cf(store) as actual: - pass + with pytest.raises(KeyError): + with xr.decode_cf(store) as actual: + pass diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index c1df1da8c86..b2796f59189 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -189,3 +189,18 @@ def test_dask_array_is_scalar(): y = da.arange(8, chunks=4) assert not utils.is_scalar(y) + + +def test_hidden_key_dict(): + hidden_key = '_hidden_key' + data = {'a': 1, 'b': 2, hidden_key: 3} + data_expected = {'a': 1, 'b': 2} + hkd = utils.HiddenKeyDict(data, [hidden_key]) + assert len(hkd) == 2 + assert hidden_key not in hkd + for k, v in data_expected.items(): + assert hkd[k] == v + with pytest.raises(KeyError): + _ = hkd[hidden_key] + with pytest.raises(KeyError): + del hkd[hidden_key] From 64bd76c46bc7bc0755a4a9e93be6ae10707129c3 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 21:10:57 -0500 Subject: [PATCH 48/68] flake8 --- xarray/backends/zarr.py | 27 +++++++++++++-------------- xarray/core/dataset.py | 4 ++-- xarray/tests/test_backends.py | 18 +++++------------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 4ca36fefa78..e84a7efd33a 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -1,7 +1,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import functools from itertools import product from base64 import b64encode @@ -12,8 +11,7 @@ from ..core.utils import FrozenOrderedDict, HiddenKeyDict from ..core.pycompat import iteritems, OrderedDict, integer_types -from .common import (AbstractWritableDataStore, DataStorePickleMixin, - BackendArray) +from .common import (AbstractWritableDataStore, BackendArray) from .. import conventions @@ -160,9 +158,9 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): for this_chunk in all_var_chunks[:-1]: if this_chunk != first_var_chunk: raise ValueError( - "Zarr requires uniform chunk sizes except for final chunk. " - "Variable %r has incompatible chunks. Consider rechunking " - "using `chunk()`." % var_chunks) + "Zarr requires uniform chunk sizes except for final chunk." + " Variable %r has incompatible chunks. Consider " + "rechunking using `chunk()`." 
% var_chunks) # last chunk is allowed to be smaller last_var_chunk = all_var_chunks[-1] for len_first, len_last in zip(first_var_chunk, last_var_chunk): @@ -192,7 +190,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): for x in enc_chunks_tuple: if not isinstance(x, int): raise TypeError("zarr chunks must be an int or a tuple of ints. " - "Instead found %r" % (enc_chunks_tuple,)) + "Instead found %r" % (enc_chunks_tuple,)) # if there are chunks in encoding and the variable data is a numpy array, # we use the specified chunks @@ -264,6 +262,7 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): return encoding + # Function below is copied from conventions.encode_cf_variable. # The only change is to raise an error for object dtypes. def encode_zarr_variable(var, needs_copy=True, name=None): @@ -288,13 +287,14 @@ def encode_zarr_variable(var, needs_copy=True, name=None): """ if var.dtype.kind == 'O': - raise NotImplementedError("Variable `%s` is an object. " - "Zarr store can't yet encode objects." % name) + raise NotImplementedError("Variable `%s` is an object. Zarr " + "store can't yet encode objects." % name) var = conventions.maybe_encode_datetime(var, name=name) var = conventions.maybe_encode_timedelta(var, name=name) var, needs_copy = conventions.maybe_encode_offset_and_scale(var, - needs_copy, name=name) + needs_copy, + name=name) var, needs_copy = conventions.maybe_encode_fill_value(var, needs_copy, name=name) var = conventions.maybe_encode_nonstring_dtype(var, name=name) @@ -359,9 +359,9 @@ def set_dimension(self, name, length, is_unlimited=False): if name in self.ds.attrs[_DIMENSION_KEY]: if self.ds.attrs[_DIMENSION_KEY][name] != length: raise ValueError("Pre-existing array dimensions %r " - "encoded in Zarr attributes are incompatible " - "with newly specified dimension `%s`: %g" % - (self.ds.attrs[_DIMENSION_KEY], name, length)) + "encoded in Zarr attributes are incompatible " + "with newly specified dimension `%s`: %g" % + (self.ds.attrs[_DIMENSION_KEY], name, length)) self.ds.attrs[_DIMENSION_KEY][name] = length def set_attribute(self, key, value): @@ -411,7 +411,6 @@ def prepare_variable(self, name, variable, check_encoding=False, return zarr_array, variable.data - def store(self, variables, attributes, *args, **kwargs): new_vars = OrderedDict((k, encode_zarr_variable(v, name=k)) for k, v in iteritems(variables)) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index cd7d0003e4a..3c786abd7f4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1086,7 +1086,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, Write ('w') or append ('a') mode. If mode='w', any existing file at this location will be overwritten. If mode='a', existing variables will be overwritten. 
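The object-dtype guard tidied above can be exercised with a tiny sketch (the dataset contents are hypothetical)::

    import numpy as np
    import xarray as xr
    ds = xr.Dataset({'var1': ('x', np.array(['a', 1], dtype='O'))})
    # writing this dataset is expected to raise NotImplementedError:
    # "Variable `var1` is an object. Zarr store can't yet encode objects."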
- format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional + format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT','NETCDF3_CLASSIC'}, optional File format for the resulting netCDF file: * NETCDF4: Data is stored in an HDF5 file, using netCDF4 API @@ -1161,7 +1161,7 @@ def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, encoding = {} from ..backends.api import to_zarr return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer, - group=group, encoding=encoding) + group=group, encoding=encoding) def __unicode__(self): return formatting.dataset_repr(self) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5e5b8bfe86f..65afd5abe59 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -33,7 +33,7 @@ from .test_dataset import create_test_data -from xarray.tests import mock, assert_identical +from xarray.tests import mock try: import netCDF4 as nc4 @@ -1106,7 +1106,6 @@ def test_auto_chunk(self): # chunk size should be the same as original self.assertEqual(v.chunks, original[k].chunks) - def test_chunk_encoding(self): # These datasets have no dask chunks. All chunking specified in # encoding @@ -1123,14 +1122,12 @@ def test_chunk_encoding(self): with self.roundtrip(data) as actual: pass - def test_chunk_encoding_with_dask(self): # These datasets DO have dask chunks. Need to check for various # interactions between dask and zarr chunks ds = xr.DataArray((np.arange(12)), dims='x', name='var1').to_dataset() - ## no encoding specified ## - + # - no encoding specified - # zarr automatically gets chunk information from dask chunks ds_chunk4 = ds.chunk({'x': 4}) with self.roundtrip(ds_chunk4) as actual: @@ -1147,8 +1144,7 @@ def test_chunk_encoding_with_dask(self): with self.roundtrip(ds_chunk_irreg) as actual: self.assertEqual((5,), actual['var1'].encoding['chunks']) - ## encoding specified ## - + # - encoding specified - # specify compatible encodings for chunk_enc in 4, (4, ): ds_chunk4['var1'].encoding.update({'chunks': chunk_enc}) @@ -1156,7 +1152,7 @@ def test_chunk_encoding_with_dask(self): self.assertEqual((4,), actual['var1'].encoding['chunks']) # specify incompatible encoding - ds_chunk4['var1'].encoding.update({'chunks': (5,5)}) + ds_chunk4['var1'].encoding.update({'chunks': (5, 5)}) with pytest.raises(ValueError): with self.roundtrip(ds_chunk4) as actual: pass @@ -1168,11 +1164,9 @@ def test_chunk_encoding_with_dask(self): with self.roundtrip(ds_chunk4) as actual: pass - def test_vectorized_indexing(self): self._test_vectorized_indexing(vindex_support=True) - def test_hidden_zarr_keys(self): expected = create_test_data() with self.create_store(mode='w') as store: @@ -1207,8 +1201,6 @@ def test_hidden_zarr_keys(self): with xr.decode_cf(store) as actual: pass - - # TODO: implement zarr object encoding and make these tests pass @pytest.mark.xfail(reason="Zarr object encoding not implemented") def test_multiindex_not_implemented(self): @@ -1332,7 +1324,7 @@ def create_store(self): def test_array_attrs(self): ds = Dataset(attrs={'foo': [[1, 2], [3, 4]]}) with raises_regex(ValueError, 'must be 1-dimensional'): - with self.roundtrip(ds) as roundtripped: + with self.roundtrip(ds): pass def test_roundtrip_example_1_netcdf_gz(self): From 3b4a941a85f4c8f770e05c21e4a4128a7ca438d8 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 21:20:35 -0500 Subject: [PATCH 49/68] zarr version update --- xarray/tests/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
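The minimum-version gate added below reduces, conceptually, to a comparison like this (a simplified sketch, not the exact body of the ``_importorskip`` helper)::

    import zarr
    from distutils.version import LooseVersion
    has_zarr = LooseVersion(zarr.__version__) >= LooseVersion('2.2.0')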
diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 52a19e90ca8..235c6e9e410 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -71,7 +71,7 @@ def _importorskip(modname, minversion=None): has_bottleneck, requires_bottleneck = _importorskip('bottleneck') has_rasterio, requires_rasterio = _importorskip('rasterio') has_pathlib, requires_pathlib = _importorskip('pathlib') -has_zarr, requires_zarr = _importorskip('zarr') +has_zarr, requires_zarr = _importorskip('zarr', minversion='2.2.0') # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 From 688f415d446d608fe5557422b3aba0aaa48c9a90 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 23:03:53 -0500 Subject: [PATCH 50/68] added more tests --- xarray/backends/zarr.py | 1 + xarray/core/dataset.py | 14 ++++--- xarray/tests/test_backends.py | 77 ++++++++++++++++++++++++++--------- 3 files changed, 66 insertions(+), 26 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e84a7efd33a..80baf813291 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -518,6 +518,7 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, ---------- http://zarr.readthedocs.io/ """ + # note: there is no way to actually use 'r+' yet if mode not in ['r', 'r+']: raise ValueError("Mode must be 'r' or 'r+'.") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3c786abd7f4..ea89b93a17e 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1131,7 +1131,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, engine=engine, encoding=encoding, unlimited_dims=unlimited_dims) - def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, + def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, encoding=None): """Write dataset contents to a zarr group. @@ -1143,11 +1143,9 @@ def to_zarr(self, store=None, mode='a', synchronizer=None, group=None, ---------- store : MutableMapping or str, optional Store or path to directory in file system. - mode : {‘r’, ‘r+’, ‘a’, ‘w’, ‘w-‘} - Persistence mode: ‘r’ means read only (must exist); ‘r+’ means - read/write (must exist); ‘a’ means read/write (create if doesn’t - exist); ‘w’ means create (overwrite if exists); ‘w-‘ means create - (fail if exists). + mode : {‘w’, ‘w-‘} + Persistence mode: ‘‘w’ means create (overwrite if exists); + ‘w-‘ means create (fail if exists). 
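A minimal sketch of the two write modes described above (the store path is hypothetical)::

    ds.to_zarr('out.zarr', mode='w')   # create, overwriting any existing store
    ds.to_zarr('out.zarr', mode='w-')  # create, raising an error if the store exists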
synchronizer : object, optional Array synchronizer group : str, optional Group path. (a.k.a. `path` in zarr terminology.) @@ -1159,6 +1157,10 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, """ if encoding is None: encoding = {} + if mode not in ['w', 'w-']: + # TODO: figure out how to handle ‘r+’ and ‘a’ + raise ValueError("The only supported options for mode are 'w' " + "and 'w-'.") from ..backends.api import to_zarr return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer, group=group, encoding=encoding) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 65afd5abe59..d25634eda86 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1087,6 +1087,18 @@ class BaseZarrTest(CFEncodedDataTest): DIMENSION_KEY = '_ARRAY_DIMENSIONS' + @contextlib.contextmanager + def create_store(self, **open_kwargs): + with self.create_zarr_target() as store_target: + yield backends.ZarrStore(store=store_target, **open_kwargs) + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + with self.create_zarr_target() as store_target: + data.to_zarr(store=store_target, **save_kwargs) + yield xr.open_zarr(store_target, **open_kwargs) + def test_auto_chunk(self): original = create_test_data().chunk() @@ -1201,6 +1213,47 @@ def test_hidden_zarr_keys(self): with xr.decode_cf(store) as actual: pass + def test_write_persistence_modes(self): + original = create_test_data() + + # overwrite mode + with self.roundtrip(original, save_kwargs={'mode': 'w'}) as actual: + self.assertDatasetIdentical(original, actual) + + # don't overwrite mode + with self.roundtrip(original, save_kwargs={'mode': 'w-'}) as actual: + self.assertDatasetIdentical(original, actual) + + # make sure overwriting works as expected + with self.create_zarr_target() as store: + original.to_zarr(store) + # should overwrite with no error + original.to_zarr(store, mode='w') + actual = xr.open_zarr(store) + self.assertDatasetIdentical(original, actual) + with pytest.raises(ValueError): + xr.open_zarr(store, mode='w-') + + # check that we can't use other persistence modes + # TODO: reconsider whether other persistence modes should be supported + with pytest.raises(ValueError): + with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual: + pass + + def test_read_persistence_modes(self): + original = create_test_data() + + with self.roundtrip(original, open_kwargs={'mode': 'r'}) as actual: + self.assertDatasetIdentical(original, actual) + + # TODO: actually do something with 'r+' that is different from 'r' + with self.roundtrip(original, open_kwargs={'mode': 'r+'}) as actual: + self.assertDatasetIdentical(original, actual) + + with pytest.raises(ValueError): + with self.roundtrip(original, open_kwargs={'mode': 'w'}) as actual: + pass + # TODO: implement zarr object encoding and make these tests pass @pytest.mark.xfail(reason="Zarr object encoding not implemented") def test_multiindex_not_implemented(self): @@ -1229,8 +1282,8 @@ def test_dataset_caching(self): @requires_zarr class ZarrDictStoreTest(BaseZarrTest, TestCase): @contextlib.contextmanager - def create_store(self, **open_kwargs): - yield backends.ZarrStore(store={}, **open_kwargs) - - @contextlib.contextmanager - def roundtrip(self, data, save_kwargs={}, open_kwargs={}, - allow_cleanup_failure=False): - dict_store = {} - data.to_zarr(store=dict_store, **save_kwargs) - yield xr.open_zarr(dict_store, **open_kwargs) + def create_zarr_target(self): + yield {} @requires_zarr class 
ZarrDirectoryStoreTest(BaseZarrTest, TestCase): @contextlib.contextmanager - def create_store(self, **open_kwargs): + def create_zarr_target(self): with create_tmp_file(suffix='.zarr') as tmp: - yield backends.ZarrStore(store=tmp, **open_kwargs) - - @contextlib.contextmanager - def roundtrip(self, data, save_kwargs={}, open_kwargs={}, - allow_cleanup_failure=False): - with create_tmp_file( - suffix='.zarr', - allow_cleanup_failure=allow_cleanup_failure) as tmp_file: - data.to_zarr(store=tmp_file, **save_kwargs) - yield xr.open_zarr(tmp_file, **open_kwargs) + yield tmp @requires_scipy From c115a2b395413bfb02504989618c343696c8e528 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Tue, 5 Dec 2017 23:31:43 -0500 Subject: [PATCH 51/68] added compressor test --- xarray/tests/test_backends.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d25634eda86..7c54cffab51 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1254,6 +1254,15 @@ def test_read_persistence_modes(self): with self.roundtrip(original, open_kwargs={'mode': 'w'}) as actual: pass + def test_compressor_encoding(self): + original = create_test_data() + # specify a custom compressor + import zarr + blosc_comp = zarr.Blosc(cname='zstd', clevel=3, shuffle=2) + save_kwargs = dict(encoding={'var1': {'compressor': blosc_comp}}) + with self.roundtrip(original, save_kwargs=save_kwargs) as actual: + assert actual.var1.encoding['compressor'] == blosc_comp + # TODO: implement zarr object encoding and make these tests pass @pytest.mark.xfail(reason="Zarr object encoding not implemented") def test_multiindex_not_implemented(self): From 4c925312dc1982c5d431b3859871fdfa43745750 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 00:11:27 -0500 Subject: [PATCH 52/68] docs --- doc/conf.py | 2 +- doc/environment.yml | 1 + doc/io.rst | 90 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 3 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 5a62cb59733..8a6d5ae4c4d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -23,7 +23,7 @@ print("python exec:", sys.executable) print("sys.path:", sys.path) for name in ('numpy scipy pandas matplotlib dask IPython seaborn ' - 'cartopy netCDF4 rasterio').split(): + 'cartopy netCDF4 rasterio zarr').split(): try: module = importlib.import_module(name) if name == 'matplotlib': diff --git a/doc/environment.yml b/doc/environment.yml index ae3ddb81719..45fa6417e16 100644 --- a/doc/environment.yml +++ b/doc/environment.yml @@ -16,3 +16,4 @@ dependencies: - cartopy=0.15.1 - rasterio=0.36.0 - sphinx-gallery + - zarr diff --git a/doc/io.rst b/doc/io.rst index 2b9b404569a..ffb8e77b5b9 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -295,7 +295,7 @@ string encoding for character arrays in netCDF files was Technically, you can use `any string encoding recognized by Python `_ if you feel the need to deviate from UTF-8, by setting the ``_Encoding`` field in ``encoding``. But -`we don't recommend it`_. +`we don't recommend it `_. .. warning:: @@ -502,16 +502,102 @@ longitudes and latitudes. .. _test files: https://github.com/mapbox/rasterio/blob/master/tests/data/RGB.byte.tif .. _pyproj: https://github.com/jswhit/pyproj +.. _io.zarr: + Zarr ---- `Zarr`_ is a Python package providing an implementation of chunked, compressed, N-dimensional arrays. 
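A zarr group is ultimately just a mapping from string keys to bytes, which is what makes such a range of storage targets possible; a minimal in-memory sketch::

    import zarr
    store = {}                     # any MutableMapping can back a group
    root = zarr.group(store=store)
    root.attrs['title'] = 'demo'
    sorted(store)                  # ['.zattrs', '.zgroup']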
+Zarr has the ability to store arrays in a range of ways, including in memory, +in files, and in cloud-based object storage such as `Amazon S3`_ and +`Google Cloud Storage`_. +Xarray's Zarr backend allows xarray to leverage these capabilities. + +.. warning:: + + Zarr support is still an experimental feature. Please report any bugs or + unexpected behavior via github issues. + +Xarray can't open just any zarr dataset, because xarray requires special +metadata (attributes) describing the dataset dimensions and coordinates. +At this time, xarray can only open zarr datasets that have been written by +xarray. To write a dataset to zarr using files, we use the +:py:attr:`Dataset.to_netcdf ` method. +To write to a local directory, we pass a path to a directory + +.. ipython:: python + :suppress: + + ! rm -rf path/to/directory.zarr + +.. ipython:: python + + ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))}, + coords={'x': [10, 20, 30, 40], + 'y': pd.date_range('2000-01-01', periods=5), + 'z': ('x', list('abcd'))}) + ds.to_zarr('path/to/directory.zarr') + +(The suffix ``.zarr`` is optional--just a reminder that a zarr store lives +there.) If the directory does not exist, it will be created. If a zarr +store is already present at that path an error will be raised, preventing it +from being overwritten. To override this behavior and overwrite an existing +store, add ``mode='w'`` when invoking ``to_zarr``. -TODO: fill in these docs +To read back a zarr dataset that has been created this way, we use the +py:func:`~xarray.open_zarr` method: + +.. ipython:: python + + ds_zarr = xr.open_zarr('path/to/directory.zarr') + ds_zarr + +Cloud Storage Buckets +~~~~~~~~~~~~~~~~~~~~~ + +It is possible to read and write xarray datasets directly from / to cloud +storage buckets using zarr. This example uses the `gcsfs`_ package to provide +a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then +pass to xarray:: + + import gcsfs + fs = gcsfs.GCSFileSystem(project='', token=None) + gcsmap = gcsfs.mapping.GCSMap('', gcs=fs, check=True, create=False) + # write to the bucket + ds.to_zarr(store=gcsmap) + # read it back + ds_gcs = xr.open_zarr(gcsmap, mode='r') .. _Zarr: http://zarr.readthedocs.io/ +.. _Amazon S3: https://aws.amazon.com/s3/ +.. _Google Cloud Storage: https://cloud.google.com/storage/ +.. _gcsfs: https://github.com/dask/gcsfs + +Zarr Compressors and Filters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are many different options for compression and filtering possible with +zarr. These are described in the +`zarr documentation `_. +These options can be passed to the ``to_zarr`` method as variable encoding. +For example: + +.. ipython:: python + :suppress: + + ! rm -rf foo.zarr + +.. ipython:: python + + import zarr + compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2) + ds.to_zarr('foo.zarr', encoding={'foo': {'compressor': compressor}}) + +.. note:: + Not all native zarr compression and filtering options have been tested with + xarray. .. 
_io.pynio: From 61027ebcefe665aa180863ef618a36ed278be2f4 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 00:17:49 -0500 Subject: [PATCH 53/68] weird ascii character issue --- xarray/core/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ea89b93a17e..7c4d3ad1957 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1144,8 +1144,8 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, store : MutableMapping or str, optional Store or path to directory in file system. mode : {‘w’, ‘w-‘} - Persistence mode: ‘‘w’ means create (overwrite if exists); - ‘w-‘ means create (fail if exists). + Persistence mode: 'w' means create (overwrite if exists); + 'w-' means create (fail if exists). synchronizer : object, optional Array synchronizer group : str, obtional From bbaa7762286017e63abab5f7a3eb789d1ab85609 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 00:24:14 -0500 Subject: [PATCH 54/68] doc fixes --- doc/io.rst | 4 ++-- xarray/core/dataset.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/io.rst b/doc/io.rst index ffb8e77b5b9..dbf9e0b2123 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -523,7 +523,7 @@ Xarray can't open just any zarr dataset, because xarray requires special metadata (attributes) describing the dataset dimensions and coordinates. At this time, xarray can only open zarr datasets that have been written by xarray. To write a dataset to zarr using files, we use the -:py:attr:`Dataset.to_netcdf ` method. +:py:attr:`Dataset.to_zarr ` method. To write to a local directory, we pass a path to a directory .. ipython:: python @@ -546,7 +546,7 @@ from being overwritten. To override this behavior and overwrite and existing store, add ``mode='w'`` when invoking ``to_zarr``. To read back a zarr dataset that has been created this way, we use the -py:func:`~xarray.open_zarr` method: +:py:func:`~xarray.open_zarr` method: .. ipython:: python diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 7c4d3ad1957..266b2d339ff 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1143,7 +1143,7 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, ---------- store : MutableMapping or str, optional Store or path to directory in file system. - mode : {‘w’, ‘w-‘} + mode : {'w', 'w-'} Persistence mode: 'w' means create (overwrite if exists); 'w-' means create (fail if exists). synchronizer : object, optional From c8f23a5085d739af06b7292accf8dffae8477dbd Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 00:28:04 -0500 Subject: [PATCH 55/68] what's new --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 899175af45f..dbf9967ffe3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,10 @@ Enhancements - :py:func:`~plot.contourf()` learned to contour 2D variables that have both a 1D co-ordinate (e.g. time) and a 2D co-ordinate (e.g. depth as a function of time). By `Deepak Cherian `_. +- Support for using `Zarr`_ as storage layer for xarray. + By `Ryan Abernathey `_. + +.. 
_Zarr: http://zarr.readthedocs.io/ Bug fixes ~~~~~~~~~ From f0c76f767b093129d1ee93bac6b92e6f8c81cb85 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 00:34:31 -0500 Subject: [PATCH 56/68] more file encoding nightmares --- xarray/core/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 266b2d339ff..546dd03cf18 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1158,7 +1158,7 @@ def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, if encoding is None: encoding = {} if mode not in ['w', 'w-']: - # TODO: figure out how to handle ‘r+’ and ‘a’ + # TODO: figure out how to handle 'r+' and 'a' raise ValueError("The only supported options for mode are 'w' " "and 'w-'.") from ..backends.api import to_zarr From a84e3887e0e06e4b091f4ad0c98695801154d0c5 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 5 Dec 2017 22:26:12 -0800 Subject: [PATCH 57/68] Tests for backends.zarr._replace_slices_with_arrays (and misc. small cleanup, adding docstrings + a bit more validation logic) --- xarray/backends/zarr.py | 14 ++++++++------ xarray/core/indexing.py | 14 +++++++++++--- xarray/core/variable.py | 5 +++++ xarray/tests/test_backends.py | 21 +++++++++++++++++++++ xarray/tests/test_indexing.py | 3 +++ 5 files changed, 48 insertions(+), 9 deletions(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 80baf813291..d236eb1cd06 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -57,24 +57,26 @@ def _decode_zarr_attrs(attrs): for k, v in attrs.items()]) -# Do the appropriate shape munging to make slices -# appear as the last axes of the result array -# TODO: write tests for this def _replace_slices_with_arrays(key, shape): + """Replace slice objects in vindex with equivalent ndarray objects.""" num_slices = sum(1 for k in key if isinstance(k, slice)) - num_arrays = len(shape) - num_slices + array_subspace_size = max( + (k.ndim for k in key if isinstance(k, np.ndarray)), default=0) + assert len(key) == len(shape) new_key = [] slice_count = 0 for k, size in zip(key, shape): if isinstance(k, slice): + # the slice subspace always appears after the ndarray subspace array = np.arange(*k.indices(size)) sl = [np.newaxis] * len(shape) - sl[num_arrays + slice_count] = np.newaxis + sl[array_subspace_size + slice_count] = slice(None) k = array[tuple(sl)] slice_count += 1 else: assert isinstance(k, np.ndarray) - k = k[(slice(None),) * num_arrays + (np.newaxis,) * num_slices] + k = k[(slice(None),) * array_subspace_size + + (np.newaxis,) * num_slices] new_key.append(k) return tuple(new_key) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 3aea8ca6b8a..f1bbe202fa8 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -376,9 +376,9 @@ class VectorizedIndexer(ExplicitIndexer): """Tuple for vectorized indexing. All elements should be slice or N-dimensional np.ndarray objects with an - integer dtype. Indexing follows proposed rules for np.ndarray.vindex, which - matches NumPy's advanced indexing rules (including broadcasting) except - sliced axes are always moved to the end: + integer dtype and the same number of dimensions. 
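The rewrite implemented above leans on a NumPy equivalence that can be sketched directly: a slice along one axis can be replaced by a broadcastable ``arange`` without changing the selected values::

    import numpy as np
    x = np.arange(50).reshape(5, 10)
    idx = np.array([0, 2, 4])
    a = x[idx, :]                                  # mixed array + slice
    b = x[idx[:, np.newaxis],
          np.arange(10)[np.newaxis, :]]            # arrays only
    np.testing.assert_array_equal(a, b)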
Indexing follows proposed + rules for np.ndarray.vindex, which matches NumPy's advanced indexing rules + (including broadcasting) except sliced axes are always moved to the end: https://github.com/numpy/numpy/pull/6256 """ def __init__(self, key): @@ -386,6 +386,7 @@ def __init__(self, key): raise TypeError('key must be a tuple: {!r}'.format(key)) new_key = [] + ndim = None for k in key: if isinstance(k, slice): k = as_integer_slice(k) @@ -393,6 +394,13 @@ def __init__(self, key): if not np.issubdtype(k.dtype, np.integer): raise TypeError('invalid indexer array, does not have ' 'integer dtype: {!r}'.format(k)) + if ndim is None: + ndim = k.ndim + elif ndim != k.ndim: + ndims = [k.ndim for k in key if isinstance(k, np.ndarray)] + raise ValueError('invalid indexer key: ndarray arguments ' + 'have different numbers of dimensions: {}' + .format(ndims)) k = np.asarray(k, dtype=np.int64) else: raise TypeError('unexpected indexer type for {}: {!r}' diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 53b4bf60c5c..6a8127ce115 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1613,6 +1613,11 @@ def _unified_dims(variables): def _broadcast_compat_variables(*variables): + """Create broadcast compatible variables, with the same dimensions. + + Unlike the result of broadcast_variables(), some variables may have + dimensions of size 1 instead of the size of the broadcast dimension. + """ dims = tuple(_unified_dims(variables)) return tuple(var.set_dims(dims) if var.dims != dims else var for var in variables) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7c54cffab51..6ec6b5a20a8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1303,6 +1303,27 @@ def create_zarr_target(self): yield tmp +def test_replace_slices_with_arrays(): + (actual,) = xr.backends.zarr._replace_slices_with_arrays( + key=(slice(None),), shape=(5,)) + np.testing.assert_array_equal(actual, np.arange(5)) + + actual = xr.backends.zarr._replace_slices_with_arrays( + key=(np.arange(5),) * 3, shape=(8, 10, 12)) + expected = np.stack([np.arange(5)] * 3) + np.testing.assert_array_equal(np.stack(actual), expected) + + a, b = xr.backends.zarr._replace_slices_with_arrays( + key=(np.arange(5), slice(None)), shape=(8, 10)) + np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) + np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) + + a, b = xr.backends.zarr._replace_slices_with_arrays( + key=(slice(None), np.arange(5)), shape=(8, 10)) + np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) + np.testing.assert_array_equal(b, np.arange(5)[:, np.newaxis]) + + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 28fecdb4827..590492414b9 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -328,6 +328,9 @@ def test_vectorized_indexer(): check_slice(indexing.VectorizedIndexer) check_array1d(indexing.VectorizedIndexer) check_array2d(indexing.VectorizedIndexer) + with raises_regex(ValueError, 'numbers of dimensions'): + indexing.VectorizedIndexer((np.array(1, dtype=np.int64), + np.arange(5, dtype=np.int64))) def test_unwrap_explicit_indexer(): From 37bc2f07a00ba2099ad56ef64dcabac016a14dbf Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Wed, 6 Dec 2017 10:46:21 -0500 Subject: [PATCH 58/68] respond to @shoyer's review --- xarray/backends/api.py | 6 +-- 
xarray/backends/zarr.py | 71 ++++++++++------------------------- xarray/core/utils.py | 2 +- xarray/tests/test_backends.py | 9 +++-- xarray/tests/test_utils.py | 2 +- 5 files changed, 29 insertions(+), 61 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 415689b2ef2..20c0f180a7c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -731,9 +731,9 @@ def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, _validate_dataset_names(dataset) _validate_attrs(dataset) - store = backends.ZarrStore(store=store, mode=mode, - synchronizer=synchronizer, group=group, - writer=None) + store = backends.ZarrStore.open_group(store=store, mode=mode, + synchronizer=synchronizer, + group=group, writer=None) # I think zarr stores should always be sync'd immediately # TODO: figure out how to properly handle unlimited_dims diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d236eb1cd06..b4a530b3d9f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -10,14 +10,9 @@ from ..core import indexing from ..core.utils import FrozenOrderedDict, HiddenKeyDict from ..core.pycompat import iteritems, OrderedDict, integer_types - from .common import (AbstractWritableDataStore, BackendArray) - from .. import conventions -# this is a private method but we need it for open_zar -from .api import _protect_dataset_variables_inplace - # need some special secret attributes to tell us the dimensions _DIMENSION_KEY = '_ARRAY_DIMENSIONS' @@ -109,30 +104,6 @@ def __getitem__(self, key): # could possibly have a work-around for 0d data here -def _open_zarr_group(store=None, overwrite=None, synchronizer=None, - group=None, mode=None): - '''Wrap zarr.open_group''' - - import zarr - zarr_group = zarr.open_group(store=store, mode=mode, - synchronizer=synchronizer, path=group) - return zarr_group - - -def _dask_chunks_to_zarr_chunks(chunks): - '''this function translates dask chunk syntax to zarr chunk syntax''' - if chunks is None: - return chunks - - all_chunks = product(*chunks) - first_chunk = next(all_chunks) - for this_chunk in all_chunks: - if this_chunk != first_chunk: - raise ValueError("zarr requires uniform chunk sizes, found %r" % - chunks) - return first_chunk - - def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): """ Given encoding chunks (possibly None) and variable chunks (possibly None) @@ -179,7 +150,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): # Here we re-implement this expansion ourselves. 
That makes the logic of # checking chunk compatibility easier - if type(enc_chunks) in integer_types: + if isinstance(enc_chunks, integer_types): enc_chunks_tuple = ndim * (enc_chunks,) else: enc_chunks_tuple = tuple(enc_chunks) @@ -258,10 +229,6 @@ def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): chunks = _determine_zarr_chunks(encoding.get('chunks'), variable.chunks, variable.ndim) encoding['chunks'] = chunks - - # TODO: figure out how to serialize compressor and filters options - # in zarr these are python objects, not strings - return encoding @@ -311,13 +278,21 @@ class ZarrStore(AbstractWritableDataStore): """Store for reading and writing data via zarr """ - def __init__(self, store=None, mode='a', synchronizer=None, group=None, + @classmethod + def open_group(cls, store, mode='r', synchronizer=None, group=None, + writer=None): + import zarr + zarr_group = zarr.open_group(store=store, mode=mode, + synchronizer=synchronizer, path=group) + return cls(zarr_group, mode=mode, synchronizer=synchronizer, + group=group, writer=writer) + + def __init__(self, zarr_group, mode='r', synchronizer=None, group=None, writer=None): + self.ds = zarr_group self._mode = mode self._synchronizer = synchronizer self._group = group - self.ds = _open_zarr_group(store=store, mode=mode, - synchronizer=synchronizer, group=group) # initialize hidden dimension attribute if _DIMENSION_KEY not in self.ds.attrs: @@ -446,7 +421,7 @@ def store(self, variables, attributes, *args, **kwargs): def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, - cache=False, drop_variables=None): + drop_variables=None): """Load and decode a dataset from a Zarr store. .. note:: Experimental @@ -462,8 +437,8 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, store : MutableMapping or str A MutableMapping where a Zarr Group has been stored or a path to a directory in file system where a Zarr DirectoryStore has been stored. - mode : {‘r’, ‘r+’} - Persistence mode: ‘r’ means read only (must exist); ‘r+’ means + mode : {'r', 'r+'} + Persistence mode: 'r' means read only (must exist); 'r+' means read/write (must exist) synchronizer : object, optional Array synchronizer @@ -495,13 +470,6 @@ def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, decode_coords : bool, optional If True, decode the 'coordinates' attribute to identify coordinates in the resulting dataset. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: string or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or @@ -536,14 +504,13 @@ def maybe_decode_store(store, lock=False): concat_characters=concat_characters, decode_coords=decode_coords, drop_variables=drop_variables) - # this is how we would apply caching - # but do we want it for zarr stores? 
- _protect_dataset_variables_inplace(ds, cache) + # TODO: this is where we would apply caching return ds - zarr_store = ZarrStore(store=store, mode=mode, synchronizer=synchronizer, - group=group) + zarr_store = ZarrStore.open_group(store, mode=mode, + synchronizer=synchronizer, + group=group) ds = maybe_decode_store(zarr_store) # auto chunking needs to be here and not in ZarrStore because variable diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 7c41c633a52..46fea23b9ff 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -543,7 +543,7 @@ class HiddenKeyDict(MutableMapping): def __init__(self, data, hidden_keys): self._data = data if type(hidden_keys) not in (list, tuple): - raise ValueError("hidden_keys must be a list or tuple") + raise TypeError("hidden_keys must be a list or tuple") self._hidden_keys = hidden_keys def _raise_if_hidden(self, key): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6ec6b5a20a8..3c4153bf5cd 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1088,9 +1088,9 @@ class BaseZarrTest(CFEncodedDataTest): DIMENSION_KEY = '_ARRAY_DIMENSIONS' @contextlib.contextmanager - def create_store(self, **open_kwargs): + def create_store(self): with self.create_zarr_target() as store_target: - yield backends.ZarrStore(store=store_target, **open_kwargs) + yield backends.ZarrStore.open_group(store_target, mode='w') @contextlib.contextmanager def roundtrip(self, data, save_kwargs={}, open_kwargs={}, @@ -1181,7 +1181,7 @@ def test_vectorized_indexing(self): def test_hidden_zarr_keys(self): expected = create_test_data() - with self.create_store(mode='w') as store: + with self.create_store() as store: expected.dump_to_store(store) zarr_group = store.ds @@ -1189,7 +1189,8 @@ def test_hidden_zarr_keys(self): assert self.DIMENSION_KEY in zarr_group.attrs # check that a variable hidden attribute is present and correct - # for some reason, one is a list and the other a tuple + # JSON only has a single array type, which maps to list in Python. + # In contrast, dims in xarray is always a tuple. 
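        # A sketch of the JSON behavior described above:
        #     import json
        #     json.loads(json.dumps(('x', 'y')))  # -> ['x', 'y'] -- tuples come back as lists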
for var in expected.variables.keys(): assert (zarr_group[var].attrs[self.DIMENSION_KEY] == list(expected[var].dims)) diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index b2796f59189..1813e2b6df8 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -201,6 +201,6 @@ def test_hidden_key_dict(): for k, v in data_expected.items(): assert hkd[k] == v with pytest.raises(KeyError): - _ = hkd[hidden_key] + hkd[hidden_key] with pytest.raises(KeyError): del hkd[hidden_key] From 8cd17074629ef0f2ee1ab383d35e19d68253a1de Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 7 Dec 2017 09:58:18 -0500 Subject: [PATCH 59/68] final fixes --- xarray/backends/api.py | 2 +- xarray/backends/zarr.py | 27 +++++++++++++-------------- xarray/tests/test_backends.py | 26 ++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 20c0f180a7c..44c37d1cbf4 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -715,7 +715,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None, store.close() -def to_zarr(dataset, store=None, mode='a', synchronizer=None, group=None, +def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None, encoding=None): """This function creates an appropriate datastore for writing a dataset to disk a zarr ztore diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index b4a530b3d9f..f6bb88f2c17 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -56,7 +56,7 @@ def _replace_slices_with_arrays(key, shape): """Replace slice objects in vindex with equivalent ndarray objects.""" num_slices = sum(1 for k in key if isinstance(k, slice)) array_subspace_size = max( - (k.ndim for k in key if isinstance(k, np.ndarray)), default=0) + (k.ndim for k in key if isinstance(k, np.ndarray)) + (0,)) assert len(key) == len(shape) new_key = [] slice_count = 0 @@ -133,7 +133,7 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): raise ValueError( "Zarr requires uniform chunk sizes except for final chunk." " Variable %r has incompatible chunks. Consider " - "rechunking using `chunk()`." % var_chunks) + "rechunking using `chunk()`." % (var_chunks,)) # last chunk is allowed to be smaller last_var_chunk = all_var_chunks[-1] for len_first, len_last in zip(first_var_chunk, last_var_chunk): @@ -284,19 +284,22 @@ def open_group(cls, store, mode='r', synchronizer=None, group=None, import zarr zarr_group = zarr.open_group(store=store, mode=mode, synchronizer=synchronizer, path=group) - return cls(zarr_group, mode=mode, synchronizer=synchronizer, - group=group, writer=writer) + return cls(zarr_group, writer=writer) - def __init__(self, zarr_group, mode='r', synchronizer=None, group=None, - writer=None): + def __init__(self, zarr_group, writer=None): self.ds = zarr_group - self._mode = mode - self._synchronizer = synchronizer - self._group = group + self._read_only = self.ds.read_only + self._synchronizer = self.ds.synchronizer + self._group = self.ds.path - # initialize hidden dimension attribute if _DIMENSION_KEY not in self.ds.attrs: - self.ds.attrs[_DIMENSION_KEY] = {} + if self._read_only: + raise KeyError("Zarr group can't be read by xarray because " + "it is missing the `%s` attribute." % + _DIMENSION_KEY) + else: + # initialize hidden dimension attribute + self.ds.attrs[_DIMENSION_KEY] = {} # do we need to define attributes for all of the opener keyword args? 
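        # A rough usage sketch of the constructor split above (store path and
        # group name hypothetical):
        #     store = ZarrStore.open_group('out.zarr', mode='w', group='child')
        # which is roughly zarr.open_group(store='out.zarr', mode='w',
        # path='child') followed by ZarrStore(zarr_group, writer=None).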
super(ZarrStore, self).__init__(writer) @@ -418,7 +421,7 @@ def store(self, variables, attributes, *args, **kwargs): # avoid two workers attempting to modify the same chunk at the same time. -def open_zarr(store, mode='r+', group=None, synchronizer=None, auto_chunk=True, +def open_zarr(store, mode='r', group=None, synchronizer=None, auto_chunk=True, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, decode_coords=True, drop_variables=None): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3c4153bf5cd..2a3966ace29 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1147,9 +1147,11 @@ def test_chunk_encoding_with_dask(self): # should fail if dask_chunks are irregular... ds_chunk_irreg = ds.chunk({'x': (5, 4, 3)}) - with pytest.raises(ValueError) as e_info: + with pytest.raises(ValueError) as e_info: with self.roundtrip(ds_chunk_irreg) as actual: pass + # make sure this error message is correct and not some other error + assert e_info.match('chunks') # ... except if the last chunk is smaller than the first ds_chunk_irreg = ds.chunk({'x': (5, 5, 2)}) @@ -1165,9 +1167,10 @@ def test_chunk_encoding_with_dask(self): # specify incompatible encoding ds_chunk4['var1'].encoding.update({'chunks': (5, 5)}) - with pytest.raises(ValueError): + with pytest.raises(ValueError) as e_info: with self.roundtrip(ds_chunk4) as actual: pass + assert e_info.match('chunks') # TODO: remove this failure once synchronized overlapping writes are # supported by xarray @@ -1248,6 +1251,7 @@ def test_read_persistence_modes(self): self.assertDatasetIdentical(original, actual) # TODO: actually do something with 'r+' that is different from 'r' + # what does 'r+' really mean for xarray? with self.roundtrip(original, open_kwargs={'mode': 'r+'}) as actual: self.assertDatasetIdentical(original, actual) @@ -1264,6 +1268,24 @@ def test_compressor_encoding(self): with self.roundtrip(original, save_kwargs=save_kwargs) as actual: assert actual.var1.encoding['compressor'] == blosc_comp + def test_group(self): + original = create_test_data() + group = 'some/random/path' + with self.roundtrip(original, save_kwargs={'group': group}, + open_kwargs={'group': group}) as actual: + self.assertDatasetIdentical(original, actual) + with pytest.raises(KeyError): + with self.roundtrip(original, + save_kwargs={'group': group}) as actual: + self.assertDatasetIdentical(original, actual) + # if we open the dataset without specifying group but with mode='r+', + # no error is raised, but the variables are not there. This is because + # xarray creates the attribute it needs. Is this the right behavior? + # Should we even allow 'r+' mode? 
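        # For reference, a nested group like the one tested above is expected
        # to map onto prefixed keys inside the store, e.g. array chunks stored
        # under keys like 'some/random/path/var1/0.0'.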
+ with self.roundtrip(original, save_kwargs={'group': group}, + open_kwargs={'mode': 'r+'}) as actual: + assert len(actual.variables) == 0 + # TODO: implement zarr object encoding and make these tests pass @pytest.mark.xfail(reason="Zarr object encoding not implemented") def test_multiindex_not_implemented(self): From ac27411d83f573763982420d2b342613b78db642 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 7 Dec 2017 11:01:54 -0500 Subject: [PATCH 60/68] put back @shoyer's original max function --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index f6bb88f2c17..e49f663fba7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -56,7 +56,7 @@ def _replace_slices_with_arrays(key, shape): """Replace slice objects in vindex with equivalent ndarray objects.""" num_slices = sum(1 for k in key if isinstance(k, slice)) array_subspace_size = max( - (k.ndim for k in key if isinstance(k, np.ndarray)) + (0,)) + (k.ndim for k in key if isinstance(k, np.ndarray)), default=0) assert len(key) == len(shape) new_key = [] slice_count = 0 From 618bf81a2ef9e286b9a02deeb35a97c3920407f8 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 7 Dec 2017 12:43:20 -0500 Subject: [PATCH 61/68] another try with 2.7-safe max function --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index e49f663fba7..cb689aa5d10 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -56,7 +56,7 @@ def _replace_slices_with_arrays(key, shape): """Replace slice objects in vindex with equivalent ndarray objects.""" num_slices = sum(1 for k in key if isinstance(k, slice)) array_subspace_size = max( - (k.ndim for k in key if isinstance(k, np.ndarray)), default=0) + (k.ndim for k in key if isinstance(k, np.ndarray)) or [0]) assert len(key) == len(shape) new_key = [] slice_count = 0 From e9421305dda65a1462fd50f6d96846413f88cc31 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Thu, 7 Dec 2017 13:53:10 -0500 Subject: [PATCH 62/68] put back @shoyer's original max function --- xarray/backends/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index cb689aa5d10..e49f663fba7 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -56,7 +56,7 @@ def _replace_slices_with_arrays(key, shape): """Replace slice objects in vindex with equivalent ndarray objects.""" num_slices = sum(1 for k in key if isinstance(k, slice)) array_subspace_size = max( - (k.ndim for k in key if isinstance(k, np.ndarray)) or [0]) + (k.ndim for k in key if isinstance(k, np.ndarray)), default=0) assert len(key) == len(shape) new_key = [] slice_count = 0 From b1fa69023023f7c04c72e90c32a42ebd7751f842 Mon Sep 17 00:00:00 2001 From: Ryan Abernathey Date: Fri, 8 Dec 2017 11:23:26 -0500 Subject: [PATCH 63/68] bypass lock on ArrayWriter --- xarray/backends/common.py | 5 +++-- xarray/backends/zarr.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/xarray/backends/common.py b/xarray/backends/common.py index d33bffb1c1e..fd408877f87 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -164,9 +164,10 @@ def __exit__(self, exception_type, exception_value, traceback): class ArrayWriter(object): - def __init__(self): + def __init__(self, lock=GLOBAL_LOCK): self.sources = [] self.targets = [] + self.lock = lock def add(self, 
From b1fa69023023f7c04c72e90c32a42ebd7751f842 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Fri, 8 Dec 2017 11:23:26 -0500
Subject: [PATCH 63/68] bypass lock on ArrayWriter

---
 xarray/backends/common.py |  5 +++--
 xarray/backends/zarr.py   | 11 +++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/xarray/backends/common.py b/xarray/backends/common.py
index d33bffb1c1e..fd408877f87 100644
--- a/xarray/backends/common.py
+++ b/xarray/backends/common.py
@@ -164,9 +164,10 @@ def __exit__(self, exception_type, exception_value, traceback):

 class ArrayWriter(object):
-    def __init__(self):
+    def __init__(self, lock=GLOBAL_LOCK):
         self.sources = []
         self.targets = []
+        self.lock = lock

     def add(self, source, target):
         if isinstance(source, dask_array_type):
@@ -184,7 +185,7 @@ def sync(self):
             import dask.array as da
             import dask
             if LooseVersion(dask.__version__) > LooseVersion('0.8.1'):
-                da.store(self.sources, self.targets, lock=GLOBAL_LOCK)
+                da.store(self.sources, self.targets, lock=self.lock)
             else:
                 da.store(self.sources, self.targets)
             self.sources = []
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index b4a530b3d9f..7f955493107 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -10,7 +10,7 @@
 from ..core import indexing
 from ..core.utils import FrozenOrderedDict, HiddenKeyDict
 from ..core.pycompat import iteritems, OrderedDict, integer_types
-from .common import (AbstractWritableDataStore, BackendArray)
+from .common import AbstractWritableDataStore, BackendArray, ArrayWriter
 from .. import conventions

 # need some special secret attributes to tell us the dimensions
@@ -298,8 +298,15 @@ def __init__(self, zarr_group, mode='r', synchronizer=None, group=None,
         if _DIMENSION_KEY not in self.ds.attrs:
             self.ds.attrs[_DIMENSION_KEY] = {}

+        if writer is None:
+            # by default, we should not need a lock for writing zarr because
+            # we do not (yet) allow overlapping chunks during write
+            zarr_writer = ArrayWriter(lock=None)
+        else:
+            zarr_writer = writer
+
         # do we need to define attributes for all of the opener keyword args?
-        super(ZarrStore, self).__init__(writer)
+        super(ZarrStore, self).__init__(zarr_writer)

     def open_store_variable(self, name, zarr_array):
         data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self))
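The effect of threading a lock through ``ArrayWriter`` can be seen with dask and zarr alone; a minimal sketch, assuming dask chunks map one-to-one onto zarr chunks (the array size and chunking here are arbitrary)::

    import dask.array as da
    import numpy as np
    import zarr

    source = da.arange(100, chunks=10)
    target = zarr.zeros(100, chunks=10, dtype=source.dtype)

    # with aligned chunks, no two tasks write the same zarr chunk, so
    # skipping the global lock is safe and avoids needless serialization
    da.store([source], [target], lock=False)
    assert np.array_equal(np.asarray(target), np.arange(100))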
From ba200c1ed6ac0e0f923e0cf04f4b0f0439825c11 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Fri, 8 Dec 2017 11:57:24 -0500
Subject: [PATCH 64/68] eliminate read mode

---
 xarray/backends/api.py        |  2 +-
 xarray/backends/zarr.py       | 13 +++++--------
 xarray/tests/test_backends.py | 24 +-----------------------
 3 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 44c37d1cbf4..cdeb8c0c0c2 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -718,7 +718,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
 def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
             encoding=None):
     """This function creates an appropriate datastore for writing a dataset to
-    disk a zarr ztore
+    a zarr store

     See `Dataset.to_zarr` for full API docs.
     """
diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 3957ae069a7..baa168dae60 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -428,7 +428,7 @@ def store(self, variables, attributes, *args, **kwargs):
 # avoid two workers attempting to modify the same chunk at the same time.


-def open_zarr(store, mode='r', group=None, synchronizer=None, auto_chunk=True,
+def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
               decode_cf=True, mask_and_scale=True, decode_times=True,
               concat_characters=True, decode_coords=True,
               drop_variables=None):
@@ -447,11 +447,8 @@ def open_zarr(store, mode='r', group=None, synchronizer=None, auto_chunk=True,
     store : MutableMapping or str
         A MutableMapping where a Zarr Group has been stored or a path to a
         directory in file system where a Zarr DirectoryStore has been stored.
-    mode : {'r', 'r+'}
-        Persistence mode: 'r' means read only (must exist); 'r+' means
-        read/write (must exist)
     synchronizer : object, optional
-        Array synchronizer
+        Array synchronizer provided to zarr
     group : str, optional
         Group path. (a.k.a. `path` in zarr terminology.)
     auto_chunk : bool, optional
@@ -498,9 +495,6 @@ def open_zarr(store, mode='r', group=None, synchronizer=None, auto_chunk=True,
     ----------
     http://zarr.readthedocs.io/
     """
-    # note: there is no way to actually use 'r+' yet
-    if mode not in ['r', 'r+']:
-        raise ValueError("Mode must be 'r' or 'r+'.")

     if not decode_cf:
         mask_and_scale = False
@@ -518,6 +512,9 @@ def maybe_decode_store(store, lock=False):

         return ds

+    # Zarr supports a wide range of access modes, but for now xarray either
+    # reads or writes from a store, never both. For open_zarr, we only read
+    mode = 'r'
     zarr_store = ZarrStore.open_group(store, mode=mode,
                                       synchronizer=synchronizer, group=group)
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 2a3966ace29..e6dd4daa784 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1236,7 +1236,7 @@ def test_write_persistence_modes(self):
         actual = xr.open_zarr(store)
         self.assertDatasetIdentical(original, actual)
         with pytest.raises(ValueError):
-            xr.open_zarr(store, mode='w-')
+            original.to_zarr(store, mode='w-')

         # check that we can't use other persistence modes
         # TODO: reconsider whether other persistence modes should be supported
@@ -1244,21 +1244,6 @@ def test_write_persistence_modes(self):
         with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual:
             pass

-    def test_read_persistence_modes(self):
-        original = create_test_data()
-
-        with self.roundtrip(original, open_kwargs={'mode': 'r'}) as actual:
-            self.assertDatasetIdentical(original, actual)
-
-        # TODO: actually do something with 'r+' that is different from 'r'
-        # what does 'r+' really mean for xarray?
-        with self.roundtrip(original, open_kwargs={'mode': 'r+'}) as actual:
-            self.assertDatasetIdentical(original, actual)
-
-        with pytest.raises(ValueError):
-            with self.roundtrip(original, open_kwargs={'mode': 'w'}) as actual:
-                pass
-
     def test_compressor_encoding(self):
         original = create_test_data()
         # specify a custom compressor
@@ -1278,13 +1263,6 @@ def test_group(self):
         with self.roundtrip(original,
                             save_kwargs={'group': group}) as actual:
             self.assertDatasetIdentical(original, actual)
-        # if we open the dataset without specifying group but with mode='r+',
-        # no error is raised, but the variables are not there. This is because
-        # xarray creates the attribute it needs. Is this the right behavior?
-        # Should we even allow 'r+' mode?
-        with self.roundtrip(original, save_kwargs={'group': group},
-                            open_kwargs={'mode': 'r+'}) as actual:
-            assert len(actual.variables) == 0

     # TODO: implement zarr object encoding and make these tests pass
     @pytest.mark.xfail(reason="Zarr object encoding not implemented")
     def test_multiindex_not_implemented(self):
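With the ``mode`` argument gone, reading is a single call and only writing keeps its persistence modes. A minimal sketch of the resulting API (the store path is an illustrative placeholder)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({'var1': ('x', np.arange(4))})
    ds.to_zarr('example.zarr')          # to_zarr still accepts mode ('w-' by default)
    ds2 = xr.open_zarr('example.zarr')  # open_zarr no longer takes a mode argument
    assert ds2.identical(ds)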
From 8dafaf758bbccb32f7e6173473e51921285ea48e Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Fri, 8 Dec 2017 13:18:04 -0500
Subject: [PATCH 65/68] added zarr distributed integration test

---
 xarray/tests/test_distributed.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py
index 2761f85f3af..1d0c51322a1 100644
--- a/xarray/tests/test_distributed.py
+++ b/xarray/tests/test_distributed.py
@@ -13,7 +13,8 @@
 from xarray.tests.test_backends import create_tmp_file, ON_WINDOWS
 from xarray.tests.test_dataset import create_test_data

-from . import assert_allclose, has_scipy, has_netCDF4, has_h5netcdf
+from . import (assert_allclose, has_scipy, has_netCDF4, has_h5netcdf,
+               requires_zarr)

 ENGINES = []
@@ -28,7 +29,7 @@
 @pytest.mark.xfail(sys.platform == 'win32',
                    reason='https://github.com/pydata/xarray/issues/1738')
 @pytest.mark.parametrize('engine', ENGINES)
-def test_dask_distributed_integration_test(loop, engine):
+def test_dask_distributed_netcdf_integration_test(loop, engine):
     with cluster() as (s, _):
         with distributed.Client(s['address'], loop=loop):
             original = create_test_data()
@@ -39,6 +40,17 @@ def test_dask_distributed_integration_test(loop, engine):
                     computed = restored.compute()
                     assert_allclose(original, computed)

+@requires_zarr
+def test_dask_distributed_zarr_integration_test(loop):
+    with cluster() as (s, _):
+        with distributed.Client(s['address'], loop=loop):
+            original = create_test_data()
+            with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as filename:
+                original.to_zarr(filename)
+                with xr.open_zarr(filename) as restored:
+                    assert isinstance(restored.var1.data, da.Array)
+                    computed = restored.compute()
+                    assert_allclose(original, computed)

 @pytest.mark.skipif(distributed.__version__ <= '1.19.3',
                     reason='Need recent distributed version to clean up get')
From 85174cda6440c2f6eed7860357e79897e796e623 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Fri, 8 Dec 2017 14:52:09 -0500
Subject: [PATCH 66/68] fixed max bug

---
 xarray/backends/zarr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index baa168dae60..231ab3522c1 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -55,8 +55,8 @@ def _decode_zarr_attrs(attrs):
 def _replace_slices_with_arrays(key, shape):
     """Replace slice objects in vindex with equivalent ndarray objects."""
     num_slices = sum(1 for k in key if isinstance(k, slice))
-    array_subspace_size = max(
-        (k.ndim for k in key if isinstance(k, np.ndarray)), default=0)
+    ndims = [k.ndim for k in key if isinstance(k, np.ndarray)]
+    array_subspace_size = max(ndims) if ndims else 0
     assert len(key) == len(shape)
     new_key = []
     slice_count = 0
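The list-based spelling behaves identically on Python 2 and 3, since an empty list is falsy where an empty generator is not. A quick check of the fallback logic, pulled out of the hunk above into a standalone helper::

    import numpy as np

    def subspace_size(key):
        # mirrors the fixed lines: empty list -> falsy -> fall back to 0
        ndims = [k.ndim for k in key if isinstance(k, np.ndarray)]
        return max(ndims) if ndims else 0

    assert subspace_size((slice(None), slice(None))) == 0  # no array indexers
    assert subspace_size((slice(None), np.arange(3))) == 1  # one 1-d indexer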
From c76a01b8d284383a009d36f6a0ddae10bf9499f5 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Mon, 11 Dec 2017 12:00:26 -0500
Subject: [PATCH 67/68] change lock to False

---
 xarray/backends/zarr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py
index 231ab3522c1..779d8d07886 100644
--- a/xarray/backends/zarr.py
+++ b/xarray/backends/zarr.py
@@ -304,7 +304,7 @@ def __init__(self, zarr_group, writer=None):
         if writer is None:
             # by default, we should not need a lock for writing zarr because
             # we do not (yet) allow overlapping chunks during write
-            zarr_writer = ArrayWriter(lock=None)
+            zarr_writer = ArrayWriter(lock=False)
         else:
             zarr_writer = writer

From c011c2d36bdb47f0a9fdcd67452605175394e3c3 Mon Sep 17 00:00:00 2001
From: Ryan Abernathey
Date: Mon, 11 Dec 2017 12:03:52 -0500
Subject: [PATCH 68/68] fix doc typos

---
 doc/io.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/io.rst b/doc/io.rst
index dbf9e0b2123..14e82d4aacc 100644
--- a/doc/io.rst
+++ b/doc/io.rst
@@ -522,7 +522,7 @@ Xarray's Zarr backend allows xarray to leverage these capabilities.
 Xarray can't open just any zarr dataset, because xarray requires special
 metadata (attributes) describing the dataset dimensions and coordinates.
 At this time, xarray can only open zarr datasets that have been written by
 xarray. To write a dataset with zarr, we use the
 :py:attr:`Dataset.to_zarr <xarray.Dataset.to_zarr>` method.

 To write to a local directory, we pass a path to a directory
@@ -540,9 +540,9 @@ To write to a local directory, we pass a path to a directory
     ds.to_zarr('path/to/directory.zarr')

 (The suffix ``.zarr`` is optional--just a reminder that a zarr store lives
-there.) If the directory does not exist, it will be created. If a a zarr
-store is already present at that path an error will be raised, preventing it
-from being overwritten. To override this behavior and overwrite and existing
+there.) If the directory does not exist, it will be created. If a zarr
+store is already present at that path, an error will be raised, preventing it
+from being overwritten. To override this behavior and overwrite an existing
 store, add ``mode='w'`` when invoking ``to_zarr``.

 To read back a zarr dataset that has been created this way, we use the
@@ -557,7 +557,7 @@ Cloud Storage Buckets
 ~~~~~~~~~~~~~~~~~~~~~

 It is possible to read and write xarray datasets directly from / to cloud
-storage buckets using zarr. This example uses the `gcsfs`_ pacakge to provide
+storage buckets using zarr. This example uses the `gcsfs`_ package to provide
 a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then
 pass to xarray::