-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP: Zarr backend #1528
WIP: Zarr backend #1528
Changes from 2 commits
5cdf6c8
f305c25
2ea21c5
d92bf2f
79da971
31e4409
2ec5ee5
bd21720
7e898fc
af5ff6c
9e7cc09
3f01365
41cf706
fd9fd0f
9f16e8f
fe9ebe7
c01cd09
b3e5d76
45375b2
0e79718
3d39ade
3d09c67
0b4a27a
f39035c
6446ea2
9136064
2966100
6bedf22
ced8267
e461cdb
049bf9e
c169128
82ef456
3ee243e
e20c29f
f82c8c1
43e539f
66299f0
2fce362
c19b81a
68b8f07
0ea0dad
58b3bf0
9da22da
a8b4785
2a6a776
021d3ba
5ef10d2
e47d936
a4b024e
d8842a6
54d116d
94678f4
64942e5
f584456
c43284e
9df6e50
012e858
b1819f4
8eb98c9
64bd76c
cffa158
3b4a941
688f415
c115a2b
4c92531
61027eb
bbaa776
c8f23a5
f0c76f7
a84e388
37bc2f0
8cd1707
ac27411
618bf81
e942130
b1fa690
4089d13
ba200c1
8dafaf7
85174cd
c76a01b
c011c2d
054ffeb
f5633ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
from __future__ import absolute_import | ||
from __future__ import division | ||
from __future__ import print_function | ||
import functools | ||
import warnings | ||
from itertools import product | ||
from collections import MutableMapping | ||
|
||
from .. import Variable | ||
from ..core import indexing | ||
from ..core.utils import FrozenOrderedDict, close_on_error, HiddenKeyDict | ||
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict | ||
|
||
from .common import (WritableCFDataStore, AbstractWritableDataStore, | ||
DataStorePickleMixin) | ||
|
||
|
||
|
||
|
||
# most of the other stores have some kind of wrapper class like | ||
# class BaseNetCDF4Array(NdimSizeLenMixin, DunderArrayMixin): | ||
# class H5NetCDFArrayWrapper(BaseNetCDF4Array): | ||
# class NioArrayWrapper(NdimSizeLenMixin, DunderArrayMixin): | ||
# we problaby need something like this | ||
|
||
# the first question is whether it should be based on BaseNetCDF4Array or | ||
# NdimSizeLenMixing? | ||
|
||
# or maybe we don't need wrappers at all? probably not true | ||
|
||
|
||
# also most have a custom opener | ||
|
||
# keyword args for zarr.group | ||
# store=None, overwrite=False, chunk_store=None, synchronizer=None, path=None | ||
# the group name is called "path" in the zarr lexicon | ||
|
||
def _open_zarr_group(store, overwrite, chunk_store, synchronizer, path): | ||
import zarr | ||
zarr_group = zarr.group(store=store, overwrite=overwrite, | ||
chunk_store=chunk_store, synchronizer=synchronizer, path=path) | ||
return zarr_group | ||
|
||
|
||
def _dask_chunks_to_zarr_chunks(chunks): | ||
# zarr chunks needs to be uniform for each array | ||
# http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks | ||
# dask chunks can be variable sized | ||
# http://dask.pydata.org/en/latest/array-design.html#chunks | ||
# this function dask chunks syntax to zarr chunks | ||
if chunks is None: | ||
return chunks | ||
|
||
all_chunks = product(*chunks) | ||
first_chunk = all_chunks.next() | ||
for this_chunk in all_chunks: | ||
if not (this_chunk == first_chunk): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
raise ValueError("zarr requires uniform chunk sizes, found %s" % | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Calling Note that zarr does allow chunks that overlap the edge of the array (i.e., the last chunk of a dask array). This use case might be important when storing arrays with unusual dimension sizes (e.g., prime numbers). |
||
repr(chunks)) | ||
return first_chunk | ||
|
||
|
||
def _get_zarr_dims_and_attrs(zarr_obj, dimension_key): | ||
# Zarr arrays do not have dimenions. To get around this problem, we add | ||
# an attribute that specifies the dimension. We have to hide this attribute | ||
# when we send the attributes to the user. | ||
# zarr_obj can be either a zarr group or zarr array | ||
dimensions = zarr_obj.attrs.get(dimension_key) | ||
attributes = HiddenKeyDict(zarr_obj.attrs, dimension_key) | ||
return dimensions, attributes | ||
|
||
|
||
class ZarrStore(AbstractWritableDataStore, DataStorePickleMixin): | ||
"""Store for reading and writing data via zarr | ||
""" | ||
|
||
# need some special secret attributes to tell us the dimensions | ||
_dimension_key = '_XARRAY_DIMENSIONS' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: should be Also: maybe better to pick something more generic for the constant value, perhaps |
||
|
||
def __init__(self, store=None, overwrite=False, chunk_store=None, | ||
synchronizer=None, path=None, writer=None, autoclose=False): | ||
opener = functools.partial(_open_zarr_group, store, overwrite, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's try to follow something closer to the model for
This preserves a little bit more flexibility for downstream users. |
||
chunk_store, synchronizer, path) | ||
self.ds = opener() | ||
if autoclose: | ||
raise NotImplementedError('autoclose=True is not implemented ' | ||
'for the zarr backend') | ||
self._autoclose = False | ||
self._isopen = True | ||
self._opener = opener | ||
|
||
# initialize hidden dimension attribute | ||
self.ds.attrs[self._dimension_key] = {} | ||
|
||
# do we need to define attributes for all of the opener keyword args? | ||
super(ZarrStore, self).__init__(writer) | ||
|
||
def open_store_variable(self, name, zarr_array): | ||
# I don't see why it is necessary to wrap self.ds[name] | ||
# zarr seems to implement the required ndarray interface | ||
# TODO: possibly wrap zarr array in dask with aligned chunks | ||
data = indexing.LazilyIndexedArray(zarr_array) | ||
dimensions, attributes = _get_zarr_dims_and_attrs( | ||
zarr_array, self._dimension_key) | ||
return Variable(dimensions, data, attributes) | ||
|
||
def get_variables(self): | ||
with self.ensure_open(autoclose=False): | ||
return FrozenOrderedDict((k, self.open_store_variable(k, v)) | ||
for k, v in self.ds.arrays()) | ||
|
||
def get_attrs(self): | ||
with self.ensure_open(autoclose=True): | ||
_, attributes = _get_zarr_dims_and_attrs(self.ds, | ||
self._dimension_key) | ||
attrs = FrozenOrderedDict(attributes) | ||
return attrs | ||
|
||
def get_dimensions(self): | ||
with self.ensure_open(autoclose=True): | ||
dimensions, _ = _get_zarr_dims_and_attrs(self.ds, | ||
self._dimension_key) | ||
return dimensions | ||
|
||
def set_dimension(self, name, length): | ||
with self.ensure_open(autoclose=False): | ||
self.ds.attrs[self._dimension_key][name] = length | ||
|
||
def set_attribute(self, key, value): | ||
with self.ensure_open(autoclose=False): | ||
_, attributes = _get_zarr_dims_and_attrs(self.ds, | ||
self._dimension_key) | ||
attributes[key] = value | ||
|
||
def prepare_variable(self, name, variable, check_encoding=False, | ||
unlimited_dims=None): | ||
|
||
attrs = variable.attrs.copy() | ||
dims = variable.dims | ||
dtype = variable.dtype | ||
shape = variable.shape | ||
chunks = _dask_chunks_to_zarr_chunks(variable.chunks) | ||
|
||
# TODO: figure ouw how zarr should deal with unlimited dimensions | ||
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) | ||
|
||
# let's try keeping this fill value stuff | ||
fill_value = attrs.pop('_FillValue', None) | ||
if fill_value in ['\x00']: | ||
fill_value = None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I explain in my comment, I am a bit confused about how to handle |
||
|
||
# TODO: figure out what encoding is needed for zarr | ||
|
||
### arguments for zarr.create | ||
# zarr.creation.create(shape, chunks=None, dtype=None, compressor='default', | ||
# fill_value=0, order='C', store=None, synchronizer=None, overwrite=False, | ||
# path=None, chunk_store=None, filters=None, cache_metadata=True, **kwargs) | ||
|
||
# TODO: figure out how to pass along all those other arguments | ||
|
||
zarr_array = self.ds.create(name, shape=shape, dtype=dtype, | ||
chunks=chunks, fill_value=fill_value) | ||
zarr_array.attrs[self._dimension_key] = dims | ||
_, attributes = _get_zarr_dims_and_attrs(zarr_array, | ||
self._dimension_key) | ||
|
||
for k, v in iteritems(attrs): | ||
attributes[k] = v | ||
|
||
return zarr_array, variable.data | ||
|
||
# sync() and close() methods should not be needed with zarr |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -489,3 +489,38 @@ def ensure_us_time_resolution(val): | |
elif np.issubdtype(val.dtype, np.timedelta64): | ||
val = val.astype('timedelta64[us]') | ||
return val | ||
|
||
|
||
class HiddenKeyDict(MutableMapping): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This needs test coverage. |
||
''' | ||
Acts like a normal dictionary, but hides certain keys. | ||
''' | ||
# ``__init__`` method required to create instance from class. | ||
def __init__(self, data, *hidden_keys): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: I prefer avoiding There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you suggest the best way to tell whether an argument is a string or list of strings? This is something I always need to do but don't know the "correct" pythonic way to do it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if you know it is an iterable, |
||
self._data = data | ||
self._hidden_keys = hidden_keys | ||
|
||
def _raise_if_hidden(self, key): | ||
if key in self._hidden_keys: | ||
raise KeyError('Key is hidden.') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Print the offending key in the error. |
||
|
||
# The next five methods are requirements of the ABC. | ||
def __setitem__(self, key, value): | ||
self._raise_if_hidden(key) | ||
self._data[key] = value | ||
|
||
def __getitem__(self, key): | ||
self._raise_if_hidden(key) | ||
return self._data[key] | ||
|
||
def __delitem__(self, key): | ||
self._raise_if_hidden(key) | ||
del self._data[key] | ||
|
||
def __iter__(self): | ||
for k in self._data: | ||
if k not in self._hidden_keys: | ||
yield k | ||
|
||
def __len__(self): | ||
return len(list(self.__iter__())) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would certainly try to use |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually think we probably don't need a wrapper at all -- zarr already defines all these attributes!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This time around I did add the wrapper.