WIP: Zarr backend (#1528)
* added HiddenKeyDict class

* new zarr backend

* add zarr to ci reqs

* add zarr api to docs

* some zarr tests passing

* requires zarr decorator

* wip

* added chunking test

* remove debugging statements

* fixed HiddenKeyDict

* wip

* finished merge

* create opener object

* trying to get caching working

* caching still not working

* updating zarr backend with new indexing mixins

* added new zarr dev test env

* update travis

* move zarr-dev to travis allowed failures

* fix typo in env file

* wip

* fixed zarr auto_chunk

* refactored zarr tests

* new encoding test

* cleanup and buildout ZarrArrayWrapper, vectorized indexing

* more wip

* very close to passing all tests

* modified inheritance

* subclass AbstractWriteableDataStore

* xfailed certain tests

* pr comments wip

* removed autoclose

* new test for chunk encoding

* added another test

* tests for HiddenKeyDict

* flake8

* zarr version update

* added more tests

* added compressor test

* docs

* weird ascii character issue

* doc fixes

* what's new

* more file encoding nightmares

* Tests for backends.zarr._replace_slices_with_arrays

(and misc. small cleanup, adding docstrings + a bit more validation logic)

* respond to @shoyer's review

* final fixes

* put back @shoyer's original max function

* another try with 2.7-safe max function

* put back @shoyer's original max function

* bypass lock on ArrayWriter

* eliminate read mode

* added zarr distributed integration test

* fixed max bug

* change lock to False

* fix doc typos
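Several commits above revolve around the new ``HiddenKeyDict`` helper. As a rough illustration of the idea only (a hypothetical reconstruction, not necessarily the exact class added in this PR): a mapping wrapper that hides a set of reserved keys from lookup and iteration, so internal zarr metadata keys stay invisible to users of the store.

```python
from collections.abc import MutableMapping


class HiddenKeyDict(MutableMapping):
    """Wrap a mapping while hiding a set of reserved keys."""

    def __init__(self, data, hidden_keys):
        self._data = data
        self._hidden_keys = frozenset(hidden_keys)

    def _raise_if_hidden(self, key):
        if key in self._hidden_keys:
            raise KeyError('Key `%r` is hidden.' % key)

    def __setitem__(self, key, value):
        self._raise_if_hidden(key)
        self._data[key] = value

    def __getitem__(self, key):
        self._raise_if_hidden(key)
        return self._data[key]

    def __delitem__(self, key):
        self._raise_if_hidden(key)
        del self._data[key]

    def __iter__(self):
        # hidden keys are skipped entirely during iteration
        return (k for k in self._data if k not in self._hidden_keys)

    def __len__(self):
        return sum(1 for _ in self)
```

Hiding (rather than deleting) the keys keeps the underlying store intact while presenting a clean view to callers.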
rabernat authored Dec 14, 2017
1 parent 89a1a98 commit 8fe7eb0
Showing 27 changed files with 1,083 additions and 17 deletions.
4 changes: 4 additions & 0 deletions .travis.yml
@@ -43,6 +43,8 @@ matrix:
env: CONDA_ENV=py36-pynio-dev
- python: 3.6
env: CONDA_ENV=py36-rasterio1.0alpha
- python: 3.6
env: CONDA_ENV=py36-zarr-dev
allow_failures:
- python: 3.6
env:
@@ -67,6 +69,8 @@ matrix:
env: CONDA_ENV=py36-pynio-dev
- python: 3.6
env: CONDA_ENV=py36-rasterio1.0alpha
- python: 3.6
env: CONDA_ENV=py36-zarr-dev

before_install:
- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
1 change: 1 addition & 0 deletions ci/requirements-py27-cdat+pynio.yml
@@ -22,6 +22,7 @@ dependencies:
- seaborn
- toolz
- rasterio
- zarr
- pip:
- coveralls
- pytest-cov
1 change: 1 addition & 0 deletions ci/requirements-py27-windows.yml
@@ -19,3 +19,4 @@ dependencies:
- seaborn
- toolz
- rasterio
- zarr
1 change: 1 addition & 0 deletions ci/requirements-py35.yml
@@ -17,6 +17,7 @@ dependencies:
- seaborn
- toolz
- rasterio
- zarr
- pip:
- coveralls
- pytest-cov
1 change: 1 addition & 0 deletions ci/requirements-py36-windows.yml
@@ -16,3 +16,4 @@ dependencies:
- seaborn
- toolz
- rasterio
- zarr
20 changes: 20 additions & 0 deletions ci/requirements-py36-zarr-dev.yml
@@ -0,0 +1,20 @@
name: test_env
channels:
- conda-forge
dependencies:
- python=3.6
- dask
- distributed
- matplotlib
- pytest
- flake8
- numpy
- pandas
- scipy
- seaborn
- toolz
- bottleneck
- pip:
- coveralls
- pytest-cov
- git+https://github.com/alimanfoo/zarr.git
1 change: 1 addition & 0 deletions ci/requirements-py36.yml
@@ -18,6 +18,7 @@ dependencies:
- toolz
- rasterio
- bottleneck
- zarr
- pip:
- coveralls
- pytest-cov
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -415,7 +415,9 @@ Dataset methods
open_dataset
open_mfdataset
open_rasterio
open_zarr
Dataset.to_netcdf
Dataset.to_zarr
save_mfdataset
Dataset.to_array
Dataset.to_dataframe
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -23,7 +23,7 @@
print("python exec:", sys.executable)
print("sys.path:", sys.path)
for name in ('numpy scipy pandas matplotlib dask IPython seaborn '
'cartopy netCDF4 rasterio').split():
'cartopy netCDF4 rasterio zarr').split():
try:
module = importlib.import_module(name)
if name == 'matplotlib':
1 change: 1 addition & 0 deletions doc/environment.yml
@@ -16,3 +16,4 @@ dependencies:
- cartopy=0.15.1
- rasterio=0.36.0
- sphinx-gallery
- zarr
1 change: 1 addition & 0 deletions doc/installing.rst
@@ -24,6 +24,7 @@ For netCDF and IO
reading and writing netCDF4 files that does not use the netCDF-C libraries
- `pynio <https://www.pyngl.ucar.edu/Nio.shtml>`__: for reading GRIB and other
geoscience specific file formats
- `zarr <http://zarr.readthedocs.io/>`__: for chunked, compressed, N-dimensional arrays.

For accelerating xarray
~~~~~~~~~~~~~~~~~~~~~~~
100 changes: 99 additions & 1 deletion doc/io.rst
@@ -295,7 +295,7 @@ string encoding for character arrays in netCDF files was
Technically, you can use
`any string encoding recognized by Python <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ if you feel the need to deviate from UTF-8,
by setting the ``_Encoding`` field in ``encoding``. But
`we don't recommend it<http://utf8everywhere.org/>`_.
`we don't recommend it <http://utf8everywhere.org/>`_.

.. warning::

@@ -502,6 +502,103 @@ longitudes and latitudes.
.. _test files: https://github.com/mapbox/rasterio/blob/master/tests/data/RGB.byte.tif
.. _pyproj: https://github.com/jswhit/pyproj

.. _io.zarr:

Zarr
----

`Zarr`_ is a Python package providing an implementation of chunked, compressed,
N-dimensional arrays.
Zarr has the ability to store arrays in a range of ways, including in memory,
in files, and in cloud-based object storage such as `Amazon S3`_ and
`Google Cloud Storage`_.
Xarray's Zarr backend allows xarray to leverage these capabilities.

.. warning::

Zarr support is still an experimental feature. Please report any bugs or
unexpected behavior via GitHub issues.

Xarray can't open just any zarr dataset, because xarray requires special
metadata (attributes) describing the dataset dimensions and coordinates.
At this time, xarray can only open zarr datasets that have been written by
xarray. To write a dataset with zarr, we use the
:py:meth:`Dataset.to_zarr <xarray.Dataset.to_zarr>` method.
To write to a local directory, we pass a path to a directory:

.. ipython:: python
    :suppress:

    ! rm -rf path/to/directory.zarr

.. ipython:: python

    ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))},
                    coords={'x': [10, 20, 30, 40],
                            'y': pd.date_range('2000-01-01', periods=5),
                            'z': ('x', list('abcd'))})
    ds.to_zarr('path/to/directory.zarr')

(The suffix ``.zarr`` is optional; it is just a reminder that a zarr store
lives there.) If the directory does not exist, it will be created. If a zarr
store is already present at that path, an error will be raised to prevent it
from being overwritten. To override this behavior and overwrite an existing
store, add ``mode='w'`` when invoking ``to_zarr``.

To read back a zarr dataset that has been created this way, we use the
:py:func:`~xarray.open_zarr` function:

.. ipython:: python

    ds_zarr = xr.open_zarr('path/to/directory.zarr')
    ds_zarr

Cloud Storage Buckets
~~~~~~~~~~~~~~~~~~~~~

It is possible to read and write xarray datasets directly from / to cloud
storage buckets using zarr. This example uses the `gcsfs`_ package to provide
a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then
pass to xarray::

import gcsfs
fs = gcsfs.GCSFileSystem(project='<project-name>', token=None)
gcsmap = gcsfs.mapping.GCSMap('<bucket-name>', gcs=fs, check=True, create=False)
# write to the bucket
ds.to_zarr(store=gcsmap)
# read it back
    ds_gcs = xr.open_zarr(gcsmap)

.. _Zarr: http://zarr.readthedocs.io/
.. _Amazon S3: https://aws.amazon.com/s3/
.. _Google Cloud Storage: https://cloud.google.com/storage/
.. _gcsfs: https://github.com/dask/gcsfs

Zarr Compressors and Filters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

There are many different options for compression and filtering possible with
zarr. These are described in the
`zarr documentation <http://zarr.readthedocs.io/en/stable/tutorial.html#compressors>`_.
These options can be passed to the ``to_zarr`` method as variable encoding.
For example:

.. ipython:: python
    :suppress:

    ! rm -rf foo.zarr

.. ipython:: python

    import zarr
    compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    ds.to_zarr('foo.zarr', encoding={'foo': {'compressor': compressor}})

.. note::

Not all native zarr compression and filtering options have been tested with
xarray.

.. _io.pynio:

Formats supported by PyNIO
@@ -529,6 +626,7 @@ exporting your objects to pandas and using its broad range of `IO tools`_.
.. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html



Combining multiple files
------------------------

7 changes: 5 additions & 2 deletions doc/whats-new.rst
@@ -28,12 +28,15 @@ Enhancements
- Use ``pandas.Grouper`` class in xarray resample methods rather than the
deprecated ``pandas.TimeGrouper`` class (:issue:`1766`).
By `Joe Hamman <https://github.com/jhamman>`_.


- Support for using `Zarr`_ as a storage layer for xarray.
By `Ryan Abernathey <https://github.com/rabernat>`_.
- Experimental support for parsing ENVI metadata to coordinates and attributes
in :py:func:`xarray.open_rasterio`.
By `Matti Eskelinen <https://github.com/maaleske>`_

.. _Zarr: http://zarr.readthedocs.io/


Bug fixes
~~~~~~~~~

1 change: 1 addition & 0 deletions xarray/__init__.py
@@ -18,6 +18,7 @@
from .backends.api import (open_dataset, open_dataarray, open_mfdataset,
save_mfdataset)
from .backends.rasterio_ import open_rasterio
from .backends.zarr import open_zarr

from .conventions import decode_cf, SerializationWarning

1 change: 1 addition & 0 deletions xarray/backends/__init__.py
@@ -10,3 +10,4 @@
from .pynio_ import NioDataStore
from .scipy_ import ScipyDataStore
from .h5netcdf_ import H5NetCDFStore
from .zarr import ZarrStore
26 changes: 26 additions & 0 deletions xarray/backends/api.py
@@ -713,3 +713,29 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
finally:
for store in stores:
store.close()


def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
encoding=None):
"""This function creates an appropriate datastore for writing a dataset to
a zarr store

See `Dataset.to_zarr` for full API docs.
"""
if isinstance(store, path_type):
store = str(store)
if encoding is None:
encoding = {}

# validate Dataset keys, DataArray names, and attr keys/values
_validate_dataset_names(dataset)
_validate_attrs(dataset)

store = backends.ZarrStore.open_group(store=store, mode=mode,
synchronizer=synchronizer,
group=group, writer=None)

# I think zarr stores should always be sync'd immediately
# TODO: figure out how to properly handle unlimited_dims
dataset.dump_to_store(store, sync=True, encoding=encoding)
return store
5 changes: 3 additions & 2 deletions xarray/backends/common.py
@@ -164,9 +164,10 @@ def __exit__(self, exception_type, exception_value, traceback):


class ArrayWriter(object):
def __init__(self):
def __init__(self, lock=GLOBAL_LOCK):
self.sources = []
self.targets = []
self.lock = lock

def add(self, source, target):
if isinstance(source, dask_array_type):
@@ -184,7 +185,7 @@ def sync(self):
import dask.array as da
import dask
if LooseVersion(dask.__version__) > LooseVersion('0.8.1'):
da.store(self.sources, self.targets, lock=GLOBAL_LOCK)
da.store(self.sources, self.targets, lock=self.lock)
else:
da.store(self.sources, self.targets)
self.sources = []
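
The ``ArrayWriter`` change above threads a configurable ``lock`` through to ``dask.array.store`` instead of always using ``GLOBAL_LOCK``. A small sketch of what that argument controls, using an in-memory NumPy target for illustration (the zarr backend passes ``lock=False`` because zarr stores support parallel chunk writes):

```python
import dask.array as da
import numpy as np

source = da.arange(8, chunks=4)           # two chunks, storable independently
target = np.zeros(8, dtype=source.dtype)  # stand-in for an on-disk store

# lock=False skips the shared lock, allowing chunks to be written
# concurrently; a lock object would serialize the writes instead
da.store([source], [target], lock=False)
```

After the call, ``target`` holds the computed contents of ``source``.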
