From 2ee89c36a7ed512a221d38e09e7637429ced22d3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 10 Dec 2019 09:02:00 -0700 Subject: [PATCH] Respect user-specified coordinates attribute. (#3487) * Respect user-specified coordinates attribute. * Add whats-new * Better if statement. * maybe it's better to not raise an error if "coordinates" in attrs. * tweak whats-new. * more thorough test. * Emit one "coordinates" warning per dataset, instead of one per variable. Also add stacklevel * Preserve attrs["coordinates"] when roundtripping with decode_coords=False * Avoid raising warnings. * fix whats-new * [minor] add comments * fix whats-new * Actually test global "coordinates" handling. * filerwarning not necessary. * Add comment * fix whats-new --- doc/io.rst | 16 ++++++++++++- doc/whats-new.rst | 3 +++ xarray/conventions.py | 40 +++++++++++++++++++++----------- xarray/tests/test_backends.py | 37 +++++++++++++++++++++++++++-- xarray/tests/test_conventions.py | 14 +++++++++++ 5 files changed, 93 insertions(+), 17 deletions(-) diff --git a/doc/io.rst b/doc/io.rst index 8f8a776f73a..2e50e5639da 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -437,9 +437,23 @@ like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the cale supported by netCDF4-python: 'standard', 'gregorian', 'proleptic_gregorian' 'noleap', '365_day', '360_day', 'julian', 'all_leap', '366_day'. -By default, xarray uses the 'proleptic_gregorian' calendar and units of the smallest time +By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time difference between values, with a reference time of the first time value. + +.. _io.coordinates: + +Coordinates +........... + +You can control the ``coordinates`` attribute written to disk by specifying ``DataArray.encoding["coordinates"]``. +If not specified, xarray automatically sets ``DataArray.encoding["coordinates"]`` to a space-delimited list +of names of coordinate variables that share dimensions with the ``DataArray`` being written. +This allows perfect roundtripping of xarray datasets but may not be desirable. +When an xarray ``Dataset`` contains non-dimensional coordinates that do not share dimensions with any of +the variables, these coordinate variable names are saved under a "global" ``"coordinates"`` attribute. +This is not CF-compliant but again facilitates roundtripping of xarray datasets. + Invalid netCDF files ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index aa67a46c38e..1f60d457432 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -148,6 +148,9 @@ New Features invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`) By `Deepak Cherian `_ and `Guido Imperiale `_. +- xarray now respects the ``DataArray.encoding["coordinates"]`` attribute when writing to disk. + See :ref:`io.coordinates` for more. (:issue:`3351`, :pull:`3487`) + By `Deepak Cherian `_. - Add the documented-but-missing :py:meth:`DatasetGroupBy.quantile`. (:issue:`3525`, :pull:`3527`). By `Justus Magin `_. diff --git a/xarray/conventions.py b/xarray/conventions.py index a83b4b31c17..a8b9906c153 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -5,7 +5,7 @@ import pandas as pd from .coding import strings, times, variables -from .coding.variables import SerializationWarning +from .coding.variables import SerializationWarning, pop_to from .core import duck_array_ops, indexing from .core.common import contains_cftime_datetimes from .core.pycompat import dask_array_type @@ -660,34 +660,46 @@ def _encode_coordinates(variables, attributes, non_dim_coord_names): and set(target_dims) <= set(v.dims) ): variable_coordinates[k].add(coord_name) - global_coordinates.discard(coord_name) variables = {k: v.copy(deep=False) for k, v in variables.items()} - # These coordinates are saved according to CF conventions - for var_name, coord_names in variable_coordinates.items(): - attrs = variables[var_name].attrs - if "coordinates" in attrs: + # keep track of variable names written to file under the "coordinates" attributes + written_coords = set() + for name, var in variables.items(): + encoding = var.encoding + attrs = var.attrs + if "coordinates" in attrs and "coordinates" in encoding: raise ValueError( - "cannot serialize coordinates because variable " - "%s already has an attribute 'coordinates'" % var_name + f"'coordinates' found in both attrs and encoding for variable {name!r}." ) - attrs["coordinates"] = " ".join(map(str, coord_names)) + + # this will copy coordinates from encoding to attrs if "coordinates" in attrs + # after the next line, "coordinates" is never in encoding + # we get support for attrs["coordinates"] for free. + coords_str = pop_to(encoding, attrs, "coordinates") + if not coords_str and variable_coordinates[name]: + attrs["coordinates"] = " ".join(map(str, variable_coordinates[name])) + if "coordinates" in attrs: + written_coords.update(attrs["coordinates"].split()) # These coordinates are not associated with any particular variables, so we # save them under a global 'coordinates' attribute so xarray can roundtrip # the dataset faithfully. Because this serialization goes beyond CF # conventions, only do it if necessary. # Reference discussion: - # http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/057771.html + # http://mailman.cgd.ucar.edu/pipermail/cf-metadata/2014/007571.html + global_coordinates.difference_update(written_coords) if global_coordinates: attributes = dict(attributes) if "coordinates" in attributes: - raise ValueError( - "cannot serialize coordinates because the global " - "attribute 'coordinates' already exists" + warnings.warn( + f"cannot serialize global coordinates {global_coordinates!r} because the global " + f"attribute 'coordinates' already exists. This may prevent faithful roundtripping" + f"of xarray datasets", + SerializationWarning, ) - attributes["coordinates"] = " ".join(map(str, global_coordinates)) + else: + attributes["coordinates"] = " ".join(map(str, global_coordinates)) return variables, attributes diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 1e135ebd3e1..a23527bd49a 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -33,6 +33,7 @@ from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding from xarray.backends.pydap_ import PydapDataStore from xarray.coding.variables import SerializationWarning +from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing from xarray.core.options import set_options from xarray.core.pycompat import dask_array_type @@ -522,15 +523,35 @@ def test_roundtrip_coordinates(self): with self.roundtrip(original) as actual: assert_identical(original, actual) + original["foo"].encoding["coordinates"] = "y" + with self.roundtrip(original, open_kwargs={"decode_coords": False}) as expected: + # check roundtripping when decode_coords=False + with self.roundtrip( + expected, open_kwargs={"decode_coords": False} + ) as actual: + assert_identical(expected, actual) + def test_roundtrip_global_coordinates(self): - original = Dataset({"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])}) + original = Dataset( + {"foo": ("x", [0, 1])}, {"x": [2, 3], "y": ("a", [42]), "z": ("x", [4, 5])} + ) with self.roundtrip(original) as actual: assert_identical(original, actual) + # test that global "coordinates" is as expected + _, attrs = encode_dataset_coordinates(original) + assert attrs["coordinates"] == "y" + + # test warning when global "coordinates" is already set + original.attrs["coordinates"] = "foo" + with pytest.warns(SerializationWarning): + _, attrs = encode_dataset_coordinates(original) + assert attrs["coordinates"] == "foo" + def test_roundtrip_coordinates_with_space(self): original = Dataset(coords={"x": 0, "y z": 1}) expected = Dataset({"y z": 1}, {"x": 0}) - with pytest.warns(xr.SerializationWarning): + with pytest.warns(SerializationWarning): with self.roundtrip(original) as actual: assert_identical(expected, actual) @@ -810,6 +831,18 @@ def equals_latlon(obj): assert "coordinates" not in ds["lat"].attrs assert "coordinates" not in ds["lon"].attrs + original["temp"].encoding["coordinates"] = "lat" + with self.roundtrip(original) as actual: + assert_identical(actual, original) + original["precip"].encoding["coordinates"] = "lat" + with create_tmp_file() as tmp_file: + original.to_netcdf(tmp_file) + with open_dataset(tmp_file, decode_coords=True) as ds: + assert "lon" not in ds["temp"].encoding["coordinates"] + assert "lon" not in ds["precip"].encoding["coordinates"] + assert "coordinates" not in ds["lat"].encoding + assert "coordinates" not in ds["lon"].encoding + def test_roundtrip_endian(self): ds = Dataset( { diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 09002e252b4..acb2400ea04 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -136,6 +136,20 @@ def test_multidimensional_coordinates(self): # Should not have any global coordinates. assert "coordinates" not in attrs + def test_do_not_overwrite_user_coordinates(self): + orig = Dataset( + coords={"x": [0, 1, 2], "y": ("x", [5, 6, 7]), "z": ("x", [8, 9, 10])}, + data_vars={"a": ("x", [1, 2, 3]), "b": ("x", [3, 5, 6])}, + ) + orig["a"].encoding["coordinates"] = "y" + orig["b"].encoding["coordinates"] = "z" + enc, _ = conventions.encode_dataset_coordinates(orig) + assert enc["a"].attrs["coordinates"] == "y" + assert enc["b"].attrs["coordinates"] == "z" + orig["a"].attrs["coordinates"] = "foo" + with raises_regex(ValueError, "'coordinates' found in both attrs"): + conventions.encode_dataset_coordinates(orig) + @requires_dask def test_string_object_warning(self): original = Variable(("x",), np.array(["foo", "bar"], dtype=object)).chunk()