From 09292b0d058b21e685aa7d75442349829206838c Mon Sep 17 00:00:00 2001 From: dcherian Date: Wed, 7 Aug 2019 08:29:47 -0600 Subject: [PATCH 01/35] Add compat = 'override' and data_vars/coords='sensible' --- xarray/core/concat.py | 37 ++++++++++++++++++++++++++----------- xarray/tests/test_concat.py | 4 ++-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 9c7c622a31c..aa32be2776e 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -143,6 +143,11 @@ def concat( "`data_vars` and `coords` arguments" ) + if compat not in ["equals", "identical", "override"]: + raise ValueError( + "compat=%r invalid: must be 'equals', 'identical or 'override'" % compat + ) + if isinstance(first_obj, DataArray): f = _dataarray_concat elif isinstance(first_obj, Dataset): @@ -189,7 +194,11 @@ def _calc_concat_over(datasets, dim, data_vars, coords): equals = {} if dim in datasets[0]: + concat_over_existing_dim = True concat_over.add(dim) + else: + concat_over_existing_dim = False + for ds in datasets: concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) @@ -226,6 +235,11 @@ def process_subset_opt(opt, subset): ) elif opt == "minimal": pass + elif opt == "sensible": + if not concat_over_existing_dim: + concat_over.update( + set(getattr(datasets[0], subset)) - set(datasets[0].dims) + ) else: raise ValueError("unexpected value for %s: %s" % (subset, opt)) else: @@ -263,11 +277,6 @@ def _dataset_concat( """ from .dataset import Dataset - if compat not in ["equals", "identical"]: - raise ValueError( - "compat=%r invalid: must be 'equals' " "or 'identical'" % compat - ) - dim, coord = _calc_concat_dim_coord(dim) # Make sure we're working on a copy (we'll be loading variables) datasets = [ds.copy() for ds in datasets] @@ -297,21 +306,24 @@ def insert_result_variable(k, v): # across all datasets for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): - raise ValueError("dataset global attributes not equal") + raise ValueError("Dataset global attributes are not equal.") for k, v in ds.variables.items(): if k not in result_vars and k not in concat_over: - raise ValueError("encountered unexpected variable %r" % k) + raise ValueError("Encountered unexpected variable %r" % k) elif (k in result_coord_names) != (k in ds.coords): raise ValueError( - "%r is a coordinate in some datasets but not " "others" % k + "%r is a coordinate in some datasets but not others." % k ) - elif k in result_vars and k != dim: + elif compat != "override" and k in result_vars and k != dim: # Don't use Variable.identical as it internally invokes # Variable.equals, and we may already know the answer if compat == "identical" and not utils.dict_equiv( v.attrs, result_vars[k].attrs ): - raise ValueError("variable %s not identical across datasets" % k) + raise ValueError( + "Variable '%s' is not identical across datasets. " + "You can skip this check by specifying compat='override'." % k + ) # Proceed with equals() try: @@ -321,7 +333,10 @@ def insert_result_variable(k, v): result_vars[k].load() is_equal = v.equals(result_vars[k]) if not is_equal: - raise ValueError("variable %s not equal across datasets" % k) + raise ValueError( + "Variable '%s' is not equal across datasets. " + "You can skip this check by specifying compat='override'." 
% k + ) # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index b8ab89e926c..5f02470e35a 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -140,13 +140,13 @@ def test_concat_errors(self): with raises_regex(ValueError, "are not coordinates"): concat([data, data], "new_dim", coords=["not_found"]) - with raises_regex(ValueError, "global attributes not"): + with raises_regex(ValueError, "global attributes are not"): data0, data1 = deepcopy(split_data) data1.attrs["foo"] = "bar" concat([data0, data1], "dim1", compat="identical") assert_identical(data, concat([data0, data1], "dim1", compat="equals")) - with raises_regex(ValueError, "encountered unexpected"): + with raises_regex(ValueError, "Encountered unexpected"): data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) concat([data0, data1], "dim1") From 69feaaaf202e988e929aa3efddc3b252d4d4912e Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 9 Aug 2019 16:05:06 -0600 Subject: [PATCH 02/35] concat tests. --- xarray/tests/test_concat.py | 117 ++++++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 5f02470e35a..bd70e99f1ed 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -289,6 +289,123 @@ def test_concat_fill_value(self, fill_value): assert_identical(actual, expected) +class TestConcatDatasetNewApi: + @pytest.fixture(autouse=True) + def setUp(self): + self.ds1 = Dataset( + { + "only_x_y": (("y", "x"), [[1, 2]]), + "only_x": ("x", [1, 2]), + "only_z": ("z", [1, 2]), + "const1": 1.0, + }, + coords={"x": [0, 1], "y": [0], "z": [-1, -2], "coord1": ("x", [0, 1])}, + ) + self.ds2 = Dataset( + { + "only_x_y": (("y", "x"), [[3, 4]]), + "only_x": ("x", [1.1, 2.1]), + "only_z": ("z", [1, 2]), + "const1": 1.0, + }, + coords={ + "x": [0, 1], + "y": [1], + "z": [-1, -2], + "coord1": ("x", [0.0001, 1.0001]), + }, + ) + + self.expected_y = Dataset( + { + "only_x_y": (("y", "x"), [[1, 2], [3, 4]]), + "only_x": ("x", [1, 2]), + "only_z": ("z", [1, 2]), + "const1": 1.0, + }, + coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "coord1": ("x", [0, 1])}, + ) + + self.expected_new_dim = Dataset( + { + "only_x_y": ( + ("new_dim", "y", "x"), + [[[1, 2], [np.nan, np.nan]], [[np.nan, np.nan], [3, 4]]], + ), + "only_x": (("new_dim", "x"), [[1, 2], [1.1, 2.1]]), + "only_z": (("new_dim", "z"), [[1, 2], [1, 2]]), + "const1": ("new_dim", [1.0, 1.0]), + }, + coords={ + "x": [0, 1], + "y": [0, 1], + "z": [-1, -2], + "coord1": (("new_dim", "x"), [[0, 1], [0.0001, 1.0001]]), + }, + ) + + self.dsets = [self.ds1, self.ds2] + + def test_concat_sensible_compat_errors(self): + + with raises_regex(ValueError, "'only_x' is not equal across datasets."): + concat(self.dsets, data_vars="sensible", dim="y") + + with raises_regex(ValueError, "'coord1' is not equal across datasets."): + concat(self.dsets, coords="sensible", dim="y") + + @pytest.mark.parametrize("concat_dim", ["y", "new_dim"]) + def test_sensible(self, concat_dim): + actual = concat( + self.dsets, + data_vars="sensible", + coords="sensible", + compat="override", + dim=concat_dim, + ) + + if concat_dim == "y": + expected = self.expected_y + else: + expected = self.expected_new_dim + + assert_equal(actual, expected) + + @pytest.mark.parametrize( + "data_vars, coords", [("sensible", "all"), ("all", 
"sensible")] + ) + def test_compat_override(self, data_vars, coords): + + actual = concat( + self.dsets, data_vars=data_vars, coords=coords, compat="override", dim="y" + ) + + if data_vars == "all": + expected_y_dim = ["only_x_y", "only_x", "only_z", "const1"] + equal_to_first_ds = [] + elif data_vars == "sensible": + expected_y_dim = ["only_x_y"] + if coords == "all": + # in this case, variable only_x will not have coord1 as non-dim coord + equal_to_first_ds = ["only_z", "const1"] + if coords == "sensible": + equal_to_first_ds = ["only_x", "only_z", "const1"] + + if coords == "all": + expected_y_dim += ["coord1"] + elif coords == "sensible": + equal_to_first_ds += ["coord1"] + + expected_no_y_dim = set(actual.data_vars.keys()) - set(expected_y_dim) + + for var in expected_no_y_dim: + assert "y" not in actual[var].dims + for var in expected_y_dim: + assert "y" in actual[var].dims + for var in equal_to_first_ds: + assert_equal(actual[var], self.dsets[0][var]) + + class TestConcatDataArray: def test_concat(self): ds = Dataset( From 0bf85975657ab3c79358a7457c8674281d4ad291 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 9 Aug 2019 17:09:49 -0600 Subject: [PATCH 03/35] Update docstring. --- xarray/core/concat.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index aa32be2776e..46bfa575e63 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -63,12 +63,14 @@ def concat( those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition to the 'minimal' coordinates. - compat : {'equals', 'identical'}, optional + compat : {'equals', 'identical', 'override'}, optional String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. 'equals' means - that all variable values and dimensions must be the same; - 'identical' means that variable attributes and global attributes - must also be equal. + dataset global attributes for potential conflicts. + * 'equals' means that all variable values and dimensions must be the same; + * 'identical' means that variable attributes and global attributes + must also be equal. + * 'override' means that checks are skipped and values from the first dataset + are used. positions : None or list of integer arrays, optional List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. If not From 4c994e2ee56b04be0020921ca48b758829a8065e Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 20 Aug 2019 18:09:06 -0700 Subject: [PATCH 04/35] Begin merge, combine. 
--- xarray/core/combine.py | 13 +++++++++++-- xarray/core/concat.py | 11 ++++++++--- xarray/core/merge.py | 9 +++++---- xarray/tests/test_concat.py | 28 ++++++++++++++++++++++++++++ xarray/tests/test_merge.py | 2 ++ 5 files changed, 54 insertions(+), 9 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index c24be88b19e..89db3f449d3 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -243,11 +243,12 @@ def _combine_1d( dim=concat_dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) except ValueError as err: - if "encountered unexpected variable" in str(err): + if "Encountered unexpected variable" in str(err): raise ValueError( "These objects cannot be combined using only " "xarray.combine_nested, instead either use " @@ -598,6 +599,7 @@ def combine_by_coords( concat_dims=concat_dims, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -832,6 +834,7 @@ def _old_auto_combine( dim=dim, data_vars=data_vars, coords=coords, + compat=compat, fill_value=fill_value, join=join, ) @@ -850,6 +853,7 @@ def _auto_concat( coords="different", fill_value=dtypes.NA, join="outer", + compat="no_conflicts", ): if len(datasets) == 1 and dim is None: # There is nothing more to combine, so kick out early. @@ -876,5 +880,10 @@ def _auto_concat( ) dim, = concat_dims return concat( - datasets, dim=dim, data_vars=data_vars, coords=coords, fill_value=fill_value + datasets, + dim=dim, + data_vars=data_vars, + coords=coords, + fill_value=fill_value, + compat=compat, ) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 46bfa575e63..3a7828a337b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -70,7 +70,7 @@ def concat( * 'identical' means that variable attributes and global attributes must also be equal. * 'override' means that checks are skipped and values from the first dataset - are used. + are used. This cannot be used with coords='different' or data_vars='different'. positions : None or list of integer arrays, optional List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. If not @@ -145,7 +145,7 @@ def concat( "`data_vars` and `coords` arguments" ) - if compat not in ["equals", "identical", "override"]: + if compat not in ["equals", "identical", "override", "no_conflicts"]: raise ValueError( "compat=%r invalid: must be 'equals', 'identical or 'override'" % compat ) @@ -207,6 +207,11 @@ def _calc_concat_over(datasets, dim, data_vars, coords): def process_subset_opt(opt, subset): if isinstance(opt, str): if opt == "different": + if compat == "override": + raise ValueError( + "Cannot specify both %s='different' and compat='override'." 
+ % subset + ) # all nonindexes that are not the same in each dataset for k in getattr(datasets[0], subset): if k not in concat_over: @@ -395,7 +400,7 @@ def _dataarray_concat( if data_vars != "all": raise ValueError( - "data_vars is not a valid argument when " "concatenating DataArray objects" + "data_vars is not a valid argument when concatenating DataArray objects" ) datasets = [] diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 225507b9204..3c626ead475 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -44,6 +44,7 @@ "broadcast_equals": 2, "minimal": 3, "no_conflicts": 4, + "override": 5, } ) @@ -81,7 +82,7 @@ def unique_variable(name, variables, compat="broadcast_equals"): variables : list of xarray.Variable List of Variable objects, all of which go by the same name in different inputs. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Type of equality check to use. Returns @@ -93,7 +94,7 @@ def unique_variable(name, variables, compat="broadcast_equals"): MergeError: if any of the variables are not equal. """ # noqa out = variables[0] - if len(variables) > 1: + if len(variables) > 1 and compat != "override": combine_method = None if compat == "minimal": @@ -152,7 +153,7 @@ def merge_variables( priority_vars : mapping with Variable or None values, optional If provided, variables are always taken from this dict in preference to the input variable dictionaries, without checking for conflicts. - compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'minimal', 'no_conflicts', 'override'}, optional Type of equality check to use when checking for conflicts. Returns @@ -449,7 +450,7 @@ def merge_core( ---------- objs : list of mappings All values must be convertable to labeled arrays. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Compatibility checks to use when merging variables. join : {'outer', 'inner', 'left', 'right'}, optional How to combine objects with different indexes. 
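For reference, a minimal usage sketch (not part of the diff; the datasets below are made up) of what the ``compat='override'`` short-circuit in ``unique_variable`` enables at the user level:

    import xarray as xr

    # hypothetical toy datasets, for illustration only
    a = xr.Dataset({"var": ("x", [1.0, 2.0])}, coords={"x": [0, 1]})
    b = xr.Dataset({"var": ("x", [9.0, 9.0])}, coords={"x": [0, 1]})

    # The default compat='no_conflicts' raises MergeError because 'var'
    # conflicts; compat='override' skips the comparison and keeps 'var'
    # from the first object.
    merged = xr.merge([a, b], compat="override")
    print(merged["var"].values)  # [1. 2.]
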
diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index bd70e99f1ed..78373577efd 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -18,6 +18,30 @@ from .test_dataset import create_test_data +def test_concat_compat(): + ds1 = Dataset( + { + "has_x_y": (("y", "x"), [[1, 2]]), + "has_x": ("x", [1, 2]), + "no_x_y": ("z", [1, 2]), + }, + coords={"x": [0, 1], "y": [0], "z": [-1, -2]}, + ) + ds2 = Dataset( + { + "has_x_y": (("y", "x"), [[3, 4]]), + "has_x": ("x", [1, 2]), + "no_x_y": (("q", "z"), [[1, 2]]), + }, + coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, + ) + + result = concat([ds1, ds2], dim="y", compat="equals") + + for var in ["has_x", "no_x_y", "const1"]: + assert "y" not in result[var] + + class TestConcatDataset: @pytest.fixture def data(self): @@ -405,6 +429,10 @@ def test_compat_override(self, data_vars, coords): for var in equal_to_first_ds: assert_equal(actual[var], self.dsets[0][var]) + def test_compat_override_different_error(self): + with raises_regex(ValueError, "Cannot specify both .*='different'"): + concat(self.dsets, data_vars="different", compat="override") + class TestConcatDataArray: def test_concat(self): diff --git a/xarray/tests/test_merge.py b/xarray/tests/test_merge.py index ed1453ce95d..c1e6c7a5ce8 100644 --- a/xarray/tests/test_merge.py +++ b/xarray/tests/test_merge.py @@ -196,6 +196,8 @@ def test_merge_compat(self): with raises_regex(ValueError, "compat=.* invalid"): ds1.merge(ds2, compat="foobar") + assert ds1.identical(ds1.merge(ds2, compat="override")) + def test_merge_auto_align(self): ds1 = xr.Dataset({"a": ("x", [1, 2]), "x": [0, 1]}) ds2 = xr.Dataset({"b": ("x", [3, 4]), "x": [1, 2]}) From 4cb7d02c71863d476394b38652ae76b5a3f8ae30 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 29 Aug 2019 10:08:42 -0600 Subject: [PATCH 05/35] Merge non concatenated variables. --- xarray/core/concat.py | 118 ++++++++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 50 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 3a7828a337b..40ae0b12bc9 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -5,6 +5,12 @@ from . import dtypes, utils from .alignment import align +from .merge import ( + determine_coords, + merge_variables, + expand_variable_dicts, + _VALID_COMPAT, +) from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars @@ -65,7 +71,7 @@ def concat( in addition to the 'minimal' coordinates. compat : {'equals', 'identical', 'override'}, optional String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. + dataset global attributes for potential conflicts. This is passed down to merge. * 'equals' means that all variable values and dimensions must be the same; * 'identical' means that variable attributes and global attributes must also be equal. @@ -145,7 +151,7 @@ def concat( "`data_vars` and `coords` arguments" ) - if compat not in ["equals", "identical", "override", "no_conflicts"]: + if compat not in _VALID_COMPAT: raise ValueError( "compat=%r invalid: must be 'equals', 'identical or 'override'" % compat ) @@ -186,22 +192,28 @@ def _calc_concat_dim_coord(dim): return dim, coord -def _calc_concat_over(datasets, dim, data_vars, coords): +def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): """ Determine which dataset variables need to be concatenated in the result, - and which can simply be taken from the first dataset. 
""" # Return values concat_over = set() equals = {} - if dim in datasets[0]: + if dim in dim_names: concat_over_existing_dim = True concat_over.add(dim) else: concat_over_existing_dim = False for ds in datasets: + if concat_over_existing_dim: + if dim not in ds.dims: + # TODO: why did I do this + if dim in ds: + ds = ds.set_coords(dim) + else: + raise ValueError("%r is not a dimension in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) def process_subset_opt(opt, subset): @@ -225,7 +237,7 @@ def process_subset_opt(opt, subset): for ds_rhs in datasets[1:]: v_rhs = ds_rhs.variables[k].compute() computed.append(v_rhs) - if not v_lhs.equals(v_rhs): + if not getattr(v_lhs, compat)(v_rhs): concat_over.add(k) equals[k] = False # computed variables are not to be re-computed @@ -291,68 +303,74 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords) + # determine dimensional coordinate names and a dict mapping name to DataArray + def determine_dims(datasets, result_coord_names): + dims = set() + coords = dict() + for ds in datasets: + for dim in set(ds.dims) - dims: + if dim not in coords: + coords[dim] = ds.coords[dim].variable + dims = dims | set(ds.dims) + return dims, coords + + result_coord_names, noncoord_names = determine_coords(datasets) + both_data_and_coords = result_coord_names & noncoord_names + if both_data_and_coords: + raise ValueError( + "%r is a coordinate in some datasets and not in others." + % list(both_data_and_coords)[0] # preserve previous behaviour + ) + dim_names, result_coords = determine_dims(datasets, result_coord_names) + # we don't want the concat dimension in the result dataset yet + result_coords.pop(dim, None) - def insert_result_variable(k, v): - assert isinstance(v, Variable) - if k in datasets[0].coords: - result_coord_names.add(k) - result_vars[k] = v + # determine which variables to concatentate + concat_over, equals = _calc_concat_over( + datasets, dim, dim_names, data_vars, coords, compat + ) + + # determine which variables to merge + variables_to_merge = (result_coord_names | noncoord_names) - concat_over - dim_names + if variables_to_merge: + to_merge = [] + for ds in datasets: + to_merge.append(ds.reset_coords()[list(variables_to_merge)]) + # TODO: Provide equals as an argument and thread that down to merge.unique_variable + result_vars = merge_variables( + expand_variable_dicts(to_merge), priority_vars=None, compat=compat + ) + else: + result_vars = OrderedDict() + result_vars.update(result_coords) - # create the new dataset and add constant variables - result_vars = OrderedDict() - result_coord_names = set(datasets[0].coords) + # assign attrs and encoding from first dataset result_attrs = datasets[0].attrs result_encoding = datasets[0].encoding - for k, v in datasets[0].variables.items(): - if k not in concat_over: - insert_result_variable(k, v) + def insert_result_variable(k, v): + assert isinstance(v, Variable) + result_vars[k] = v - # check that global attributes and non-concatenated variables are fixed - # across all datasets + # check that global attributes are fixed across all datasets if necessary for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): raise ValueError("Dataset global attributes are not equal.") - for k, v in ds.variables.items(): - if k not in result_vars and k not in concat_over: - raise ValueError("Encountered unexpected 
variable %r" % k) - elif (k in result_coord_names) != (k in ds.coords): - raise ValueError( - "%r is a coordinate in some datasets but not others." % k - ) - elif compat != "override" and k in result_vars and k != dim: - # Don't use Variable.identical as it internally invokes - # Variable.equals, and we may already know the answer - if compat == "identical" and not utils.dict_equiv( - v.attrs, result_vars[k].attrs - ): - raise ValueError( - "Variable '%s' is not identical across datasets. " - "You can skip this check by specifying compat='override'." % k - ) - - # Proceed with equals() - try: - # May be populated when using the "different" method - is_equal = equals[k] - except KeyError: - result_vars[k].load() - is_equal = v.equals(result_vars[k]) - if not is_equal: - raise ValueError( - "Variable '%s' is not equal across datasets. " - "You can skip this check by specifying compat='override'." % k - ) + ############## + # TODO: do this stuff earlier so we loop over datasets only once + ############# # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] + # non_concat_dims = dim_names - concat_over non_concat_dims = {} for ds in datasets: non_concat_dims.update(ds.dims) non_concat_dims.pop(dim, None) + # seems like there should be a helper function for this. We would need to add + # an exclude kwarg to exclude comparing along concat_dim def ensure_common_dims(vars): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the From 9dc340e05cb211296ca771c1cf1156f959f400fc Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 29 Aug 2019 10:08:53 -0600 Subject: [PATCH 06/35] Fix tests. --- xarray/core/concat.py | 17 +++++++++++++++-- xarray/tests/test_concat.py | 27 +++++++++++++++++---------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 40ae0b12bc9..b59cee4491a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -318,13 +318,17 @@ def determine_dims(datasets, result_coord_names): both_data_and_coords = result_coord_names & noncoord_names if both_data_and_coords: raise ValueError( - "%r is a coordinate in some datasets and not in others." - % list(both_data_and_coords)[0] # preserve previous behaviour + "%r is a coordinate in some datasets but not others." 
+ % list(both_data_and_coords)[0] # preserve format of error message ) dim_names, result_coords = determine_dims(datasets, result_coord_names) # we don't want the concat dimension in the result dataset yet result_coords.pop(dim, None) + # case where concat dimension is a coordinate but not a dimension + if dim in result_coord_names and dim not in dim_names: + datasets = [ds.expand_dims(dim) for ds in datasets] + # determine which variables to concatentate concat_over, equals = _calc_concat_over( datasets, dim, dim_names, data_vars, coords, compat @@ -335,6 +339,10 @@ def determine_dims(datasets, result_coord_names): if variables_to_merge: to_merge = [] for ds in datasets: + if variables_to_merge - set(ds.variables): + raise ValueError( + "Encountered unexpected variables %r" % list(variables_to_merge)[0] + ) to_merge.append(ds.reset_coords()[list(variables_to_merge)]) # TODO: Provide equals as an argument and thread that down to merge.unique_variable result_vars = merge_variables( @@ -397,6 +405,11 @@ def ensure_common_dims(vars): result = result.set_coords(result_coord_names) result.encoding = result_encoding + # TODO: avoid this + unlabeled_dims = dim_names - result_coord_names + result = result.drop(unlabeled_dims, errors="ignore") + + # need to pass test when provided dim is a DataArray if coord is not None: # add concat dimension last to ensure that its in the final Dataset result[coord.name] = coord diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 78373577efd..c3b48c4b2ca 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -5,8 +5,7 @@ import pytest from xarray import DataArray, Dataset, Variable, concat -from xarray.core import dtypes - +from xarray.core import dtypes, merge from . import ( InaccessibleArray, assert_array_equal, @@ -36,11 +35,15 @@ def test_concat_compat(): coords={"x": [0, 1], "y": [1], "z": [-1, -2], "q": [0]}, ) - result = concat([ds1, ds2], dim="y", compat="equals") + result = concat([ds1, ds2], dim="y", data_vars="minimal", compat="broadcast_equals") + assert_equal(ds2.no_x_y, result.no_x_y.transpose()) - for var in ["has_x", "no_x_y", "const1"]: + for var in ["has_x", "no_x_y"]: assert "y" not in result[var] + with raises_regex(ValueError, "'q' is not a dimension in all datasets"): + concat([ds1, ds2], dim="q", data_vars="all", compat="broadcast_equals") + class TestConcatDataset: @pytest.fixture @@ -116,7 +119,7 @@ def test_concat_coords(self): actual = concat(objs, dim="x", coords=coords) assert_identical(expected, actual) for coords in ["minimal", []]: - with raises_regex(ValueError, "not equal across"): + with raises_regex(merge.MergeError, "conflicting values"): concat(objs, dim="x", coords=coords) def test_concat_constant_index(self): @@ -127,8 +130,10 @@ def test_concat_constant_index(self): for mode in ["different", "all", ["foo"]]: actual = concat([ds1, ds2], "y", data_vars=mode) assert_identical(expected, actual) - with raises_regex(ValueError, "not equal across datasets"): - concat([ds1, ds2], "y", data_vars="minimal") + with raises_regex(merge.MergeError, "conflicting values"): + # previously dim="y", and raised error which makes no sense. + # "foo" has dimension "y" so minimal should concatenate it? 
+ concat([ds1, ds2], "new_dim", data_vars="minimal") def test_concat_size0(self): data = create_test_data() @@ -372,10 +377,10 @@ def setUp(self): def test_concat_sensible_compat_errors(self): - with raises_regex(ValueError, "'only_x' is not equal across datasets."): + with raises_regex(merge.MergeError, "conflicting values"): concat(self.dsets, data_vars="sensible", dim="y") - with raises_regex(ValueError, "'coord1' is not equal across datasets."): + with raises_regex(merge.MergeError, "conflicting values"): concat(self.dsets, coords="sensible", dim="y") @pytest.mark.parametrize("concat_dim", ["y", "new_dim"]) @@ -431,7 +436,9 @@ def test_compat_override(self, data_vars, coords): def test_compat_override_different_error(self): with raises_regex(ValueError, "Cannot specify both .*='different'"): - concat(self.dsets, data_vars="different", compat="override") + concat( + self.dsets, dim="concat_dim", data_vars="different", compat="override" + ) class TestConcatDataArray: From dd24b00062d767bb8af38ee4ccb1a977c900fd18 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 29 Aug 2019 13:02:31 -0600 Subject: [PATCH 07/35] Fix tests 2 --- xarray/core/concat.py | 2 +- xarray/tests/test_combine.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index b59cee4491a..6b1735ee189 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -213,7 +213,7 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): if dim in ds: ds = ds.set_coords(dim) else: - raise ValueError("%r is not a dimension in all datasets" % dim) + raise ValueError("%r is not present in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) def process_subset_opt(opt, subset): diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index f786a851e62..ba1e393a89e 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -567,8 +567,8 @@ def test_combine_concat_over_redundant_nesting(self): def test_combine_nested_but_need_auto_combine(self): objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2], "wall": [0]})] - with raises_regex(ValueError, "cannot be combined"): - combine_nested(objs, concat_dim="x") + # with raises_regex(ValueError, "cannot be combined"): + combine_nested(objs, concat_dim="x") @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_combine_nested_fill_value(self, fill_value): @@ -761,7 +761,7 @@ def test_auto_combine(self): auto_combine(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})] - with pytest.raises(KeyError): + with raises_regex(ValueError, "'y' is not present in all datasets"): auto_combine(objs) def test_auto_combine_previously_failed(self): From 09746d971ca82313352a5b5d85f9e1ac99493420 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 29 Aug 2019 13:42:14 -0600 Subject: [PATCH 08/35] Fix test 3 --- xarray/core/concat.py | 8 +++++- xarray/core/merge.py | 53 ++++++++++++++++++++++--------------- xarray/tests/test_concat.py | 2 +- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 6b1735ee189..67ed9352e19 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -344,9 +344,15 @@ def determine_dims(datasets, result_coord_names): "Encountered unexpected variables %r" % list(variables_to_merge)[0] ) to_merge.append(ds.reset_coords()[list(variables_to_merge)]) + + merge_equals = {k: equals.get(k, None) for k in variables_to_merge} + # TODO: 
Provide equals as an argument and thread that down to merge.unique_variable result_vars = merge_variables( - expand_variable_dicts(to_merge), priority_vars=None, compat=compat + expand_variable_dicts(to_merge), + priority_vars=None, + compat=compat, + equals=merge_equals, ) else: result_vars = OrderedDict() diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 3c626ead475..c06bed67ec4 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -71,8 +71,8 @@ class MergeError(ValueError): # TODO: move this to an xarray.exceptions module? -def unique_variable(name, variables, compat="broadcast_equals"): - # type: (Any, List[Variable], str) -> Variable +def unique_variable(name, variables, compat="broadcast_equals", equals=None): + # type: (Any, List[Variable], str, bool) -> Variable """Return the unique variable from a list of variables or raise MergeError. Parameters @@ -84,6 +84,7 @@ def unique_variable(name, variables, compat="broadcast_equals"): inputs. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Type of equality check to use. + equals : mapping variable name to None or bool, corresponding to result of compat test Returns ------- @@ -94,30 +95,35 @@ def unique_variable(name, variables, compat="broadcast_equals"): MergeError: if any of the variables are not equal. """ # noqa out = variables[0] - if len(variables) > 1 and compat != "override": - combine_method = None - if compat == "minimal": - compat = "broadcast_equals" + if len(variables) == 1 or compat == "override": + return out - if compat == "broadcast_equals": - dim_lengths = broadcast_dimension_size(variables) - out = out.set_dims(dim_lengths) + combine_method = None - if compat == "no_conflicts": - combine_method = "fillna" + if compat == "minimal": + compat = "broadcast_equals" + if compat == "broadcast_equals": + dim_lengths = broadcast_dimension_size(variables) + out = out.set_dims(dim_lengths) + + if compat == "no_conflicts": + combine_method = "fillna" + + if equals is None: + equals = all([getattr(out, compat)(var) for var in variables[1:]]) + + if not equals: + raise MergeError( + "conflicting values for variable %r on " "objects to be combined" % (name) + ) + + if combine_method: for var in variables[1:]: - if not getattr(out, compat)(var): - raise MergeError( - "conflicting values for variable %r on " - "objects to be combined:\n" - "first value: %r\nsecond value: %r" % (name, out, var) - ) - if combine_method: - # TODO: add preservation of attrs into fillna - out = getattr(out, combine_method)(var) - out.attrs = var.attrs + # TODO: add preservation of attrs into fillna + out = getattr(out, combine_method)(var) + out.attrs = var.attrs return out @@ -143,6 +149,7 @@ def merge_variables( list_of_variables_dicts: List[Mapping[Any, Variable]], priority_vars: Mapping[Any, Variable] = None, compat: str = "minimal", + equals: Mapping[Any, bool] = {}, ) -> "OrderedDict[Any, Variable]": """Merge dicts of variables, while resolving conflicts appropriately. 
@@ -191,7 +198,9 @@ def merge_variables( merged[name] = unique_variable(name, dim_variables, dim_compat) else: try: - merged[name] = unique_variable(name, var_list, compat) + merged[name] = unique_variable( + name, var_list, compat, equals.get(name, None) + ) except MergeError: if compat != "minimal": # we need more than "minimal" compatibility (for which diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index c3b48c4b2ca..c5ea901a7de 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -41,7 +41,7 @@ def test_concat_compat(): for var in ["has_x", "no_x_y"]: assert "y" not in result[var] - with raises_regex(ValueError, "'q' is not a dimension in all datasets"): + with raises_regex(ValueError, "'q' is not present in all datasets"): concat([ds1, ds2], dim="q", data_vars="all", compat="broadcast_equals") From b090e820d125f733ae0a9b216bb6e6a4b451febb Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:03:07 -0600 Subject: [PATCH 09/35] Cleanup: reduce number of times we loop over datasets. --- xarray/core/concat.py | 73 ++++++++++++++++--------------------- xarray/tests/test_concat.py | 2 +- 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 67ed9352e19..cfd1af1e4dc 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -281,6 +281,22 @@ def process_subset_opt(opt, subset): return concat_over, equals +# determine dimensional coordinate names and a dict mapping name to DataArray +def _determine_dims(datasets, concat_dim): + dims = set() + coords = dict() # maps dim name to variable + concat_dim_lengths = [] # length of concat dimension in each dataset + dims_sizes = {} # shared dimension sizes to expand variables + for ds in datasets: + concat_dim_lengths.append(ds.dims.get(concat_dim, 1)) + dims_sizes.update(ds.dims) + for dim in set(ds.dims) - dims: + if dim not in coords: + coords[dim] = ds.coords[dim].variable + dims = dims | set(ds.dims) + return dims, coords, concat_dim_lengths, dims_sizes + + def _dataset_concat( datasets, dim, @@ -303,17 +319,6 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - # determine dimensional coordinate names and a dict mapping name to DataArray - def determine_dims(datasets, result_coord_names): - dims = set() - coords = dict() - for ds in datasets: - for dim in set(ds.dims) - dims: - if dim not in coords: - coords[dim] = ds.coords[dim].variable - dims = dims | set(ds.dims) - return dims, coords - result_coord_names, noncoord_names = determine_coords(datasets) both_data_and_coords = result_coord_names & noncoord_names if both_data_and_coords: @@ -321,9 +326,14 @@ def determine_dims(datasets, result_coord_names): "%r is a coordinate in some datasets but not others." 
% list(both_data_and_coords)[0] # preserve format of error message ) - dim_names, result_coords = determine_dims(datasets, result_coord_names) + dim_names, result_coords, concat_dim_lengths, dims_sizes = _determine_dims( + datasets, dim + ) + unlabeled_dims = dim_names - result_coord_names + # we don't want the concat dimension in the result dataset yet result_coords.pop(dim, None) + dims_sizes.pop(dim, None) # case where concat dimension is a coordinate but not a dimension if dim in result_coord_names and dim not in dim_names: @@ -334,20 +344,19 @@ def determine_dims(datasets, result_coord_names): datasets, dim, dim_names, data_vars, coords, compat ) - # determine which variables to merge + # determine which variables to merge, and then merge them according to compat variables_to_merge = (result_coord_names | noncoord_names) - concat_over - dim_names if variables_to_merge: to_merge = [] for ds in datasets: if variables_to_merge - set(ds.variables): raise ValueError( - "Encountered unexpected variables %r" % list(variables_to_merge)[0] + "encountered unexpected variables %r" % list(variables_to_merge)[0] ) to_merge.append(ds.reset_coords()[list(variables_to_merge)]) merge_equals = {k: equals.get(k, None) for k in variables_to_merge} - # TODO: Provide equals as an argument and thread that down to merge.unique_variable result_vars = merge_variables( expand_variable_dicts(to_merge), priority_vars=None, @@ -362,27 +371,13 @@ def determine_dims(datasets, result_coord_names): result_attrs = datasets[0].attrs result_encoding = datasets[0].encoding - def insert_result_variable(k, v): - assert isinstance(v, Variable) - result_vars[k] = v - # check that global attributes are fixed across all datasets if necessary for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): raise ValueError("Dataset global attributes are not equal.") - ############## - # TODO: do this stuff earlier so we loop over datasets only once - ############# # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - dim_lengths = [ds.dims.get(dim, 1) for ds in datasets] - # non_concat_dims = dim_names - concat_over - non_concat_dims = {} - for ds in datasets: - non_concat_dims.update(ds.dims) - non_concat_dims.pop(dim, None) - # seems like there should be a helper function for this. 
We would need to add # an exclude kwarg to exclude comparing along concat_dim def ensure_common_dims(vars): @@ -392,30 +387,26 @@ def ensure_common_dims(vars): common_dims = tuple(pd.unique([d for v in vars for d in v.dims])) if dim not in common_dims: common_dims = (dim,) + common_dims - for var, dim_len in zip(vars, dim_lengths): + for var, dim_len in zip(vars, concat_dim_lengths): if var.dims != common_dims: - common_shape = tuple( - non_concat_dims.get(d, dim_len) for d in common_dims - ) + common_shape = tuple(dims_sizes.get(d, dim_len) for d in common_dims) var = var.set_dims(common_dims, common_shape) yield var # stack up each variable to fill-out the dataset (in order) - for k in datasets[0].variables: - if k in concat_over: - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - combined = concat_vars(vars, dim, positions) - insert_result_variable(k, combined) + for k in concat_over & set(datasets[0].variables): + vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + combined = concat_vars(vars, dim, positions) + assert isinstance(combined, Variable) + result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) result = result.set_coords(result_coord_names) result.encoding = result_encoding - # TODO: avoid this - unlabeled_dims = dim_names - result_coord_names + # TODO: avoid this? result = result.drop(unlabeled_dims, errors="ignore") - # need to pass test when provided dim is a DataArray if coord is not None: # add concat dimension last to ensure that its in the final Dataset result[coord.name] = coord diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index c5ea901a7de..9ab8797d23d 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -175,7 +175,7 @@ def test_concat_errors(self): concat([data0, data1], "dim1", compat="identical") assert_identical(data, concat([data0, data1], "dim1", compat="equals")) - with raises_regex(ValueError, "Encountered unexpected"): + with raises_regex(ValueError, "encountered unexpected"): data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) concat([data0, data1], "dim1") From 4ba681bbf43072b129be124dbe671db79b84417e Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:36:50 -0600 Subject: [PATCH 10/35] unique_variable does minimum number of loads: fixes dask test --- xarray/core/merge.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index c06bed67ec4..82c63c5e9e2 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -112,7 +112,12 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): combine_method = "fillna" if equals is None: - equals = all([getattr(out, compat)(var) for var in variables[1:]]) + out = out.compute() + equals_list = [] + for var in variables[1:]: + equals_list.append(getattr(out, compat)(var.compute())) + + equals = all(equals_list) if not equals: raise MergeError( From 75bd59ba33d610d6e222e2527a5253661cc580bc Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:44:55 -0600 Subject: [PATCH 11/35] docstrings for compat='override' --- xarray/backends/api.py | 3 ++- xarray/core/combine.py | 9 ++++++--- xarray/core/concat.py | 21 +++++++++++++-------- xarray/core/merge.py | 3 ++- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9ad1db1829b..e63b4e7b759 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -759,7 
+759,7 @@ def open_mfdataset( `xarray.auto_combine` is used, but in the future this behavior will switch to use `xarray.combine_by_coords` by default. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts when merging: * 'broadcast_equals': all values must be equal when variables are @@ -770,6 +770,7 @@ def open_mfdataset( * 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + * 'override': skip comparing and pick variable from first dataset preprocess : callable, optional If provided, call this function on each dataset prior to concatenation. You can find the file-name from which each dataset was loaded in diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 89db3f449d3..167b5a93ebc 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -352,7 +352,7 @@ def combine_nested( Must be the same length as the depth of the list passed to ``datasets``. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential merge conflicts: @@ -364,6 +364,7 @@ def combine_nested( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -505,7 +506,7 @@ def combine_by_coords( datasets : sequence of xarray.Dataset Dataset objects to combine. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -517,6 +518,7 @@ def combine_by_coords( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' or list of str}, optional @@ -669,7 +671,7 @@ def auto_combine( component files. Set ``concat_dim=None`` explicitly to disable concatenation. compat : {'identical', 'equals', 'broadcast_equals', - 'no_conflicts'}, optional + 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: - 'broadcast_equals': all values must be equal when variables are @@ -680,6 +682,7 @@ def auto_combine( - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. 
+ - 'override': skip comparing and pick variable from first dataset data_vars : {'minimal', 'different', 'all' or list of str}, optional Details are in the documentation of concat coords : {'minimal', 'different', 'all' o list of str}, optional diff --git a/xarray/core/concat.py b/xarray/core/concat.py index cfd1af1e4dc..05091a9a627 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -69,14 +69,19 @@ def concat( those corresponding to other dimensions. * list of str: The listed coordinate variables will be concatenated, in addition to the 'minimal' coordinates. - compat : {'equals', 'identical', 'override'}, optional - String indicating how to compare non-concatenated variables and - dataset global attributes for potential conflicts. This is passed down to merge. - * 'equals' means that all variable values and dimensions must be the same; - * 'identical' means that variable attributes and global attributes - must also be equal. - * 'override' means that checks are skipped and values from the first dataset - are used. This cannot be used with coords='different' or data_vars='different'. + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional + String indicating how to compare non-concatenated variables of the same name for + potential conflicts. This is passed down to merge. + + - 'broadcast_equals': all values must be equal when variables are + broadcast against each other to ensure common dimensions. + - 'equals': all values and dimensions must be the same. + - 'identical': all values, dimensions and attributes must be the + same. + - 'no_conflicts': only values which are not null in both datasets + must be equal. The returned dataset then contains the combination + of all non-null values. + - 'override': skip comparing and pick variable from first dataset positions : None or list of integer arrays, optional List of integer arrays which specifies the integer positions to which to assign each dataset along the concatenated dimension. If not diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 82c63c5e9e2..54e772b6bf5 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -534,7 +534,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): objects : Iterable[Union[xarray.Dataset, xarray.DataArray, dict]] Merge together all variables from these objects. If any of them are DataArray objects, they must have a name. - compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional + compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional String indicating how to compare variables of the same name for potential conflicts: @@ -546,6 +546,7 @@ def merge(objects, compat="no_conflicts", join="outer", fill_value=dtypes.NA): - 'no_conflicts': only values which are not null in both datasets must be equal. The returned dataset then contains the combination of all non-null values. + - 'override': skip comparing and pick variable from first dataset join : {'outer', 'inner', 'left', 'right', 'exact'}, optional String indicating how to combine differing indexes in objects. From 79ee32d413c12966c72baab558d4433326463024 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:48:58 -0600 Subject: [PATCH 12/35] concat compat docstring. 
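A hedged usage sketch of the ``compat='override'`` option documented above (the datasets and variable names are hypothetical; note that 'override' cannot be combined with ``data_vars='different'`` or ``coords='different'``):

    import numpy as np
    import xarray as xr

    # hypothetical toy datasets, for illustration only
    ds1 = xr.Dataset(
        {"temp": (("y", "x"), [[1.0, 2.0]]), "mask": ("x", [0, 1])},
        coords={"x": [10, 20], "y": [0]},
    )
    ds2 = xr.Dataset(
        {"temp": (("y", "x"), [[3.0, 4.0]]), "mask": ("x", [9, 9])},
        coords={"x": [10, 20], "y": [1]},
    )

    # 'mask' has no 'y' dimension and differs between the datasets: with
    # compat='equals' this raises, while compat='override' keeps ds1's values.
    out = xr.concat(
        [ds1, ds2], dim="y", data_vars="minimal", coords="minimal",
        compat="override",
    )
    assert np.array_equal(out["mask"].values, ds1["mask"].values)

    # combine_nested / combine_by_coords / open_mfdataset forward compat the
    # same way, so the equivalent nested combine also skips the check.
    out2 = xr.combine_nested(
        [ds1, ds2], concat_dim="y", data_vars="minimal", coords="minimal",
        compat="override",
    )
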
--- xarray/core/concat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 05091a9a627..253ed210358 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -158,7 +158,8 @@ def concat( if compat not in _VALID_COMPAT: raise ValueError( - "compat=%r invalid: must be 'equals', 'identical or 'override'" % compat + "compat=%r invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'" + % compat ) if isinstance(first_obj, DataArray): From e7b152cf11b52246e1d5d36afd9c8104e9e90157 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:53:41 -0600 Subject: [PATCH 13/35] remove the sensible option. --- xarray/core/concat.py | 6 -- xarray/tests/test_concat.py | 131 +++--------------------------------- 2 files changed, 8 insertions(+), 129 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 253ed210358..a5ec7c5185a 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -215,7 +215,6 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): for ds in datasets: if concat_over_existing_dim: if dim not in ds.dims: - # TODO: why did I do this if dim in ds: ds = ds.set_coords(dim) else: @@ -260,11 +259,6 @@ def process_subset_opt(opt, subset): ) elif opt == "minimal": pass - elif opt == "sensible": - if not concat_over_existing_dim: - concat_over.update( - set(getattr(datasets[0], subset)) - set(datasets[0].dims) - ) else: raise ValueError("unexpected value for %s: %s" % (subset, opt)) else: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 9ab8797d23d..6142762063c 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -163,6 +163,14 @@ def test_concat_errors(self): data = create_test_data() split_data = [data.isel(dim1=slice(3)), data.isel(dim1=slice(3, None))] + with raises_regex(ValueError, "must supply at least one"): + concat([], "dim1") + + with raises_regex(ValueError, "Cannot specify both .*='different'"): + concat( + [data, data], dim="concat_dim", data_vars="different", compat="override" + ) + with raises_regex(ValueError, "must supply at least one"): concat([], "dim1") @@ -318,129 +326,6 @@ def test_concat_fill_value(self, fill_value): assert_identical(actual, expected) -class TestConcatDatasetNewApi: - @pytest.fixture(autouse=True) - def setUp(self): - self.ds1 = Dataset( - { - "only_x_y": (("y", "x"), [[1, 2]]), - "only_x": ("x", [1, 2]), - "only_z": ("z", [1, 2]), - "const1": 1.0, - }, - coords={"x": [0, 1], "y": [0], "z": [-1, -2], "coord1": ("x", [0, 1])}, - ) - self.ds2 = Dataset( - { - "only_x_y": (("y", "x"), [[3, 4]]), - "only_x": ("x", [1.1, 2.1]), - "only_z": ("z", [1, 2]), - "const1": 1.0, - }, - coords={ - "x": [0, 1], - "y": [1], - "z": [-1, -2], - "coord1": ("x", [0.0001, 1.0001]), - }, - ) - - self.expected_y = Dataset( - { - "only_x_y": (("y", "x"), [[1, 2], [3, 4]]), - "only_x": ("x", [1, 2]), - "only_z": ("z", [1, 2]), - "const1": 1.0, - }, - coords={"x": [0, 1], "y": [0, 1], "z": [-1, -2], "coord1": ("x", [0, 1])}, - ) - - self.expected_new_dim = Dataset( - { - "only_x_y": ( - ("new_dim", "y", "x"), - [[[1, 2], [np.nan, np.nan]], [[np.nan, np.nan], [3, 4]]], - ), - "only_x": (("new_dim", "x"), [[1, 2], [1.1, 2.1]]), - "only_z": (("new_dim", "z"), [[1, 2], [1, 2]]), - "const1": ("new_dim", [1.0, 1.0]), - }, - coords={ - "x": [0, 1], - "y": [0, 1], - "z": [-1, -2], - "coord1": (("new_dim", "x"), [[0, 1], [0.0001, 1.0001]]), - }, - ) - - self.dsets = 
[self.ds1, self.ds2] - - def test_concat_sensible_compat_errors(self): - - with raises_regex(merge.MergeError, "conflicting values"): - concat(self.dsets, data_vars="sensible", dim="y") - - with raises_regex(merge.MergeError, "conflicting values"): - concat(self.dsets, coords="sensible", dim="y") - - @pytest.mark.parametrize("concat_dim", ["y", "new_dim"]) - def test_sensible(self, concat_dim): - actual = concat( - self.dsets, - data_vars="sensible", - coords="sensible", - compat="override", - dim=concat_dim, - ) - - if concat_dim == "y": - expected = self.expected_y - else: - expected = self.expected_new_dim - - assert_equal(actual, expected) - - @pytest.mark.parametrize( - "data_vars, coords", [("sensible", "all"), ("all", "sensible")] - ) - def test_compat_override(self, data_vars, coords): - - actual = concat( - self.dsets, data_vars=data_vars, coords=coords, compat="override", dim="y" - ) - - if data_vars == "all": - expected_y_dim = ["only_x_y", "only_x", "only_z", "const1"] - equal_to_first_ds = [] - elif data_vars == "sensible": - expected_y_dim = ["only_x_y"] - if coords == "all": - # in this case, variable only_x will not have coord1 as non-dim coord - equal_to_first_ds = ["only_z", "const1"] - if coords == "sensible": - equal_to_first_ds = ["only_x", "only_z", "const1"] - - if coords == "all": - expected_y_dim += ["coord1"] - elif coords == "sensible": - equal_to_first_ds += ["coord1"] - - expected_no_y_dim = set(actual.data_vars.keys()) - set(expected_y_dim) - - for var in expected_no_y_dim: - assert "y" not in actual[var].dims - for var in expected_y_dim: - assert "y" in actual[var].dims - for var in equal_to_first_ds: - assert_equal(actual[var], self.dsets[0][var]) - - def test_compat_override_different_error(self): - with raises_regex(ValueError, "Cannot specify both .*='different'"): - concat( - self.dsets, dim="concat_dim", data_vars="different", compat="override" - ) - - class TestConcatDataArray: def test_concat(self): ds = Dataset( From c114143d9b2fbfc0bd3731ee134a035c628aa22e Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 11:55:15 -0600 Subject: [PATCH 14/35] reduce silly changes. 
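As a small illustration of the validation settled on in the "concat compat docstring" commit above (hypothetical dataset, shown only to make the accepted values explicit):

    import xarray as xr

    ds = xr.Dataset({"a": ("x", [1, 2])})

    # Any value outside the accepted set raises immediately, e.g.
    # "compat='foo' invalid: must be 'broadcast_equals', 'equals', 'identical',
    # 'no_conflicts' or 'override'"
    try:
        xr.concat([ds, ds], dim="x", compat="foo")
    except ValueError as err:
        print(err)
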
--- xarray/core/combine.py | 2 +- xarray/core/concat.py | 2 +- xarray/tests/test_concat.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/core/combine.py b/xarray/core/combine.py index 167b5a93ebc..e35bb51e030 100644 --- a/xarray/core/combine.py +++ b/xarray/core/combine.py @@ -248,7 +248,7 @@ def _combine_1d( join=join, ) except ValueError as err: - if "Encountered unexpected variable" in str(err): + if "encountered unexpected variable" in str(err): raise ValueError( "These objects cannot be combined using only " "xarray.combine_nested, instead either use " diff --git a/xarray/core/concat.py b/xarray/core/concat.py index a5ec7c5185a..de0dfcc5a5d 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -374,7 +374,7 @@ def _dataset_concat( # check that global attributes are fixed across all datasets if necessary for ds in datasets[1:]: if compat == "identical" and not utils.dict_equiv(ds.attrs, result_attrs): - raise ValueError("Dataset global attributes are not equal.") + raise ValueError("Dataset global attributes not equal.") # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 6142762063c..d624c5c1363 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -177,7 +177,7 @@ def test_concat_errors(self): with raises_regex(ValueError, "are not coordinates"): concat([data, data], "new_dim", coords=["not_found"]) - with raises_regex(ValueError, "global attributes are not"): + with raises_regex(ValueError, "global attributes not"): data0, data1 = deepcopy(split_data) data1.attrs["foo"] = "bar" concat([data0, data1], "dim1", compat="identical") From 9f849d255e4f8bee8ab979d301b40b3be49aa83b Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 12:39:18 -0600 Subject: [PATCH 15/35] fix groupby order test. --- xarray/core/concat.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index de0dfcc5a5d..fa8c53ff2c8 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -394,11 +394,13 @@ def ensure_common_dims(vars): yield var # stack up each variable to fill-out the dataset (in order) - for k in concat_over & set(datasets[0].variables): - vars = ensure_common_dims([ds.variables[k] for ds in datasets]) - combined = concat_vars(vars, dim, positions) - assert isinstance(combined, Variable) - result_vars[k] = combined + # n.b. this loop preserves variable order, needed for groupby. + for k in datasets[0].variables: + if k in concat_over: + vars = ensure_common_dims([ds.variables[k] for ds in datasets]) + combined = concat_vars(vars, dim, positions) + assert isinstance(combined, Variable) + result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) result = result.set_coords(result_coord_names) From 40707ca17040a8ecbea5b3794d75f5727c201217 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 30 Aug 2019 12:40:15 -0600 Subject: [PATCH 16/35] cleanup: var names + remove one loop through datasets. --- xarray/core/concat.py | 46 +++++++++++++++++++++++-------------------- xarray/core/merge.py | 2 +- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index fa8c53ff2c8..28e79379190 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -5,12 +5,7 @@ from . 
import dtypes, utils from .alignment import align -from .merge import ( - determine_coords, - merge_variables, - expand_variable_dicts, - _VALID_COMPAT, -) +from .merge import merge_variables, expand_variable_dicts, _VALID_COMPAT from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars @@ -282,19 +277,28 @@ def process_subset_opt(opt, subset): # determine dimensional coordinate names and a dict mapping name to DataArray -def _determine_dims(datasets, concat_dim): +def _parse_datasets(datasets, concat_dim): + dims = set() - coords = dict() # maps dim name to variable + all_coord_names = set() + data_vars = set() # list of data_vars + dim_coords = dict() # maps dim name to variable concat_dim_lengths = [] # length of concat dimension in each dataset dims_sizes = {} # shared dimension sizes to expand variables + for ds in datasets: concat_dim_lengths.append(ds.dims.get(concat_dim, 1)) dims_sizes.update(ds.dims) + + all_coord_names.update(ds.coords) + data_vars.update(ds.data_vars) + for dim in set(ds.dims) - dims: - if dim not in coords: - coords[dim] = ds.coords[dim].variable + if dim not in dim_coords: + dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dims, coords, concat_dim_lengths, dims_sizes + + return dim_coords, concat_dim_lengths, dims_sizes, all_coord_names, data_vars def _dataset_concat( @@ -319,20 +323,20 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - result_coord_names, noncoord_names = determine_coords(datasets) - both_data_and_coords = result_coord_names & noncoord_names + result_dim_coords, concat_dim_lengths, dims_sizes, result_coord_names, data_names = _parse_datasets( + datasets, dim + ) + dim_names = set(result_dim_coords) + unlabeled_dims = dim_names - result_coord_names + + both_data_and_coords = result_coord_names & data_names if both_data_and_coords: raise ValueError( "%r is a coordinate in some datasets but not others." 
% list(both_data_and_coords)[0] # preserve format of error message ) - dim_names, result_coords, concat_dim_lengths, dims_sizes = _determine_dims( - datasets, dim - ) - unlabeled_dims = dim_names - result_coord_names - # we don't want the concat dimension in the result dataset yet - result_coords.pop(dim, None) + result_dim_coords.pop(dim, None) dims_sizes.pop(dim, None) # case where concat dimension is a coordinate but not a dimension @@ -345,7 +349,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (result_coord_names | noncoord_names) - concat_over - dim_names + variables_to_merge = (result_coord_names | data_names) - concat_over - dim_names if variables_to_merge: to_merge = [] for ds in datasets: @@ -365,7 +369,7 @@ def _dataset_concat( ) else: result_vars = OrderedDict() - result_vars.update(result_coords) + result_vars.update(result_dim_coords) # assign attrs and encoding from first dataset result_attrs = datasets[0].attrs diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 54e772b6bf5..a62eed92b61 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -121,7 +121,7 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): if not equals: raise MergeError( - "conflicting values for variable %r on " "objects to be combined" % (name) + "conflicting values for variable %r on objects to be combined" % (name) ) if combine_method: From 633efe9ff69436768664d7b326c93954c60e3483 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Sep 2019 16:11:48 -0600 Subject: [PATCH 17/35] fix warning. --- xarray/core/dataarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 4858e4d0e91..39acd4be3b4 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1503,8 +1503,8 @@ def set_index( obj : DataArray Another DataArray, with this data but replaced coordinates. - Example - ------- + Examples + -------- >>> arr = xr.DataArray(data=np.ones((2, 3)), ... dims=['x', 'y'], ... coords={'x': From 6a6daabb6ffda8f51222dc0363edcb48a52d7ffa Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Sep 2019 12:41:20 -0600 Subject: [PATCH 18/35] Add whats-new entry. --- doc/whats-new.rst | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 22971bb9955..1ac7d7457e8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -65,18 +65,37 @@ New functions/methods By `Deepak Cherian `_ and `David Mertz `_. -- Dataset plotting API for visualizing dependencies between two `DataArray`s! +- Dataset plotting API for visualizing dependencies between two DataArrays! Currently only :py:meth:`Dataset.plot.scatter` is implemented. By `Yohai Bar Sinai `_ and `Deepak Cherian `_ Enhancements ~~~~~~~~~~~~ -- Added ``join='override'``. This only checks that index sizes are equal among objects and skips - checking indexes for equality. By `Deepak Cherian `_. +- Multiple enhancements to :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset`. -- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. - It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian `_. + - Added ``compat='override'``. When merging, this option picks the variable from the first dataset + and skips all comparisons. + + - Added ``join='override'``. 
When aligning, this only checks that index sizes are equal among objects + and skips checking indexes for equality. + + - :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg. + It is passed down to :py:func:`~xarray.align`. + + - :py:func:`~xarray.concat` now calls :py:func:`~xarray.merge` on variables that are not concatenated + (i.e. variables without ``concat_dim`` when ``data_vars`` or ``coords`` are ``"minimal"``). + :py:func:`~xarray.concat` passes its new ``compat`` kwarg down to :py:func:`~xarray.merge`. + (:issue:`2064`) + + Users can avoid a common bottleneck when using :py:func:`~xarray.open_mfdataset` on a large number of + files with variables that are known to be aligned and some of which need not be concatenated. + Slow equality comparisons can now be avoided, for e.g.:: + + data = xr.open_mfdataset(files, concat_dim='time', data_vars='minimal', + coords='minimal', compat='override', join='override') + + By `Deepak Cherian `_: - In :py:meth:`~xarray.Dataset.to_zarr`, passing ``mode`` is not mandatory if ``append_dim`` is set, as it will automatically be set to ``'a'`` internally. From eb154f50abb940221759b1716103e7810d1a904d Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Sep 2019 12:45:20 -0600 Subject: [PATCH 19/35] Add note in io.rst --- doc/io.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/io.rst b/doc/io.rst index f7ac8c095b9..66ba8ed406b 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -845,6 +845,10 @@ functions see :ref:`combining data`. It is the recommended way to open multiple files with xarray. For more details, see :ref:`combining.multi`, :ref:`dask.io` and a `blog post`_ by Stephan Hoyer. + :py:func:`~xarray.open_mfdataset` takes many kwargs that allow you to + control its behaviour (for e.g. ``compat``, ``join``, ``concat_dim``. + See the docstring for more details. + .. _dask: http://dask.pydata.org .. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ From 2a1ff5da8b82d8dcfdb6322abd1ed7e2b39ae0aa Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Sep 2019 16:11:32 -0600 Subject: [PATCH 20/35] Update netcdf multi-file dataset section in io.rst. --- doc/io.rst | 231 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 130 insertions(+), 101 deletions(-) diff --git a/doc/io.rst b/doc/io.rst index 66ba8ed406b..788dd500bb3 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -99,7 +99,9 @@ netCDF The recommended way to store xarray data structures is `netCDF`__, which is a binary file format for self-described datasets that originated in the geosciences. xarray is based on the netCDF data model, so netCDF files -on disk directly correspond to :py:class:`~xarray.Dataset` objects. +on disk directly correspond to :py:class:`~xarray.Dataset` objects (more accurately, +a group in a netCDF file directly corresponds to a to :py:class:`~xarray.Dataset` object. +See :ref:`io.netcdf_groups` for more.) NetCDF is supported on almost all platforms, and parsers exist for the vast majority of scientific programming languages. Recent versions of @@ -121,7 +123,7 @@ read/write netCDF V4 files and use the compression options described below). __ https://github.com/Unidata/netcdf4-python We can save a Dataset to disk using the -:py:attr:`Dataset.to_netcdf ` method: +:py:meth:`~Dataset.to_netcdf` method: .. 
ipython:: python @@ -147,19 +149,6 @@ convert the ``DataArray`` to a ``Dataset`` before saving, and then convert back when loading, ensuring that the ``DataArray`` that is loaded is always exactly the same as the one that was saved. -NetCDF groups are not supported as part of the -:py:class:`~xarray.Dataset` data model. Instead, groups can be loaded -individually as Dataset objects. -To do so, pass a ``group`` keyword argument to the -``open_dataset`` function. The group can be specified as a path-like -string, e.g., to access subgroup 'bar' within group 'foo' pass -'/foo/bar' as the ``group`` argument. -In a similar way, the ``group`` keyword argument can be given to the -:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group -in a netCDF file. -When writing multiple groups in one file, pass ``mode='a'`` to ``to_netcdf`` -to ensure that each call does not delete the file. - Data is always loaded lazily from netCDF files. You can manipulate, slice and subset Dataset and DataArray objects, and no array values are loaded into memory until you try to perform some sort of actual computation. For an example of how these @@ -195,6 +184,24 @@ It is possible to append or overwrite netCDF variables using the ``mode='a'`` argument. When using this option, all variables in the dataset will be written to the original netCDF file, regardless if they exist in the original dataset. + +.. _io.netcdf_groups: + +Groups +~~~~~~ + +NetCDF groups are not supported as part of the :py:class:`~xarray.Dataset` data model. +Instead, groups can be loaded individually as Dataset objects. +To do so, pass a ``group`` keyword argument to the +:py:func:`~xarray.open_dataset` function. The group can be specified as a path-like +string, e.g., to access subgroup ``'bar'`` within group ``'foo'`` pass +``'/foo/bar'`` as the ``group`` argument. +In a similar way, the ``group`` keyword argument can be given to the +:py:meth:`~xarray.Dataset.to_netcdf` method to write to a group +in a netCDF file. +When writing multiple groups in one file, pass ``mode='a'`` to +:py:meth:`~xarray.Dataset.to_netcdf` to ensure that each call does not delete the file. + .. _io.encoding: Reading encoded data @@ -203,7 +210,7 @@ Reading encoded data NetCDF files follow some conventions for encoding datetime arrays (as numbers with a "units" attribute) and for packing and unpacking data (as described by the "scale_factor" and "add_offset" attributes). If the argument -``decode_cf=True`` (default) is given to ``open_dataset``, xarray will attempt +``decode_cf=True`` (default) is given to :py:func:`~xarray.open_dataset`, xarray will attempt to automatically decode the values in the netCDF objects according to `CF conventions`_. Sometimes this will fail, for example, if a variable has an invalid "units" or "calendar" attribute. For these cases, you can @@ -247,6 +254,113 @@ will remove encoding information. import os os.remove('saved_on_disk.nc') + +.. _combining multiple files: + +Reading multi-file datasets +........................... + +NetCDF files are often encountered in collections, e.g., with different files +corresponding to different model runs or one file per timestamp. +xarray can straightforwardly combine such files into a single Dataset by making use of +:py:func:`~xarray.concat`, :py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and +:py:func:`~xarray.combine_by_coords`. For details on the difference between these +functions see :ref:`combining data`. 
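As a quick, illustrative sketch (the two datasets below are made up for this example and are not part of the patch), the same kind of combination can be done on in-memory objects with :py:func:`~xarray.combine_by_coords`::

    import xarray as xr

    # two in-memory pieces standing in for two files covering adjacent time ranges
    ds1 = xr.Dataset({"temp": ("time", [1.0, 2.0])}, coords={"time": [0, 1]})
    ds2 = xr.Dataset({"temp": ("time", [3.0, 4.0])}, coords={"time": [2, 3]})

    # combine_by_coords orders and concatenates the pieces using the values
    # of their dimension coordinates ("time" here)
    combined = xr.combine_by_coords([ds1, ds2])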
+ +Xarray includes support for manipulating datasets that don't fit into memory +with dask_. If you have dask installed, you can open multiple files +simultaneously in parallel using :py:func:`~xarray.open_mfdataset`:: + + xr.open_mfdataset('my/files/*.nc', parallel=True) + +This function automatically concatenates and merges multiple files into a +single xarray dataset. +It is the recommended way to open multiple files with xarray. +For more details on parallel reading, see :ref:`combining.multi`, :ref:`dask.io` and a +`blog post`_ by Stephan Hoyer. +:py:func:`~xarray.open_mfdataset` takes many kwargs that allow you to +control its behaviour (for e.g. ``parallel``, ``combine``, ``compat``, ``join``, ``concat_dim``). +See its docstring for more details. + + +.. note:: + + A common use-case involves a dataset distributed across a large number of files with + each file containing a large number of variables. Commonly a few of these variables + need to be concatenated along a dimension (say ``"time"``), while the rest are equal + across the datasets (ignoring floating point differences). The following command + with suitable modifications (such as ``parallel=True``) works well with such datasets:: + + xr.open_mfdataset('my/files/*.nc', concat_dim="time", + data_vars='minimal', coords='minimal', compat='override') + + This command concatenates variables along the ``"time"`` dimension, but only those that + already contain the ``"time"`` dimension (``data_vars='minimal', coords='minimal'``). + Variables that lack the ``"time"`` dimension are taken from the first dataset + (``compat='override'``). + + +.. _dask: http://dask.pydata.org +.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ + +Sometimes multi-file datasets are not conveniently organized for easy use of :py:func:`~xarray.open_mfdataset`. +One can use the ``preprocess`` argument to provide a function that takes a dataset +and returns a modified Dataset. +:py:func:`~xarray.open_mfdataset` will call ``preprocess`` on every dataset +(corresponding to each file) prior to combining them. + +If :py:func:`~xarray.open_mfdataset` does not meet your needs, other approaches are possible. +For example, here's how we could approximate ``MFDataset`` from the netCDF4 +library:: + + from glob import glob + import xarray as xr + + def read_netcdfs(files, dim): + # glob expands paths with * to a list of files, like the unix shell + paths = sorted(glob(files)) + datasets = [xr.open_dataset(p) for p in paths] + combined = xr.concat(dataset, dim) + return combined + + combined = read_netcdfs('/all/my/files/*.nc', dim='time') + +This function will work in many cases, but it's not very robust. First, it +never closes files, which means it will fail one you need to load more than +a few thousands file. Second, it assumes that you want all the data from each +file and that it can all fit into memory. In many situations, you only need +a small subset or an aggregated summary of the data from each file. 
+ +Here's a slightly more sophisticated example of how to remedy these +deficiencies:: + + def read_netcdfs(files, dim, transform_func=None): + def process_one_path(path): + # use a context manager, to ensure the file gets closed after use + with xr.open_dataset(path) as ds: + # transform_func should do some sort of selection or + # aggregation + if transform_func is not None: + ds = transform_func(ds) + # load all data from the transformed dataset, to ensure we can + # use it after closing each original file + ds.load() + return ds + + paths = sorted(glob(files)) + datasets = [process_one_path(p) for p in paths] + combined = xr.concat(datasets, dim) + return combined + + # here we suppose we only care about the combined mean of each file; + # you might also use indexing operations like .sel to subset datasets + combined = read_netcdfs('/all/my/files/*.nc', dim='time', + transform_func=lambda ds: ds.mean()) + +This pattern works well and is very robust. We've used similar code to process +tens of thousands of files constituting 100s of GB of data. + + .. _io.netcdf.writing_encoded: Writing encoded data @@ -817,88 +931,3 @@ For CSV files, one might also consider `xarray_extras`_. .. _xarray_extras: https://xarray-extras.readthedocs.io/en/latest/api/csv.html .. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html - - -.. _combining multiple files: - - -Combining multiple files ------------------------- - -NetCDF files are often encountered in collections, e.g., with different files -corresponding to different model runs. xarray can straightforwardly combine such -files into a single Dataset by making use of :py:func:`~xarray.concat`, -:py:func:`~xarray.merge`, :py:func:`~xarray.combine_nested` and -:py:func:`~xarray.combine_by_coords`. For details on the difference between these -functions see :ref:`combining data`. - -.. note:: - - Xarray includes support for manipulating datasets that don't fit into memory - with dask_. If you have dask installed, you can open multiple files - simultaneously using :py:func:`~xarray.open_mfdataset`:: - - xr.open_mfdataset('my/files/*.nc') - - This function automatically concatenates and merges multiple files into a - single xarray dataset. - It is the recommended way to open multiple files with xarray. - For more details, see :ref:`combining.multi`, :ref:`dask.io` and a - `blog post`_ by Stephan Hoyer. - :py:func:`~xarray.open_mfdataset` takes many kwargs that allow you to - control its behaviour (for e.g. ``compat``, ``join``, ``concat_dim``. - See the docstring for more details. - - -.. _dask: http://dask.pydata.org -.. _blog post: http://stephanhoyer.com/2015/06/11/xray-dask-out-of-core-labeled-arrays/ - -For example, here's how we could approximate ``MFDataset`` from the netCDF4 -library:: - - from glob import glob - import xarray as xr - - def read_netcdfs(files, dim): - # glob expands paths with * to a list of files, like the unix shell - paths = sorted(glob(files)) - datasets = [xr.open_dataset(p) for p in paths] - combined = xr.concat(dataset, dim) - return combined - - combined = read_netcdfs('/all/my/files/*.nc', dim='time') - -This function will work in many cases, but it's not very robust. First, it -never closes files, which means it will fail one you need to load more than -a few thousands file. Second, it assumes that you want all the data from each -file and that it can all fit into memory. In many situations, you only need -a small subset or an aggregated summary of the data from each file. 
- -Here's a slightly more sophisticated example of how to remedy these -deficiencies:: - - def read_netcdfs(files, dim, transform_func=None): - def process_one_path(path): - # use a context manager, to ensure the file gets closed after use - with xr.open_dataset(path) as ds: - # transform_func should do some sort of selection or - # aggregation - if transform_func is not None: - ds = transform_func(ds) - # load all data from the transformed dataset, to ensure we can - # use it after closing each original file - ds.load() - return ds - - paths = sorted(glob(files)) - datasets = [process_one_path(p) for p in paths] - combined = xr.concat(datasets, dim) - return combined - - # here we suppose we only care about the combined mean of each file; - # you might also use indexing operations like .sel to subset datasets - combined = read_netcdfs('/all/my/files/*.nc', dim='time', - transform_func=lambda ds: ds.mean()) - -This pattern works well and is very robust. We've used similar code to process -tens of thousands of files constituting 100s of GB of data. From 2ad66089ec8122d671d28dbec8feccef2b335a82 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Sep 2019 17:08:38 -0600 Subject: [PATCH 21/35] Update mfdataset in dask.rst. --- doc/dask.rst | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/dask.rst b/doc/dask.rst index adf0a6bf585..ea705d3d8f4 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -75,17 +75,32 @@ entirely equivalent to opening a dataset using ``open_dataset`` and then chunking the data using the ``chunk`` method, e.g., ``xr.open_dataset('example-data.nc').chunk({'time': 10})``. -To open multiple files simultaneously, use :py:func:`~xarray.open_mfdataset`:: +To open multiple files simultaneously in parallel using Dask delayed, +use :py:func:`~xarray.open_mfdataset`:: - xr.open_mfdataset('my/files/*.nc') + xr.open_mfdataset('my/files/*.nc', parallel=True) This function will automatically concatenate and merge dataset into one in the simple cases that it understands (see :py:func:`~xarray.auto_combine` -for the full disclaimer). By default, ``open_mfdataset`` will chunk each +for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chunk each netCDF file into a single Dask array; again, supply the ``chunks`` argument to control the size of the resulting Dask arrays. In more complex cases, you can open each file individually using ``open_dataset`` and merge the result, as -described in :ref:`combining data`. +described in :ref:`combining data`. The pattern for parallel reading of multiple files +using dask, modifying those datasets and then combining into a single ``Dataset`` is:: + + def modify(ds): + # modify ds here + return ds + + + # this is basically what open_mfdataset does + open_kwargs = dict(decode_cf=True, decode_times=False) + open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names] + tasks = [dask.delayed(modify)(task) for task in open_tasks] + datasets = dask.compute(tasks) + combined = xr.combine_nested(datasets) # or some combination of concat, merge + You'll notice that printing a dataset still shows a preview of array values, even if they are actually Dask arrays. We can do this quickly with Dask because From 3154b064de567a142f01d6ef88c16dff7053f71a Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Sep 2019 13:07:06 -0600 Subject: [PATCH 22/35] simplify parse_datasets. 
--- xarray/core/concat.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 28e79379190..bff929ef456 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -207,6 +207,7 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): else: concat_over_existing_dim = False + concat_dim_lengths = [] for ds in datasets: if concat_over_existing_dim: if dim not in ds.dims: @@ -215,6 +216,7 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars, coords, compat): else: raise ValueError("%r is not present in all datasets" % dim) concat_over.update(k for k, v in ds.variables.items() if dim in v.dims) + concat_dim_lengths.append(ds.dims.get(dim, 1)) def process_subset_opt(opt, subset): if isinstance(opt, str): @@ -273,7 +275,7 @@ def process_subset_opt(opt, subset): process_subset_opt(data_vars, "data_vars") process_subset_opt(coords, "coords") - return concat_over, equals + return concat_over, equals, concat_dim_lengths # determine dimensional coordinate names and a dict mapping name to DataArray @@ -283,13 +285,10 @@ def _parse_datasets(datasets, concat_dim): all_coord_names = set() data_vars = set() # list of data_vars dim_coords = dict() # maps dim name to variable - concat_dim_lengths = [] # length of concat dimension in each dataset dims_sizes = {} # shared dimension sizes to expand variables for ds in datasets: - concat_dim_lengths.append(ds.dims.get(concat_dim, 1)) dims_sizes.update(ds.dims) - all_coord_names.update(ds.coords) data_vars.update(ds.data_vars) @@ -298,7 +297,7 @@ def _parse_datasets(datasets, concat_dim): dim_coords[dim] = ds.coords[dim].variable dims = dims | set(ds.dims) - return dim_coords, concat_dim_lengths, dims_sizes, all_coord_names, data_vars + return dim_coords, dims_sizes, all_coord_names, data_vars def _dataset_concat( @@ -323,7 +322,7 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - result_dim_coords, concat_dim_lengths, dims_sizes, result_coord_names, data_names = _parse_datasets( + result_dim_coords, dims_sizes, result_coord_names, data_names = _parse_datasets( datasets, dim ) dim_names = set(result_dim_coords) @@ -344,7 +343,7 @@ def _dataset_concat( datasets = [ds.expand_dims(dim) for ds in datasets] # determine which variables to concatentate - concat_over, equals = _calc_concat_over( + concat_over, equals, concat_dim_lengths = _calc_concat_over( datasets, dim, dim_names, data_vars, coords, compat ) From 02e35e685e08b277aa78457580eaadaf69854ee4 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Sep 2019 13:09:25 -0600 Subject: [PATCH 23/35] Avoid using merge_variables. unique_variable instead. --- xarray/core/concat.py | 27 ++++++++++++++------------- xarray/core/merge.py | 16 +++++++--------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index bff929ef456..8725579a18b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -5,7 +5,7 @@ from . 
import dtypes, utils from .alignment import align -from .merge import merge_variables, expand_variable_dicts, _VALID_COMPAT +from .merge import unique_variable, _VALID_COMPAT from .variable import IndexVariable, Variable, as_variable from .variable import concat as concat_vars @@ -349,23 +349,25 @@ def _dataset_concat( # determine which variables to merge, and then merge them according to compat variables_to_merge = (result_coord_names | data_names) - concat_over - dim_names + + result_vars = {} if variables_to_merge: - to_merge = [] + to_merge = {var: [] for var in variables_to_merge} + for ds in datasets: - if variables_to_merge - set(ds.variables): + unexpected_merge_vars = variables_to_merge - set(ds.variables) + if unexpected_merge_vars: raise ValueError( - "encountered unexpected variables %r" % list(variables_to_merge)[0] + "encountered unexpected variables %r" % unexpected_merge_vars ) - to_merge.append(ds.reset_coords()[list(variables_to_merge)]) - merge_equals = {k: equals.get(k, None) for k in variables_to_merge} + for var in variables_to_merge: + to_merge[var].append(ds.variables[var]) - result_vars = merge_variables( - expand_variable_dicts(to_merge), - priority_vars=None, - compat=compat, - equals=merge_equals, - ) + for var in variables_to_merge: + result_vars[var] = unique_variable( + var, to_merge[var], compat=compat, equals=equals.get(var, None) + ) else: result_vars = OrderedDict() result_vars.update(result_dim_coords) @@ -409,7 +411,6 @@ def ensure_common_dims(vars): result = result.set_coords(result_coord_names) result.encoding = result_encoding - # TODO: avoid this? result = result.drop(unlabeled_dims, errors="ignore") if coord is not None: diff --git a/xarray/core/merge.py b/xarray/core/merge.py index a62eed92b61..ce5090524f7 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -112,16 +112,17 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): combine_method = "fillna" if equals is None: + equals = True out = out.compute() - equals_list = [] for var in variables[1:]: - equals_list.append(getattr(out, compat)(var.compute())) - - equals = all(equals_list) + if not getattr(out, compat)(var.compute()): + equals = False + break if not equals: raise MergeError( - "conflicting values for variable %r on objects to be combined" % (name) + "conflicting values for variable %r on objects to be combined. You can skip this check by specifying compat='override'." + % (name) ) if combine_method: @@ -154,7 +155,6 @@ def merge_variables( list_of_variables_dicts: List[Mapping[Any, Variable]], priority_vars: Mapping[Any, Variable] = None, compat: str = "minimal", - equals: Mapping[Any, bool] = {}, ) -> "OrderedDict[Any, Variable]": """Merge dicts of variables, while resolving conflicts appropriately. @@ -203,9 +203,7 @@ def merge_variables( merged[name] = unique_variable(name, dim_variables, dim_compat) else: try: - merged[name] = unique_variable( - name, var_list, compat, equals.get(name, None) - ) + merged[name] = unique_variable(name, var_list, compat) except MergeError: if compat != "minimal": # we need more than "minimal" compatibility (for which From d7988c9633b358bcbd15721a7a43e886f515e1a8 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Sep 2019 13:11:05 -0600 Subject: [PATCH 24/35] small stuff. 
--- xarray/core/concat.py | 3 +-- xarray/tests/test_dask.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 8725579a18b..83792d37a10 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -331,8 +331,7 @@ def _dataset_concat( both_data_and_coords = result_coord_names & data_names if both_data_and_coords: raise ValueError( - "%r is a coordinate in some datasets but not others." - % list(both_data_and_coords)[0] # preserve format of error message + "%r is a coordinate in some datasets but not others." % both_data_and_coords ) # we don't want the concat dimension in the result dataset yet result_dim_coords.pop(dim, None) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index e3fc6f65e0f..f5afaf53b20 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -785,7 +785,6 @@ def kernel(name): """Dask kernel to test pickling/unpickling and __repr__. Must be global to make it pickleable. """ - print("kernel(%s)" % name) global kernel_call_count kernel_call_count += 1 return np.ones(1, dtype=np.int64) From 628974ed50170750bb717001b3870adf8bb6933b Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 6 Sep 2019 13:16:46 -0600 Subject: [PATCH 25/35] Update docs. --- doc/dask.rst | 16 +--------------- doc/io.rst | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/dask.rst b/doc/dask.rst index ea705d3d8f4..19cbc11292c 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -86,21 +86,7 @@ for the full disclaimer). By default, :py:func:`~xarray.open_mfdataset` will chu netCDF file into a single Dask array; again, supply the ``chunks`` argument to control the size of the resulting Dask arrays. In more complex cases, you can open each file individually using ``open_dataset`` and merge the result, as -described in :ref:`combining data`. The pattern for parallel reading of multiple files -using dask, modifying those datasets and then combining into a single ``Dataset`` is:: - - def modify(ds): - # modify ds here - return ds - - - # this is basically what open_mfdataset does - open_kwargs = dict(decode_cf=True, decode_times=False) - open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names] - tasks = [dask.delayed(modify)(task) for task in open_tasks] - datasets = dask.compute(tasks) - combined = xr.combine_nested(datasets) # or some combination of concat, merge - +described in :ref:`combining data`. You'll notice that printing a dataset still shows a preview of array values, even if they are actually Dask arrays. We can do this quickly with Dask because diff --git a/doc/io.rst b/doc/io.rst index 788dd500bb3..775d915188e 100644 --- a/doc/io.rst +++ b/doc/io.rst @@ -309,8 +309,25 @@ and returns a modified Dataset. :py:func:`~xarray.open_mfdataset` will call ``preprocess`` on every dataset (corresponding to each file) prior to combining them. + If :py:func:`~xarray.open_mfdataset` does not meet your needs, other approaches are possible. 
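Before turning to those alternatives, here is a brief, illustrative sketch of the ``preprocess`` hook described above; the file pattern and variable name are hypothetical::

    import xarray as xr

    def keep_temperature(ds):
        # select only the variable of interest from each per-file dataset
        return ds[["temperature"]]

    # preprocess is applied to every per-file dataset before they are combined
    combined = xr.open_mfdataset("my/files/*.nc", preprocess=keep_temperature)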
-For example, here's how we could approximate ``MFDataset`` from the netCDF4 +The general pattern for parallel reading of multiple files +using dask, modifying those datasets and then combining into a single ``Dataset`` is:: + + def modify(ds): + # modify ds here + return ds + + + # this is basically what open_mfdataset does + open_kwargs = dict(decode_cf=True, decode_times=False) + open_tasks = [dask.delayed(xr.open_dataset)(f, **open_kwargs) for f in file_names] + tasks = [dask.delayed(modify)(task) for task in open_tasks] + datasets = dask.compute(tasks) # get a list of xarray.Datasets + combined = xr.combine_nested(datasets) # or some combination of concat, merge + + +As an example, here's how we could approximate ``MFDataset`` from the netCDF4 library:: from glob import glob From 73a203b12663a8986d582c3ca6bfec20f35ccb0f Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 7 Sep 2019 15:51:11 -0600 Subject: [PATCH 26/35] minor fix. --- xarray/core/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 83792d37a10..8e48062eafa 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -279,7 +279,7 @@ def process_subset_opt(opt, subset): # determine dimensional coordinate names and a dict mapping name to DataArray -def _parse_datasets(datasets, concat_dim): +def _parse_datasets(datasets): dims = set() all_coord_names = set() @@ -323,7 +323,7 @@ def _dataset_concat( ) result_dim_coords, dims_sizes, result_coord_names, data_names = _parse_datasets( - datasets, dim + datasets ) dim_names = set(result_dim_coords) unlabeled_dims = dim_names - result_coord_names From 6b40c1f3a765d3ababc995aaabe4ccd3ccc0bd71 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 7 Sep 2019 15:53:45 -0600 Subject: [PATCH 27/35] minor fix. --- xarray/core/merge.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index ce5090524f7..9f392f7715d 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -112,11 +112,10 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): combine_method = "fillna" if equals is None: - equals = True out = out.compute() for var in variables[1:]: - if not getattr(out, compat)(var.compute()): - equals = False + equals = getattr(out, compat)(var.compute()) + if not equals: break if not equals: From e4c12e8dbe98eb4e45072add1bd447483aa449c4 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 9 Sep 2019 21:43:40 -0600 Subject: [PATCH 28/35] lint. --- xarray/core/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 2833a3c5be2..449eda8add2 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -120,7 +120,7 @@ def concat( "compat=%r invalid: must be 'broadcast_equals', 'equals', 'identical', 'no_conflicts' or 'override'" % compat ) - + if isinstance(first_obj, DataArray): f = _dataarray_concat elif isinstance(first_obj, Dataset): From 935089a683039d3215bef34c698093f275d65840 Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 9 Sep 2019 21:46:52 -0600 Subject: [PATCH 29/35] Better error message. 
--- xarray/core/concat.py | 7 ++++--- xarray/tests/test_concat.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 449eda8add2..d40879cf347 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -318,10 +318,11 @@ def _dataset_concat( to_merge = {var: [] for var in variables_to_merge} for ds in datasets: - unexpected_merge_vars = variables_to_merge - set(ds.variables) - if unexpected_merge_vars: + absent_merge_vars = variables_to_merge - set(ds.variables) + if absent_merge_vars: raise ValueError( - "encountered unexpected variables %r" % unexpected_merge_vars + "variables %r are present in some datasets but not others. " + % absent_merge_vars ) for var in variables_to_merge: diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 5089c673eac..00428f70966 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -183,7 +183,7 @@ def test_concat_errors(self): concat([data0, data1], "dim1", compat="identical") assert_identical(data, concat([data0, data1], "dim1", compat="equals")) - with raises_regex(ValueError, "encountered unexpected"): + with raises_regex(ValueError, "present in some datasets"): data0, data1 = deepcopy(split_data) data1["foo"] = ("bar", np.random.randn(10)) concat([data0, data1], "dim1") From c13dcff22a582d8c089b728fb48c37b5b0732a1d Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 10 Sep 2019 08:47:34 -0600 Subject: [PATCH 30/35] rename to shorter variable names. --- xarray/core/concat.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d40879cf347..d5fa495022b 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -286,23 +286,21 @@ def _dataset_concat( *datasets, join=join, copy=False, exclude=[dim], fill_value=fill_value ) - result_dim_coords, dims_sizes, result_coord_names, data_names = _parse_datasets( - datasets - ) - dim_names = set(result_dim_coords) - unlabeled_dims = dim_names - result_coord_names + dim_coords, dims_sizes, coord_names, data_names = _parse_datasets(datasets) + dim_names = set(dim_coords) + unlabeled_dims = dim_names - coord_names - both_data_and_coords = result_coord_names & data_names + both_data_and_coords = coord_names & data_names if both_data_and_coords: raise ValueError( "%r is a coordinate in some datasets but not others." 
% both_data_and_coords ) # we don't want the concat dimension in the result dataset yet - result_dim_coords.pop(dim, None) + dim_coords.pop(dim, None) dims_sizes.pop(dim, None) # case where concat dimension is a coordinate but not a dimension - if dim in result_coord_names and dim not in dim_names: + if dim in coord_names and dim not in dim_names: datasets = [ds.expand_dims(dim) for ds in datasets] # determine which variables to concatentate @@ -311,7 +309,7 @@ def _dataset_concat( ) # determine which variables to merge, and then merge them according to compat - variables_to_merge = (result_coord_names | data_names) - concat_over - dim_names + variables_to_merge = (coord_names | data_names) - concat_over - dim_names result_vars = {} if variables_to_merge: @@ -334,7 +332,7 @@ def _dataset_concat( ) else: result_vars = OrderedDict() - result_vars.update(result_dim_coords) + result_vars.update(dim_coords) # assign attrs and encoding from first dataset result_attrs = datasets[0].attrs @@ -372,7 +370,7 @@ def ensure_common_dims(vars): result_vars[k] = combined result = Dataset(result_vars, attrs=result_attrs) - result = result.set_coords(result_coord_names) + result = result.set_coords(coord_names) result.encoding = result_encoding result = result.drop(unlabeled_dims, errors="ignore") From 6e4727ac85c9e8ffbdee1d44af39bbc9e3c0561b Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 10 Sep 2019 08:55:52 -0600 Subject: [PATCH 31/35] Cleanup: fillna preserves attrs now. --- xarray/core/merge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 9f392f7715d..48e32ebff32 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -126,9 +126,7 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): if combine_method: for var in variables[1:]: - # TODO: add preservation of attrs into fillna out = getattr(out, combine_method)(var) - out.attrs = var.attrs return out From 71ec88da1db6d0d442877ee6e72ec0f91ec30146 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 10 Sep 2019 14:10:57 -0600 Subject: [PATCH 32/35] Look for concat dim in data_vars also. --- xarray/core/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index d5fa495022b..46e7fdb55f9 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -299,8 +299,8 @@ def _dataset_concat( dim_coords.pop(dim, None) dims_sizes.pop(dim, None) - # case where concat dimension is a coordinate but not a dimension - if dim in coord_names and dim not in dim_names: + # case where concat dimension is a coordinate or data_var but not a dimension + if (dim in coord_names or dim in data_names) and dim not in dim_names: datasets = [ds.expand_dims(dim) for ds in datasets] # determine which variables to concatentate From d77bf93c86552ca669df8d3eba8977e569bfcb4b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Sep 2019 00:58:43 +0000 Subject: [PATCH 33/35] Update xarray/core/merge.py Co-Authored-By: Stephan Hoyer --- xarray/core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 48e32ebff32..5a22bd2a601 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -84,7 +84,7 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): inputs. compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts', 'override'}, optional Type of equality check to use. 
- equals : mapping variable name to None or bool, corresponding to result of compat test +None or bool, corresponding to result of compat test Returns ------- From bdd388faa6ae3377639fe60995eec9d4c5592022 Mon Sep 17 00:00:00 2001 From: dcherian Date: Sat, 14 Sep 2019 19:10:23 -0600 Subject: [PATCH 34/35] avoid unnecessary computes. --- xarray/core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/merge.py b/xarray/core/merge.py index 48e32ebff32..8edecfd64dd 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -114,7 +114,7 @@ def unique_variable(name, variables, compat="broadcast_equals", equals=None): if equals is None: out = out.compute() for var in variables[1:]: - equals = getattr(out, compat)(var.compute()) + equals = getattr(out, compat)(var) if not equals: break From 50d3b8f868ab27925171a0c22cc9062d270ddccc Mon Sep 17 00:00:00 2001 From: dcherian Date: Mon, 16 Sep 2019 08:46:22 -0600 Subject: [PATCH 35/35] minor cleanups. --- xarray/core/concat.py | 2 -- xarray/tests/test_combine.py | 11 +++-------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index 46e7fdb55f9..e68c247d880 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -345,8 +345,6 @@ def _dataset_concat( # we've already verified everything is consistent; now, calculate # shared dimension sizes so we can expand the necessary variables - # seems like there should be a helper function for this. We would need to add - # an exclude kwarg to exclude comparing along concat_dim def ensure_common_dims(vars): # ensure each variable with the given name shares the same # dimensions and the same shape for all of them except along the diff --git a/xarray/tests/test_combine.py b/xarray/tests/test_combine.py index ba1e393a89e..1abca30d199 100644 --- a/xarray/tests/test_combine.py +++ b/xarray/tests/test_combine.py @@ -327,13 +327,13 @@ class TestCheckShapeTileIDs: def test_check_depths(self): ds = create_test_data(0) combined_tile_ids = {(0,): ds, (0, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent depths"): + with raises_regex(ValueError, "sub-lists do not have consistent depths"): _check_shape_tile_ids(combined_tile_ids) def test_check_lengths(self): ds = create_test_data(0) combined_tile_ids = {(0, 0): ds, (0, 1): ds, (0, 2): ds, (1, 0): ds, (1, 1): ds} - with raises_regex(ValueError, "sub-lists do not have " "consistent lengths"): + with raises_regex(ValueError, "sub-lists do not have consistent lengths"): _check_shape_tile_ids(combined_tile_ids) @@ -565,11 +565,6 @@ def test_combine_concat_over_redundant_nesting(self): expected = Dataset({"x": [0]}) assert_identical(expected, actual) - def test_combine_nested_but_need_auto_combine(self): - objs = [Dataset({"x": [0, 1]}), Dataset({"x": [2], "wall": [0]})] - # with raises_regex(ValueError, "cannot be combined"): - combine_nested(objs, concat_dim="x") - @pytest.mark.parametrize("fill_value", [dtypes.NA, 2, 2.0]) def test_combine_nested_fill_value(self, fill_value): datasets = [ @@ -618,7 +613,7 @@ def test_combine_by_coords(self): assert_equal(actual, expected) objs = [Dataset({"x": 0}), Dataset({"x": 1})] - with raises_regex(ValueError, "Could not find any dimension " "coordinates"): + with raises_regex(ValueError, "Could not find any dimension coordinates"): combine_by_coords(objs) objs = [Dataset({"x": [0], "y": [0]}), Dataset({"x": [0]})]
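Taken together, the series enables the ``concat``/``open_mfdataset`` usage pattern described in the whats-new entry above. A minimal, self-contained sketch of what ``compat='override'`` does in :py:func:`~xarray.concat` (the datasets below are made up for illustration and are not taken from the test suite)::

    import xarray as xr

    # "temp" varies along "time"; "lon" is static metadata that differs
    # between the two pieces only by floating-point noise
    ds1 = xr.Dataset(
        {"temp": ("time", [1.0, 2.0]), "lon": ("x", [10.0, 20.0])},
        coords={"time": [0, 1]},
    )
    ds2 = xr.Dataset(
        {"temp": ("time", [3.0, 4.0]), "lon": ("x", [10.0, 20.000001])},
        coords={"time": [2, 3]},
    )

    # only variables that already contain "time" are concatenated; "lon" is
    # taken from the first dataset and the equality comparison is skipped
    combined = xr.concat(
        [ds1, ds2],
        dim="time",
        data_vars="minimal",
        coords="minimal",
        compat="override",
    )

With the default ``compat='equals'`` the same call would raise, because ``lon`` is not equal across the two datasets; ``compat='override'`` picks it from the first dataset and skips the comparison, which is what makes the ``open_mfdataset(..., data_vars='minimal', coords='minimal', compat='override')`` pattern cheap for large collections of files.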