
mfdataset, concat now support the 'join' kwarg. #3102

Merged · 13 commits · Aug 7, 2019
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -30,6 +30,9 @@ New functions/methods
Enhancements
~~~~~~~~~~~~

- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg.
It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian <https://github.com/dcherian>`_.

Bug fixes
~~~~~~~~~

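As an editorial illustration of the entry above (not part of the PR), here is a minimal sketch of what `join` does when `concat` has to align objects whose non-concatenated indexes differ:

```python
import xarray as xr

# Two arrays sharing dimension 'x' with partially overlapping labels.
a = xr.DataArray([1, 2, 3], dims='x', coords={'x': [0, 1, 2]})
b = xr.DataArray([4, 5, 6], dims='x', coords={'x': [1, 2, 3]})

# Default join='outer': union of the 'x' indexes, NaN where missing.
outer = xr.concat([a, b], dim='time')
print(outer.x.values)  # [0 1 2 3]

# join='inner': only labels present in every object survive.
inner = xr.concat([a, b], dim='time', join='inner')
print(inner.x.values)  # [1 2]

# join='exact': differing indexes are an error rather than realigned.
try:
    xr.concat([a, b], dim='time', join='exact')
except ValueError as err:
    print(err)  # e.g. "indexes along dimension 'x' are not equal"
```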
12 changes: 8 additions & 4 deletions xarray/backends/api.py
Expand Up @@ -609,7 +609,7 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
combine='_old_auto', autoclose=None, parallel=False,
**kwargs):
join='outer', **kwargs):
"""Open multiple files as a single dataset.

If combine='by_coords' then the function ``combine_by_coords`` is used to
@@ -704,6 +704,8 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
parallel : bool, optional
If True, the open and preprocess steps of this function will be
performed in parallel using ``dask.delayed``. Default is False.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.
**kwargs : optional
Additional arguments passed on to :py:func:`xarray.open_dataset`.

@@ -798,18 +800,20 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',

combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, from_openmfds=True)
coords=coords, join=join,
from_openmfds=True)
elif combine == 'nested':
# Combined nested list by successive concat and merge operations
# along each dimension, using structure given by "ids"
combined = _nested_combine(datasets, concat_dims=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, ids=ids)
coords=coords, ids=ids, join=join)
elif combine == 'by_coords':
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_by_coords(datasets, compat=compat,
data_vars=data_vars, coords=coords)
data_vars=data_vars, coords=coords,
join=join)
else:
raise ValueError("{} is an invalid option for the keyword argument"
" ``combine``".format(combine))
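The change to `api.py` is pure plumbing: `join` is forwarded to whichever combine path runs. A hypothetical usage sketch (the file names are placeholders, not from the PR):

```python
import xarray as xr

# combine='nested': concatenate along 't'; indexes on the remaining
# dimensions are reconciled according to `join`.
ds = xr.open_mfdataset(['part1.nc', 'part2.nc'], combine='nested',
                       concat_dim='t', join='inner')

# combine='by_coords': ordering is inferred from coordinate values;
# join='exact' turns any silent realignment of the non-concatenated
# indexes into a ValueError instead.
ds = xr.open_mfdataset(['part1.nc', 'part2.nc'], combine='by_coords',
                       join='exact')
```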
52 changes: 32 additions & 20 deletions xarray/core/combine.py
Expand Up @@ -136,7 +136,7 @@ def _check_shape_tile_ids(combined_tile_ids):

def _combine_nd(combined_ids, concat_dims, data_vars='all',
coords='different', compat='no_conflicts',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Combines an N-dimensional structure of datasets into one by applying a
series of either concat and merge operations along each dimension.
@@ -177,13 +177,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all',
data_vars=data_vars,
coords=coords,
compat=compat,
fill_value=fill_value)
fill_value=fill_value,
join=join)
(combined_ds,) = combined_ids.values()
return combined_ds


def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

# Group into lines of datasets which must be combined along dim
# need to sort by _new_tile_id first for groupby to work
@@ -197,12 +198,13 @@ def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
combined_ids = OrderedDict(sorted(group))
datasets = combined_ids.values()
new_combined_ids[new_id] = _combine_1d(datasets, dim, compat,
data_vars, coords, fill_value)
data_vars, coords, fill_value,
join)
return new_combined_ids


def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Applies either concat or merge to 1D list of datasets depending on value
of concat_dim
@@ -211,7 +213,7 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
if concat_dim is not None:
try:
combined = concat(datasets, dim=concat_dim, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value, join=join)
except ValueError as err:
if "encountered unexpected variable" in str(err):
raise ValueError("These objects cannot be combined using only "
@@ -222,7 +224,8 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
else:
raise
else:
combined = merge(datasets, compat=compat, fill_value=fill_value)
combined = merge(datasets, compat=compat, fill_value=fill_value,
join=join)

return combined

@@ -233,7 +236,7 @@ def _new_tile_id(single_id_ds_pair):


def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

if len(datasets) == 0:
return Dataset()
@@ -254,12 +257,13 @@ def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
# Apply series of concatenate or merge operations along each dimension
combined = _combine_nd(combined_ids, concat_dims, compat=compat,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
return combined


def combine_nested(datasets, concat_dim, compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA):
data_vars='all', coords='different', fill_value=dtypes.NA,
join='outer'):
"""
Explicitly combine an N-dimensional grid of datasets into one by using a
succession of concat and merge operations along each dimension of the grid.
@@ -312,6 +316,8 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -383,15 +389,15 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
# The IDs argument tells _manual_combine that datasets aren't yet sorted
return _nested_combine(datasets, concat_dims=concat_dim, compat=compat,
data_vars=data_vars, coords=coords, ids=False,
fill_value=fill_value)
fill_value=fill_value, join=join)


def vars_as_keys(ds):
return tuple(sorted(ds))


def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Attempt to auto-magically combine the given datasets into one by using
dimension coordinates.
@@ -439,6 +445,8 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -498,7 +506,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
# Concatenate along all of concat_dims one by one to create single ds
concatenated = _combine_nd(combined_ids, concat_dims=concat_dims,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)

# Check the overall coordinates are monotonically increasing
for dim in concatenated.dims:
@@ -512,7 +520,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
concatenated_grouped_by_data_vars.append(concatenated)

return merge(concatenated_grouped_by_data_vars, compat=compat,
fill_value=fill_value)
fill_value=fill_value, join=join)


# Everything beyond here is only needed until the deprecation cycle in #2616
@@ -524,7 +532,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',

def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA,
from_openmfds=False):
join='outer', from_openmfds=False):
"""
Attempt to auto-magically combine the given datasets into one.

@@ -572,6 +580,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -630,7 +640,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',

return _old_auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value,
join=join)


def _dimension_coords_exist(datasets):
@@ -671,7 +682,7 @@ def _requires_concat_and_merge(datasets):
def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts',
data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if concat_dim is not None:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim

@@ -680,16 +691,17 @@ def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,

concatenated = [_auto_concat(list(datasets), dim=dim,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
for vars, datasets in grouped]
else:
concatenated = datasets
merged = merge(concatenated, compat=compat, fill_value=fill_value)
merged = merge(concatenated, compat=compat, fill_value=fill_value,
join=join)
return merged


def _auto_concat(datasets, dim=None, data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if len(datasets) == 1 and dim is None:
# There is nothing more to combine, so kick out early.
return datasets[0]
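With the plumbing above, the public combine functions accept `join` as well. A small sketch (editorial, assuming the merged API) using `combine_nested`:

```python
import xarray as xr

ds1 = xr.Dataset({'v': ('x', [1, 2])}, coords={'x': [0, 1]})
ds2 = xr.Dataset({'v': ('x', [3, 4])}, coords={'x': [1, 2]})

# Stack along a new 't' dimension. With join='inner' the 'x' index
# shrinks to the single shared label, instead of the default outer
# union [0, 1, 2] padded with NaN.
combined = xr.combine_nested([ds1, ds2], concat_dim='t', join='inner')
print(combined.x.values)  # [1]
```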
17 changes: 10 additions & 7 deletions xarray/core/concat.py
@@ -11,7 +11,7 @@

def concat(objs, dim=None, data_vars='all', coords='different',
compat='equals', positions=None, indexers=None, mode=None,
concat_over=None, fill_value=dtypes.NA):
concat_over=None, fill_value=dtypes.NA, join='outer'):
"""Concatenate xarray objects along a new or existing dimension.

Parameters
@@ -65,6 +65,9 @@ def concat(objs, dim=None, data_vars='all', coords='different',
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes
(excluding index along 'dim').
indexers, mode, concat_over : deprecated

Returns
@@ -76,7 +79,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
merge
auto_combine
"""
# TODO: add join and ignore_index arguments copied from pandas.concat
# TODO: add ignore_index arguments copied from pandas.concat
# TODO: support concatenating scalar coordinates even if the concatenated
# dimension already exists
from .dataset import Dataset
@@ -116,7 +119,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
else:
raise TypeError('can only concatenate xarray Dataset and DataArray '
'objects, got %s' % type(first_obj))
return f(objs, dim, data_vars, coords, compat, positions, fill_value)
return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)


def _calc_concat_dim_coord(dim):
@@ -212,7 +215,7 @@ def process_subset_opt(opt, subset):


def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Concatenate a sequence of datasets along a new or existing dimension
"""
@@ -225,7 +228,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
dim, coord = _calc_concat_dim_coord(dim)
# Make sure we're working on a copy (we'll be loading variables)
datasets = [ds.copy() for ds in datasets]
datasets = align(*datasets, join='outer', copy=False, exclude=[dim],
datasets = align(*datasets, join=join, copy=False, exclude=[dim],
fill_value=fill_value)

concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords)
@@ -318,7 +321,7 @@ def ensure_common_dims(vars):


def _dataarray_concat(arrays, dim, data_vars, coords, compat,
positions, fill_value=dtypes.NA):
positions, fill_value=dtypes.NA, join='outer'):
arrays = list(arrays)

if data_vars != 'all':
@@ -337,5 +340,5 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
datasets.append(arr._to_temp_dataset())

ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
positions, fill_value=fill_value)
positions, fill_value=fill_value, join=join)
return arrays[0]._from_temp_dataset(ds, name)
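Note the parenthetical added to the docstring: because `_dataset_concat` calls `align` with `exclude=[dim]`, the concatenation dimension itself is never constrained by `join`. A sketch of the consequence:

```python
import xarray as xr

a = xr.Dataset({'v': (('t', 'x'), [[1.0, 2.0]])},
               coords={'t': [0], 'x': [10, 20]})
b = xr.Dataset({'v': (('t', 'x'), [[3.0, 4.0]])},
               coords={'t': [1], 'x': [10, 20]})

# The 't' indexes differ ([0] vs [1]), but that is allowed even with
# join='exact': 't' is the concat dimension, so it is excluded from
# the alignment check.
xr.concat([a, b], dim='t', join='exact')  # works

# Perturbing the *other* index is what join='exact' rejects.
b_bad = b.assign_coords(x=[10, 25])
# xr.concat([a, b_bad], dim='t', join='exact')  # ValueError
```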
35 changes: 24 additions & 11 deletions xarray/tests/test_backends.py
@@ -2358,8 +2358,12 @@ class TestOpenMFDatasetWithDataVarsAndCoordsKw:
var_name = 'v1'

@contextlib.contextmanager
def setup_files_and_datasets(self):
def setup_files_and_datasets(self, fuzz=0):
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# to test join='exact'
ds1['x'] = ds1.x + fuzz

with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:

@@ -2396,20 +2400,29 @@ def gen_datasets_with_common_coord_and_time(self):

return ds1, ds2

@pytest.mark.parametrize('combine', ['nested', 'by_coords'])
@pytest.mark.parametrize('opt', ['all', 'minimal', 'different'])
def test_open_mfdataset_does_same_as_concat(self, opt):
@pytest.mark.parametrize('join', ['outer', 'inner', 'left', 'right'])
def test_open_mfdataset_does_same_as_concat(self, combine, opt, join):
with self.setup_files_and_datasets() as (files, [ds1, ds2]):
with open_mfdataset(files, data_vars=opt,
combine='nested', concat_dim='t') as ds:
kwargs = dict(data_vars=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
assert_identical(ds, ds_expect)
with open_mfdataset(files, coords=opt,
combine='nested', concat_dim='t') as ds:
kwargs = dict(coords=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
if combine == 'by_coords':
files.reverse()
with open_mfdataset(files, data_vars=opt, combine=combine,
concat_dim='t', join=join) as ds:
ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t',
join=join)
assert_identical(ds, ds_expect)

@pytest.mark.parametrize('combine', ['nested', 'by_coords'])
@pytest.mark.parametrize('opt', ['all', 'minimal', 'different'])
def test_open_mfdataset_exact_join_raises_error(self, combine, opt):
with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]):
if combine == 'by_coords':
files.reverse()
with raises_regex(ValueError, 'indexes along dimension'):
open_mfdataset(files, data_vars=opt, combine=combine,
concat_dim='t', join='exact')

def test_common_coord_when_datavars_all(self):
opt = 'all'

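The `fuzz` hook above offsets one file's `x` coordinate by 0.1, which is precisely the mismatch `join='exact'` exists to reject. The mechanism in isolation (a sketch using `align` directly, since that is where `join` ultimately lands):

```python
import xarray as xr

ds1 = xr.Dataset(coords={'x': [0.0, 1.0]})
ds2 = xr.Dataset(coords={'x': [0.1, 1.1]})  # the "fuzzed" copy

# Non-exact joins quietly reconcile the mismatch...
aligned, _ = xr.align(ds1, ds2, join='outer')
print(aligned.x.values)  # [0.  0.1 1.  1.1]

# ...whereas join='exact' raises the ValueError that
# test_open_mfdataset_exact_join_raises_error asserts on.
try:
    xr.align(ds1, ds2, join='exact')
except ValueError as err:
    print(err)  # e.g. "indexes along dimension 'x' are not equal"
```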