
mfdataset, concat now support the 'join' kwarg. #3102

Merged · 13 commits · Aug 7, 2019
3 changes: 3 additions & 0 deletions doc/whats-new.rst
@@ -30,6 +30,9 @@ New functions/methods
Enhancements
~~~~~~~~~~~~

- :py:func:`~xarray.concat` and :py:func:`~xarray.open_mfdataset` now support the ``join`` kwarg.
It is passed down to :py:func:`~xarray.align`. By `Deepak Cherian <https://github.com/dcherian>`_.

Bug fixes
~~~~~~~~~

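As an editorial illustration of the entry above (not part of the PR), here is a minimal sketch of what `join` does when `concat` has to align objects whose non-concatenated indexes differ:

```python
import xarray as xr

# Two arrays sharing dimension 'x' with partially overlapping labels.
a = xr.DataArray([1, 2, 3], dims='x', coords={'x': [0, 1, 2]})
b = xr.DataArray([4, 5, 6], dims='x', coords={'x': [1, 2, 3]})

# Default join='outer': union of the 'x' indexes, NaN where missing.
outer = xr.concat([a, b], dim='time')
print(outer.x.values)  # [0 1 2 3]

# join='inner': only labels present in every object survive.
inner = xr.concat([a, b], dim='time', join='inner')
print(inner.x.values)  # [1 2]

# join='exact': differing indexes are an error rather than realigned.
try:
    xr.concat([a, b], dim='time', join='exact')
except ValueError as err:
    print(err)  # e.g. "indexes along dimension 'x' are not equal"
```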
12 changes: 8 additions & 4 deletions xarray/backends/api.py
Expand Up @@ -609,7 +609,7 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
compat='no_conflicts', preprocess=None, engine=None,
lock=None, data_vars='all', coords='different',
combine='_old_auto', autoclose=None, parallel=False,
**kwargs):
join='outer', **kwargs):
"""Open multiple files as a single dataset.

If combine='by_coords' then the function ``combine_by_coords`` is used to
@@ -704,6 +704,8 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',
parallel : bool, optional
If True, the open and preprocess steps of this function will be
performed in parallel using ``dask.delayed``. Default is False.
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.
**kwargs : optional
Additional arguments passed on to :py:func:`xarray.open_dataset`.

@@ -798,18 +800,20 @@ def open_mfdataset(paths, chunks=None, concat_dim='_not_supplied',

combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, from_openmfds=True)
coords=coords, join=join,
from_openmfds=True)
elif combine == 'nested':
# Combined nested list by successive concat and merge operations
# along each dimension, using structure given by "ids"
combined = _nested_combine(datasets, concat_dims=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, ids=ids)
coords=coords, ids=ids, join=join)
elif combine == 'by_coords':
# Redo ordering from coordinates, ignoring how they were ordered
# previously
combined = combine_by_coords(datasets, compat=compat,
data_vars=data_vars, coords=coords)
data_vars=data_vars, coords=coords,
join=join)
else:
raise ValueError("{} is an invalid option for the keyword argument"
" ``combine``".format(combine))
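The change to `api.py` is pure plumbing: `join` is forwarded to whichever combine path runs. A hypothetical usage sketch (the file names are placeholders, not from the PR):

```python
import xarray as xr

# combine='nested': concatenate along 't'; indexes on the remaining
# dimensions are reconciled according to `join`.
ds = xr.open_mfdataset(['part1.nc', 'part2.nc'], combine='nested',
                       concat_dim='t', join='inner')

# combine='by_coords': ordering is inferred from coordinate values;
# join='exact' turns any silent realignment of the non-concatenated
# indexes into a ValueError instead.
ds = xr.open_mfdataset(['part1.nc', 'part2.nc'], combine='by_coords',
                       join='exact')
```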
52 changes: 32 additions & 20 deletions xarray/core/combine.py
Expand Up @@ -136,7 +136,7 @@ def _check_shape_tile_ids(combined_tile_ids):

def _combine_nd(combined_ids, concat_dims, data_vars='all',
coords='different', compat='no_conflicts',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Combines an N-dimensional structure of datasets into one by applying a
series of either concat and merge operations along each dimension.
@@ -177,13 +177,14 @@ def _combine_nd(combined_ids, concat_dims, data_vars='all',
data_vars=data_vars,
coords=coords,
compat=compat,
fill_value=fill_value)
fill_value=fill_value,
join=join)
(combined_ds,) = combined_ids.values()
return combined_ds


def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

# Group into lines of datasets which must be combined along dim
# need to sort by _new_tile_id first for groupby to work
@@ -197,12 +198,13 @@ def _combine_all_along_first_dim(combined_ids, dim, data_vars, coords, compat,
combined_ids = OrderedDict(sorted(group))
datasets = combined_ids.values()
new_combined_ids[new_id] = _combine_1d(datasets, dim, compat,
data_vars, coords, fill_value)
data_vars, coords, fill_value,
join)
return new_combined_ids


def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Applies either concat or merge to 1D list of datasets depending on value
of concat_dim
@@ -211,7 +213,7 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
if concat_dim is not None:
try:
combined = concat(datasets, dim=concat_dim, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value, join=join)
except ValueError as err:
if "encountered unexpected variable" in str(err):
raise ValueError("These objects cannot be combined using only "
@@ -222,7 +224,8 @@ def _combine_1d(datasets, concat_dim, compat='no_conflicts', data_vars='all',
else:
raise
else:
combined = merge(datasets, compat=compat, fill_value=fill_value)
combined = merge(datasets, compat=compat, fill_value=fill_value,
join=join)

return combined

@@ -233,7 +236,7 @@ def _new_tile_id(single_id_ds_pair):


def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):

if len(datasets) == 0:
return Dataset()
@@ -254,12 +257,13 @@ def _nested_combine(datasets, concat_dims, compat, data_vars, coords, ids,
# Apply series of concatenate or merge operations along each dimension
combined = _combine_nd(combined_ids, concat_dims, compat=compat,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
return combined


def combine_nested(datasets, concat_dim, compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA):
data_vars='all', coords='different', fill_value=dtypes.NA,
join='outer'):
"""
Explicitly combine an N-dimensional grid of datasets into one by using a
succession of concat and merge operations along each dimension of the grid.
@@ -312,6 +316,8 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -383,15 +389,15 @@ def combine_nested(datasets, concat_dim, compat='no_conflicts',
# The IDs argument tells _manual_combine that datasets aren't yet sorted
return _nested_combine(datasets, concat_dims=concat_dim, compat=compat,
data_vars=data_vars, coords=coords, ids=False,
fill_value=fill_value)
fill_value=fill_value, join=join)


def vars_as_keys(ds):
return tuple(sorted(ds))


def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
coords='different', fill_value=dtypes.NA):
coords='different', fill_value=dtypes.NA, join='outer'):
"""
Attempt to auto-magically combine the given datasets into one by using
dimension coordinates.
@@ -439,6 +445,8 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -498,7 +506,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
# Concatenate along all of concat_dims one by one to create single ds
concatenated = _combine_nd(combined_ids, concat_dims=concat_dims,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)

# Check the overall coordinates are monotonically increasing
for dim in concatenated.dims:
@@ -512,7 +520,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',
concatenated_grouped_by_data_vars.append(concatenated)

return merge(concatenated_grouped_by_data_vars, compat=compat,
fill_value=fill_value)
fill_value=fill_value, join=join)


# Everything beyond here is only needed until the deprecation cycle in #2616
@@ -524,7 +532,7 @@ def combine_by_coords(datasets, compat='no_conflicts', data_vars='all',

def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
data_vars='all', coords='different', fill_value=dtypes.NA,
from_openmfds=False):
join='outer', from_openmfds=False):
"""
Attempt to auto-magically combine the given datasets into one.

@@ -572,6 +580,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',
Details are in the documentation of concat
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes.

Returns
-------
@@ -630,7 +640,8 @@ def auto_combine(datasets, concat_dim='_not_supplied', compat='no_conflicts',

return _old_auto_combine(datasets, concat_dim=concat_dim,
compat=compat, data_vars=data_vars,
coords=coords, fill_value=fill_value)
coords=coords, fill_value=fill_value,
join=join)


def _dimension_coords_exist(datasets):
@@ -671,7 +682,7 @@ def _requires_concat_and_merge(datasets):
def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts',
data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if concat_dim is not None:
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim

@@ -680,16 +691,17 @@ def _old_auto_combine(datasets, concat_dim=_CONCAT_DIM_DEFAULT,

concatenated = [_auto_concat(list(datasets), dim=dim,
data_vars=data_vars, coords=coords,
fill_value=fill_value)
fill_value=fill_value, join=join)
for vars, datasets in grouped]
else:
concatenated = datasets
merged = merge(concatenated, compat=compat, fill_value=fill_value)
merged = merge(concatenated, compat=compat, fill_value=fill_value,
join=join)
return merged


def _auto_concat(datasets, dim=None, data_vars='all', coords='different',
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
if len(datasets) == 1 and dim is None:
# There is nothing more to combine, so kick out early.
return datasets[0]
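With the plumbing above, the public combine functions accept `join` as well. A small sketch (editorial, assuming the merged API) using `combine_nested`:

```python
import xarray as xr

ds1 = xr.Dataset({'v': ('x', [1, 2])}, coords={'x': [0, 1]})
ds2 = xr.Dataset({'v': ('x', [3, 4])}, coords={'x': [1, 2]})

# Stack along a new 't' dimension. With join='inner' the 'x' index
# shrinks to the single shared label, instead of the default outer
# union [0, 1, 2] padded with NaN.
combined = xr.combine_nested([ds1, ds2], concat_dim='t', join='inner')
print(combined.x.values)  # [1]
```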
17 changes: 10 additions & 7 deletions xarray/core/concat.py
@@ -11,7 +11,7 @@

def concat(objs, dim=None, data_vars='all', coords='different',
compat='equals', positions=None, indexers=None, mode=None,
concat_over=None, fill_value=dtypes.NA):
concat_over=None, fill_value=dtypes.NA, join='outer'):
"""Concatenate xarray objects along a new or existing dimension.

Parameters
@@ -65,6 +65,9 @@ def concat(objs, dim=None, data_vars='all', coords='different',
supplied, objects are concatenated in the provided order.
fill_value : scalar, optional
Value to use for newly missing values
join : {'outer', 'inner', 'left', 'right', 'exact'}, optional
How to combine objects with different indexes
(excluding index along 'dim').
indexers, mode, concat_over : deprecated

Returns
@@ -76,7 +79,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
merge
auto_combine
"""
# TODO: add join and ignore_index arguments copied from pandas.concat
# TODO: add ignore_index arguments copied from pandas.concat
# TODO: support concatenating scalar coordinates even if the concatenated
# dimension already exists
from .dataset import Dataset
@@ -116,7 +119,7 @@ def concat(objs, dim=None, data_vars='all', coords='different',
else:
raise TypeError('can only concatenate xarray Dataset and DataArray '
'objects, got %s' % type(first_obj))
return f(objs, dim, data_vars, coords, compat, positions, fill_value)
return f(objs, dim, data_vars, coords, compat, positions, fill_value, join)


def _calc_concat_dim_coord(dim):
@@ -212,7 +215,7 @@ def process_subset_opt(opt, subset):


def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
fill_value=dtypes.NA):
fill_value=dtypes.NA, join='outer'):
"""
Concatenate a sequence of datasets along a new or existing dimension
"""
@@ -225,7 +228,7 @@ def _dataset_concat(datasets, dim, data_vars, coords, compat, positions,
dim, coord = _calc_concat_dim_coord(dim)
# Make sure we're working on a copy (we'll be loading variables)
datasets = [ds.copy() for ds in datasets]
datasets = align(*datasets, join='outer', copy=False, exclude=[dim],
datasets = align(*datasets, join=join, copy=False, exclude=[dim],
fill_value=fill_value)

concat_over, equals = _calc_concat_over(datasets, dim, data_vars, coords)
@@ -318,7 +321,7 @@ def ensure_common_dims(vars):


def _dataarray_concat(arrays, dim, data_vars, coords, compat,
positions, fill_value=dtypes.NA):
positions, fill_value=dtypes.NA, join='outer'):
arrays = list(arrays)

if data_vars != 'all':
@@ -337,5 +340,5 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
datasets.append(arr._to_temp_dataset())

ds = _dataset_concat(datasets, dim, data_vars, coords, compat,
positions, fill_value=fill_value)
positions, fill_value=fill_value, join=join)
return arrays[0]._from_temp_dataset(ds, name)
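Note the parenthetical added to the docstring: because `_dataset_concat` calls `align` with `exclude=[dim]`, the concatenation dimension itself is never constrained by `join`. A sketch of the consequence:

```python
import xarray as xr

a = xr.Dataset({'v': (('t', 'x'), [[1.0, 2.0]])},
               coords={'t': [0], 'x': [10, 20]})
b = xr.Dataset({'v': (('t', 'x'), [[3.0, 4.0]])},
               coords={'t': [1], 'x': [10, 20]})

# The 't' indexes differ ([0] vs [1]), but that is allowed even with
# join='exact': 't' is the concat dimension, so it is excluded from
# the alignment check.
xr.concat([a, b], dim='t', join='exact')  # works

# Perturbing the *other* index is what join='exact' rejects.
b_bad = b.assign_coords(x=[10, 25])
# xr.concat([a, b_bad], dim='t', join='exact')  # ValueError
```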
35 changes: 24 additions & 11 deletions xarray/tests/test_backends.py
@@ -2358,8 +2358,12 @@ class TestOpenMFDatasetWithDataVarsAndCoordsKw:
var_name = 'v1'

@contextlib.contextmanager
def setup_files_and_datasets(self):
def setup_files_and_datasets(self, fuzz=0):
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()

# to test join='exact'
ds1['x'] = ds1.x + fuzz

with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:

@@ -2396,20 +2400,29 @@ def gen_datasets_with_common_coord_and_time(self):

return ds1, ds2

@pytest.mark.parametrize('combine', ['nested', 'by_coords'])
@pytest.mark.parametrize('opt', ['all', 'minimal', 'different'])
def test_open_mfdataset_does_same_as_concat(self, opt):
@pytest.mark.parametrize('join', ['outer', 'inner', 'left', 'right'])
def test_open_mfdataset_does_same_as_concat(self, combine, opt, join):
with self.setup_files_and_datasets() as (files, [ds1, ds2]):
with open_mfdataset(files, data_vars=opt,
combine='nested', concat_dim='t') as ds:
kwargs = dict(data_vars=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
assert_identical(ds, ds_expect)
with open_mfdataset(files, coords=opt,
combine='nested', concat_dim='t') as ds:
kwargs = dict(coords=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
if combine == 'by_coords':
files.reverse()
with open_mfdataset(files, data_vars=opt, combine=combine,
concat_dim='t', join=join) as ds:
ds_expect = xr.concat([ds1, ds2], data_vars=opt, dim='t',
join=join)
assert_identical(ds, ds_expect)

@pytest.mark.parametrize('combine', ['nested', 'by_coords'])
@pytest.mark.parametrize('opt', ['all', 'minimal', 'different'])
def test_open_mfdataset_exact_join_raises_error(self, combine, opt):
with self.setup_files_and_datasets(fuzz=0.1) as (files, [ds1, ds2]):
if combine == 'by_coords':
files.reverse()
with raises_regex(ValueError, 'indexes along dimension'):
open_mfdataset(files, data_vars=opt, combine=combine,
concat_dim='t', join='exact')

def test_common_coord_when_datavars_all(self):
opt = 'all'

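The `fuzz` hook above offsets one file's `x` coordinate by 0.1, which is precisely the mismatch `join='exact'` exists to reject. The mechanism in isolation (a sketch using `align` directly, since that is where `join` ultimately lands):

```python
import xarray as xr

ds1 = xr.Dataset(coords={'x': [0.0, 1.0]})
ds2 = xr.Dataset(coords={'x': [0.1, 1.1]})  # the "fuzzed" copy

# Non-exact joins quietly reconcile the mismatch...
aligned, _ = xr.align(ds1, ds2, join='outer')
print(aligned.x.values)  # [0.  0.1 1.  1.1]

# ...whereas join='exact' raises the ValueError that
# test_open_mfdataset_exact_join_raises_error asserts on.
try:
    xr.align(ds1, ds2, join='exact')
except ValueError as err:
    print(err)  # e.g. "indexes along dimension 'x' are not equal"
```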