data_vars option added to open_mfdataset (#1580)
* add data_vars option to open_mfdataset

* use single quotes

* fix the 'line too long' warning from flake8

* document the data_vars keyword for open_mfdataset

* improve the data_vars record in whats-new

* update my name in whats-new.rst

* Start writing the test for the data_vars keyword

* use the data_vars keyword in combine

* address flake8 warnings for test_backend.py

* ignore flake8 warnings concerning whats-new.rst

* fix function reference in whats-new.rst

* open_mfdataset does not accept dim keyword argument

* use single quotes for strings in the added tests

* refactor data_vars related tests

* Use with for opening mfdataset in data_vars related tests

* add @requires_scipy_or_netCDF4 to the data_vars test class

* address flake8 warnings about long lines in the data_vars related tests.

* close opened datasets in case of a ValueError in open_mfdataset, seems important for Windows

* fix line too long warnings from flake8

* refactor tests and open_mfdataset, to address comments

* refactor tests for data_vars keyword in open_mfdataset

* refactor to address flake8 warnings

* add another example of data_vars usage in open_mfdataset

* add coords keyword to open_mfdataset

* add memory- and performance-related observations to whats-new and modify code snippets to use single quotes for consistency

* fixed a grammar mistake

* quote variable names referenced in the text

* add tests for the coords keyword in open_mfdataset, along with similar tests for the data_vars keyword

* split a test into two to simplify; introduce a context manager for setting up test inputs in OpenMFDatasetWithDataVarsAndCoordsKwTest
guziy authored and Joe Hamman committed Oct 10, 2017
1 parent 57ccf42 commit 27132fb
Showing 4 changed files with 204 additions and 10 deletions.
28 changes: 28 additions & 0 deletions doc/whats-new.rst
@@ -74,6 +74,34 @@ Backward Incompatible Changes

Enhancements
~~~~~~~~~~~~
- Support for ``data_vars`` and ``coords`` keywords added to
:py:func:`~xarray.open_mfdataset`
(:issue:`438`):

.. ipython::
   :verbatim:
# allows opening multiple files as
ds = xarray.open_mfdataset(paths, chunks={'time': 100}, data_vars='minimal')
# instead of
ds = xarray.concat([xarray.open_dataset(p, chunks={'time': 100}) for p in paths],
                   data_vars='minimal', dim='time')
# when the files contain the same coordinate variables that should not be
# concatenated (e.g. lon, lat)

# with data_vars='minimal', the time dimension is not added to spatial coordinates
In [1]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='all')

In [2]: ds['lon'].shape

Out[2]: (13505, 808, 782)

In [3]: ds = xarray.open_mfdataset('daymet_v3_tmin_*', data_vars='minimal')

In [4]: ds['lon'].shape

Out[4]: (808, 782)

# Note: memory-intensive applications use much less memory and run faster when data_vars='minimal' is used.

By `Oleksandr Huziy <https://github.com/guziy>`_.

- Support for `pathlib.Path` objects added to
:py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_mfdataset`,
47 changes: 41 additions & 6 deletions xarray/backends/api.py
@@ -431,7 +431,7 @@ def close(self):

def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts', preprocess=None, engine=None,
lock=None, **kwargs):
lock=None, data_vars='all', coords='different', **kwargs):
"""Open multiple files as a single dataset.
Requires dask to be installed. Attributes from the first dataset file
@@ -487,6 +487,32 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
default, a per-variable lock is used when reading data from netCDF
files with the netcdf4 and h5netcdf engines to avoid issues with
concurrent access when using dask's multithreaded backend.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
These data variables will be concatenated together:
* 'minimal': Only data variables in which the dimension already
appears are included.
* 'different': Data variables which are not equal (ignoring
attributes) across all datasets are also concatenated (as well as
all for which dimension already appears). Beware: this option may
load the data payload of data variables into memory if they are not
already loaded.
* 'all': All data variables will be concatenated.
* list of str: The listed data variables will be concatenated, in
addition to the 'minimal' data variables.
coords : {'minimal', 'different', 'all' or list of str}, optional
These coordinate variables will be concatenated together:
* 'minimal': Only coordinates in which the dimension already appears
are included.
* 'different': Coordinates which are not equal (ignoring attributes)
across all datasets are also concatenated (as well as all for which
dimension already appears). Beware: this option may load the data
payload of coordinate variables into memory if they are not already
loaded.
* 'all': All coordinate variables will be concatenated, except
those corresponding to other dimensions.
* list of str: The listed coordinate variables will be concatenated,
in addition to the 'minimal' coordinates.
**kwargs : optional
Additional arguments passed on to :py:func:`xarray.open_dataset`.
@@ -516,13 +542,22 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
if preprocess is not None:
datasets = [preprocess(ds) for ds in datasets]

if concat_dim is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat)
else:
combined = auto_combine(datasets, concat_dim=concat_dim, compat=compat)
# close datasets in case of a ValueError
try:
if concat_dim is _CONCAT_DIM_DEFAULT:
combined = auto_combine(datasets, compat=compat,
data_vars=data_vars, coords=coords)
else:
combined = auto_combine(datasets, concat_dim=concat_dim,
compat=compat,
data_vars=data_vars, coords=coords)
except ValueError:
for ds in datasets:
ds.close()
raise

combined._file_obj = _MultiFileCloser(file_objs)
combined.attrs = datasets[0].attrs

return combined


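A minimal usage sketch of the new call signature above (file names are hypothetical; assumes dask and a netCDF backend are installed, and that the files share static coordinate variables such as 'lon'):

import xarray as xr

# hypothetical yearly files sharing static spatial coordinates
paths = ['tmin_1980.nc', 'tmin_1981.nc', 'tmin_1982.nc']

# the new keywords are forwarded to auto_combine and, from there, to concat
ds = xr.open_mfdataset(paths, concat_dim='time',
                       data_vars='minimal', coords='different')

# roughly equivalent to concatenating individually opened datasets
ds_expected = xr.concat([xr.open_dataset(p) for p in paths],
                        dim='time', data_vars='minimal', coords='different')
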
15 changes: 11 additions & 4 deletions xarray/core/combine.py
@@ -340,7 +340,7 @@ def _dataarray_concat(arrays, dim, data_vars, coords, compat,
return arrays[0]._from_temp_dataset(ds, name)


def _auto_concat(datasets, dim=None):
def _auto_concat(datasets, dim=None, data_vars='all', coords='different'):
if len(datasets) == 1:
return datasets[0]
else:
@@ -362,15 +362,16 @@ def _auto_concat(datasets, dim=None):
'supply the ``concat_dim`` argument '
'explicitly')
dim, = concat_dims
return concat(datasets, dim=dim)
return concat(datasets, dim=dim, data_vars=data_vars, coords=coords)


_CONCAT_DIM_DEFAULT = '__infer_concat_dim__'


def auto_combine(datasets,
concat_dim=_CONCAT_DIM_DEFAULT,
compat='no_conflicts'):
compat='no_conflicts',
data_vars='all', coords='different'):
"""Attempt to auto-magically combine the given datasets into one.
This method attempts to combine a list of datasets into a single entity by
@@ -411,6 +412,10 @@ def auto_combine(datasets,
- 'no_conflicts': only values which are not null in both datasets
must be equal. The returned dataset then contains the combination
of all non-null values.
data_vars : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
coords : {'minimal', 'different', 'all' or list of str}, optional
Details are in the documentation of concat
Returns
-------
@@ -426,7 +431,9 @@
dim = None if concat_dim is _CONCAT_DIM_DEFAULT else concat_dim
grouped = itertoolz.groupby(lambda ds: tuple(sorted(ds.data_vars)),
datasets).values()
concatenated = [_auto_concat(ds, dim=dim) for ds in grouped]
concatenated = [_auto_concat(ds, dim=dim,
data_vars=data_vars, coords=coords)
for ds in grouped]
else:
concatenated = datasets
merged = merge(concatenated, compat=compat)
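The same keywords can also be exercised directly through auto_combine (the public combine entry point at the time of this change). A small self-contained sketch, with invented variable names that mirror the tests below, showing that a shared data variable such as 'lon' is not broadcast along the concatenation dimension when data_vars='minimal':

import numpy as np
import xarray as xr

x = np.arange(10)
# two datasets sharing the static variable 'lon' and differing along 't'
ds1 = xr.Dataset({'v1': (('t', 'x'), np.random.randn(5, 10)), 'lon': ('x', 2 * x)},
                 coords={'t': np.arange(5), 'x': x})
ds2 = xr.Dataset({'v1': (('t', 'x'), np.random.randn(5, 10)), 'lon': ('x', 2 * x)},
                 coords={'t': np.arange(5, 10), 'x': x})

# with data_vars='all' (the default), 'lon' gains a 't' dimension: shape (10, 10);
# with data_vars='minimal', it keeps its original shape: (10,)
combined = xr.auto_combine([ds1, ds2], concat_dim='t', data_vars='minimal')
print(combined['lon'].shape)
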
124 changes: 124 additions & 0 deletions xarray/tests/test_backends.py
@@ -1268,6 +1268,130 @@ def test_4_open_large_num_files_h5netcdf(self):
self.validate_open_mfdataset_large_num_files(engine=['h5netcdf'])


@requires_scipy_or_netCDF4
class OpenMFDatasetWithDataVarsAndCoordsKwTest(TestCase):
coord_name = 'lon'
var_name = 'v1'

@contextlib.contextmanager
def setup_files_and_datasets(self):
ds1, ds2 = self.gen_datasets_with_common_coord_and_time()
with create_tmp_file() as tmpfile1:
with create_tmp_file() as tmpfile2:

# save data to the temporary files
ds1.to_netcdf(tmpfile1)
ds2.to_netcdf(tmpfile2)

yield [tmpfile1, tmpfile2], [ds1, ds2]

def gen_datasets_with_common_coord_and_time(self):
# create coordinate data
nx = 10
nt = 10
x = np.arange(nx)
t1 = np.arange(nt)
t2 = np.arange(nt, 2 * nt, 1)

v1 = np.random.randn(nt, nx)
v2 = np.random.randn(nt, nx)

ds1 = Dataset(data_vars={self.var_name: (['t', 'x'], v1),
self.coord_name: ('x', 2 * x)},
coords={
't': (['t', ], t1),
'x': (['x', ], x)
})

ds2 = Dataset(data_vars={self.var_name: (['t', 'x'], v2),
self.coord_name: ('x', 2 * x)},
coords={
't': (['t', ], t2),
'x': (['x', ], x)
})

return ds1, ds2

def test_open_mfdataset_does_same_as_concat(self):
options = ['all', 'minimal', 'different', ]

with self.setup_files_and_datasets() as (files, [ds1, ds2]):
for opt in options:
with open_mfdataset(files, data_vars=opt) as ds:
kwargs = dict(data_vars=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
self.assertDatasetIdentical(ds, ds_expect)

with open_mfdataset(files, coords=opt) as ds:
kwargs = dict(coords=opt, dim='t')
ds_expect = xr.concat([ds1, ds2], **kwargs)
self.assertDatasetIdentical(ds, ds_expect)

def test_common_coord_when_datavars_all(self):
opt = 'all'

with self.setup_files_and_datasets() as (files, [ds1, ds2]):
# open the files with the data_var option
with open_mfdataset(files, data_vars=opt) as ds:

coord_shape = ds[self.coord_name].shape
coord_shape1 = ds1[self.coord_name].shape
coord_shape2 = ds2[self.coord_name].shape

var_shape = ds[self.var_name].shape

# shape pairs to be compared
shape_pairs = [
(var_shape, coord_shape),
(coord_shape1, coord_shape),
(coord_shape2, coord_shape)
]
# tests to be applied to respective pairs
tests = [self.assertEqual,
self.assertNotEqual, self.assertNotEqual]

for a_test, a_shape_pair in zip(tests, shape_pairs):
a_test(*a_shape_pair)

def test_common_coord_when_datavars_minimal(self):
opt = 'minimal'

with self.setup_files_and_datasets() as (files, [ds1, ds2]):
# open the files using data_vars option
with open_mfdataset(files, data_vars=opt) as ds:

coord_shape = ds[self.coord_name].shape
coord_shape1 = ds1[self.coord_name].shape
coord_shape2 = ds2[self.coord_name].shape

var_shape = ds[self.var_name].shape

# shape pairs to be compared
shape_pairs = [
(var_shape, coord_shape),
(coord_shape1, coord_shape),
(coord_shape2, coord_shape)
]
# tests to be applied to respective pairs
tests = [self.assertNotEqual,
self.assertEqual, self.assertEqual]

for a_test, a_shape_pair in zip(tests, shape_pairs):
a_test(*a_shape_pair)

def test_invalid_data_vars_value_should_fail(self):

with self.setup_files_and_datasets() as (files, _):
with self.assertRaises(ValueError):
with open_mfdataset(files, data_vars='minimum'):
pass

# test an invalid coords parameter
with self.assertRaises(ValueError):
with open_mfdataset(files, coords='minimum'):
pass


@requires_dask
@requires_scipy
@requires_netCDF4
