diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 924b450daf5..4642fe17cc1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -21,6 +21,20 @@ v0.9.0 (unreleased) Breaking changes ~~~~~~~~~~~~~~~~ +- Index coordinates for each dimension are now optional, and no longer created + by default. This has a number of implications: + + - :py:func:`~align` and :py:meth:`~Dataset.reindex` can now raise an error if + dimension labels are missing and dimensions have different sizes. + - Because pandas does not support missing indexes, methods such as + ``to_dataframe``/``from_dataframe`` and ``stack``/``unstack`` no longer + roundtrip faithfully on all inputs. Use :py:meth:`~Dataset.reset_index` to + remove undesired indexes. + - ``Dataset.__delitem__`` no longer deletes all variables matching + dimension names. + - ``DataArray.coords.__delitem__`` is now allowed on variables matching + dimension names. + - The default behavior of ``merge`` is now ``compat='no_conflicts'``, so some merges will now succeed in cases that previously raised ``xarray.MergeError``. Set ``compat='broadcast_equals'`` to restore the diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 5bcd7efa319..3338ff693d7 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -197,11 +197,11 @@ def reindex_variables(variables, dims, indexes, indexers, method=None, """ # build up indexers for assignment along each dimension to_indexers = {} - to_shape = {} from_indexers = {} + # size of reindexed dimensions + new_sizes = {} for name, index in iteritems(indexes): - to_shape[name] = index.size if name in indexers: target = utils.safe_cast_to_index(indexers[name]) if not index.is_unique: @@ -210,7 +210,7 @@ 'index has duplicate values' % name) indexer = get_indexer(index, target, method, tolerance) - to_shape[name] = len(target) + new_sizes[name] = len(target) # Note pandas uses negative values from get_indexer to signify # values that are missing in the index # The non-negative values thus indicate the non-missing values @@ -246,12 +246,17 @@ def var_indexers(var, indexers): # create variables for the new dataset reindexed = OrderedDict() - for name, var in iteritems(variables): - if name in indexers: - # no need to copy, because index data is immutable - new_var = IndexVariable(var.dims, indexers[name], var.attrs, - var.encoding) + + for dim, indexer in indexers.items(): + if dim in variables: + var = variables[dim] + args = (var.attrs, var.encoding) else: + args = () + reindexed[dim] = IndexVariable((dim,), indexers[dim], *args) + + for name, var in iteritems(variables): + if name not in indexers: assign_to = var_indexers(var, to_indexers) assign_from = var_indexers(var, from_indexers) @@ -261,7 +266,8 @@ def var_indexers(var, indexers): dtype, fill_value = _maybe_promote(var.dtype) if isinstance(data, np.ndarray): - shape = tuple(to_shape[dim] for dim in var.dims) + shape = tuple(new_sizes.get(dim, size) + for dim, size in zip(var.dims, var.shape)) new_data = np.empty(shape, dtype=dtype) new_data[...]
= fill_value # create a new Variable so we can use orthogonal indexing @@ -291,7 +297,7 @@ def var_indexers(var, indexers): # we neither created a new ndarray nor used fancy indexing new_var = var.copy(deep=copy) - reindexed[name] = new_var + reindexed[name] = new_var return reindexed diff --git a/xarray/core/coordinates.py b/xarray/core/coordinates.py index 3d30523271a..bba4e681d50 100644 --- a/xarray/core/coordinates.py +++ b/xarray/core/coordinates.py @@ -51,7 +51,7 @@ def to_index(self, ordered_dims=None): """ if ordered_dims is None: ordered_dims = self.dims - indexes = [self.variables[k].to_index() for k in ordered_dims] + indexes = [self.indexes.get(k) for k in ordered_dims] return pd.MultiIndex.from_product(indexes, names=list(ordered_dims)) def update(self, other): @@ -213,9 +213,6 @@ def to_dataset(self): return self._to_dataset() def __delitem__(self, key): - if key in self.dims: - raise ValueError('cannot delete a coordinate corresponding to a ' - 'DataArray dimension') del self._data._coords[key] @@ -244,11 +241,11 @@ def __init__(self, variables, dims): Arguments --------- - variables : OrderedDict + variables : OrderedDict[Any, Variable] Reference to OrderedDict holding variable objects. Should be the same dictionary used by the source object. - dims : sequence or mapping - Should be the same dimensions used by the source object. + dims : OrderedDict[Any, int] + Map from dimension names to sizes. """ self._variables = variables self._dims = dims @@ -265,10 +262,20 @@ def __contains__(self, key): return key in self._dims and key in self._variables def __getitem__(self, key): - if key in self: - return self._variables[key].to_index() - else: + if key not in self._dims: raise KeyError(key) + return self._variables[key].to_index() def __unicode__(self): return formatting.indexes_repr(self) + + def get(self, key): + """Get an index for a dimension, supplying default RangeIndex if needed. + """ + if key not in self._dims: + raise KeyError(key) + + if key in self._variables: + return self._variables[key].to_index() + else: + return pd.Index(range(self._dims[key]), name=key) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 025a68ea6ba..2a7a7a43a5f 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -17,7 +17,7 @@ from .coordinates import (DataArrayCoordinates, LevelCoordinatesSource, Indexes) from .dataset import Dataset -from .pycompat import iteritems, basestring, OrderedDict, zip +from .pycompat import iteritems, basestring, OrderedDict, zip, range from .variable import (as_variable, Variable, as_compatible_data, IndexVariable, default_index_coordinate, assert_unique_multiindex_level_names) @@ -505,7 +505,7 @@ def encoding(self, value): def indexes(self): """OrderedDict of pandas.Index objects used for label based indexing """ - return Indexes(self._coords, self.dims) + return Indexes(self._coords, OrderedDict(zip(self.dims, self.shape))) @property def coords(self): @@ -1066,7 +1066,8 @@ def to_pandas(self): except KeyError: raise ValueError('cannot convert arrays with %s dimensions into ' 'pandas objects' % self.ndim) - return constructor(self.values, *self.indexes.values()) + indexes = [self.indexes.get(dim) for dim in self.dims] + return constructor(self.values, *indexes) def to_dataframe(self, name=None): """Convert this array and its coordinates into a tidy pandas.DataFrame. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 1abdeefef4e..882a1fc5392 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -557,22 +557,9 @@ def __setitem__(self, key, value): def __delitem__(self, key): """Remove a variable from this dataset. - - If this variable is a dimension, all variables containing this - dimension are also removed. """ - def remove(k): - del self._variables[k] - self._coord_names.discard(k) - - remove(key) - - if key in self._dims: - del self._dims[key] - also_delete = [k for k, v in iteritems(self._variables) - if key in v.dims] - for key in also_delete: - remove(key) + del self._variables[key] + self._coord_names.discard(key) # mutable objects should not be hashable __hash__ = None @@ -1224,7 +1211,9 @@ def reindex(self, indexers=None, method=None, tolerance=None, copy=True, **kw_in variables = alignment.reindex_variables( self.variables, self.dims, self.indexes, indexers, method, tolerance, copy=copy) - return self._replace_vars_and_dims(variables) + coord_names = set(self._coord_names) + coord_names.update(indexers) + return self._replace_vars_and_dims(variables, coord_names) def rename(self, name_dict, inplace=False): """Returns a new object with renamed variables and dimensions. @@ -1250,9 +1239,9 @@ def rename(self, name_dict, inplace=False): DataArray.rename """ for k, v in name_dict.items(): - if k not in self: + if k not in self and k not in self.dims: raise ValueError("cannot rename %r because it is not a " - "variable in this dataset" % k) + "variable or dimension in this dataset" % k) if v in self and k != v: raise ValueError('the new name %r already exists' % v) @@ -1339,18 +1328,8 @@ def _stack_once(self, dims, new_dim): else: variables[name] = var.copy(deep=False) - indexes = self.indexes - dim_sizes = self.dims - - levels = [] - for dim in dims: - if dim in indexes: - level = indexes[dim] - else: - level = np.arange(dim_sizes[dim]) - levels.append(level) - # consider dropping levels that are unused? 
+ levels = [self.indexes.get(dim) for dim in dims] idx = utils.multiindex_from_product_levels(levels, names=dims) variables[new_dim] = IndexVariable(new_dim, idx) @@ -1409,7 +1388,7 @@ def unstack(self, dim): if dim not in self.dims: raise ValueError('invalid dimension: %s' % dim) - index = self.indexes[dim] + index = self.indexes.get(dim) if not isinstance(index, pd.MultiIndex): raise ValueError('cannot unstack a dimension that does not have ' 'a MultiIndex') @@ -1551,7 +1530,12 @@ def drop(self, labels, dim=None): if dim is None: return self._drop_vars(labels) else: - new_index = self.indexes[dim].drop(labels) + try: + index = self.indexes[dim] + except KeyError: + raise ValueError( + 'dimension %r does not have coordinate labels' % dim) + new_index = index.drop(labels) return self.loc[{dim: new_index}] def _drop_vars(self, names): diff --git a/xarray/test/test_dataarray.py b/xarray/test/test_dataarray.py index decbb3e0a57..81d4b02cf85 100644 --- a/xarray/test/test_dataarray.py +++ b/xarray/test/test_dataarray.py @@ -651,11 +651,9 @@ def test_coords(self): actual = repr(da.coords) self.assertEquals(expected, actual) - with self.assertRaisesRegexp(ValueError, 'cannot delete'): - del da['x'] - - with self.assertRaisesRegexp(ValueError, 'cannot delete'): - del da.coords['x'] + del da.coords['x'] + expected = DataArray(da.values, {'y': [0, 1, 2]}, dims=['x', 'y']) + self.assertDataArrayIdentical(da, expected) with self.assertRaisesRegexp(ValueError, 'conflicting MultiIndex'): self.mda['level_1'] = np.arange(4) diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 253ed2e823b..cb70f84bfa6 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -227,7 +227,8 @@ def test_constructor_auto_align(self): # verify align uses outer join expected = Dataset({'a': ('x', [1, 2, np.nan]), - 'b': ('x', [np.nan, 3, 4])}) + 'b': ('x', [np.nan, 3, 4])}, + {'x': [0, 1, 2]}) actual = Dataset({'a': a, 'b': b}) self.assertDatasetIdentical(expected, actual) @@ -257,12 +258,14 @@ def test_constructor_pandas_sequence(self): (var_name, ds[var_name].to_pandas()) for var_name in ['foo','bar'] ) ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) + del ds_based_on_pandas['x'] self.assertDatasetEqual(ds, ds_based_on_pandas) # reindex pandas obj, check align works rearranged_index = reversed(pandas_objs['foo'].index) pandas_objs['foo'] = pandas_objs['foo'].reindex(rearranged_index) ds_based_on_pandas = Dataset(pandas_objs, ds.coords, attrs=ds.attrs) + del ds_based_on_pandas['x'] self.assertDatasetEqual(ds, ds_based_on_pandas) def test_constructor_pandas_single(self): @@ -297,14 +300,14 @@ def test_constructor_compat(self): self.assertDatasetIdentical(expected, actual) original = Dataset({'a': (('x', 'y'), np.ones((2, 3)))}, - {'c': (('x', 'y'), np.zeros((2, 3)))}) + {'c': (('x', 'y'), np.zeros((2, 3))), 'x': [0, 1]}) expected = Dataset({'a': ('x', np.ones(2)), 'b': ('y', np.ones(3))}, - {'c': (('x', 'y'), np.zeros((2, 3)))}) + {'c': (('x', 'y'), np.zeros((2, 3))), 'x': [0, 1]}) # use an OrderedDict to ensure test results are reproducible; otherwise # the order of appearance of x and y matters for the order of # dimensions in 'c' - actual = Dataset(OrderedDict([('a', original['a'][:, 0].drop('y')), + actual = Dataset(OrderedDict([('a', original['a'][:, 0]), ('b', original['a'][0].drop('x'))])) self.assertDatasetIdentical(expected, actual) @@ -323,7 +326,7 @@ def test_constructor_with_coords(self): ds = Dataset({}, {'a': ('x', [1])}) 
self.assertFalse(ds.data_vars) - self.assertItemsEqual(ds.coords.keys(), ['x', 'a']) + self.assertItemsEqual(ds.coords.keys(), ['a']) mindex = pd.MultiIndex.from_product([['a', 'b'], [1, 2]], names=('level_1', 'level_2')) @@ -347,7 +350,7 @@ def test_properties(self): self.assertItemsEqual(ds.keys(), list(ds.variables)) self.assertNotIn('aasldfjalskdfj', ds.variables) self.assertIn('dim1', repr(ds.variables)) - self.assertEqual(len(ds), 8) + self.assertEqual(len(ds), 7) self.assertItemsEqual(ds.data_vars, ['var1', 'var2', 'var3']) self.assertItemsEqual(ds.data_vars.keys(), ['var1', 'var2', 'var3']) @@ -356,16 +359,16 @@ def test_properties(self): self.assertNotIn('numbers', ds.data_vars) self.assertEqual(len(ds.data_vars), 3) - self.assertItemsEqual(ds.indexes, ['dim1', 'dim2', 'dim3', 'time']) - self.assertEqual(len(ds.indexes), 4) - self.assertIn('dim1', repr(ds.indexes)) + self.assertItemsEqual(ds.indexes, ['dim2', 'dim3', 'time']) + self.assertEqual(len(ds.indexes), 3) + self.assertIn('dim2', repr(ds.indexes)) - self.assertItemsEqual(ds.coords, - ['time', 'dim1', 'dim2', 'dim3', 'numbers']) - self.assertIn('dim1', ds.coords) + self.assertItemsEqual(ds.coords, ['time', 'dim2', 'dim3', 'numbers']) + self.assertIn('dim2', ds.coords) self.assertIn('numbers', ds.coords) self.assertNotIn('var1', ds.coords) - self.assertEqual(len(ds.coords), 5) + self.assertNotIn('dim1', ds.coords) + self.assertEqual(len(ds.coords), 4) self.assertEqual(Dataset({'x': np.int64(1), 'y': np.float32([1, 2])}).nbytes, 16) @@ -395,9 +398,8 @@ def test_variable(self): self.assertTrue('foo' in a) a['bar'] = (('time', 'x',), d) # order of creation is preserved - self.assertEqual(list(a), ['foo', 'time', 'x', 'bar']) - self.assertTrue(all([a['foo'][i].values == d[i] - for i in np.ndindex(*d.shape)])) + self.assertEqual(list(a), ['foo', 'bar']) + self.assertArrayEqual(a['foo'].values, d) # try to add variable with dim (10,3) with data that's (3,10) with self.assertRaises(ValueError): a['qux'] = (('time', 'x'), d.T) @@ -408,8 +410,7 @@ def test_modify_inplace(self): attributes = {'foo': 'bar'} a['x'] = ('x', vec, attributes) self.assertTrue('x' in a.coords) - self.assertIsInstance(a.coords['x'].to_index(), - pd.Index) + self.assertIsInstance(a.coords['x'].to_index(), pd.Index) self.assertVariableIdentical(a.coords['x'], a.variables['x']) b = Dataset() b['x'] = ('x', vec, attributes) @@ -744,20 +745,19 @@ def test_isel(self): self.assertEqual({'time': 20, 'dim2': 9, 'dim3': 10}, ret.dims) self.assertItemsEqual(data.data_vars, ret.data_vars) self.assertItemsEqual(data.coords, ret.coords) - self.assertItemsEqual(data.indexes, list(ret.indexes) + ['dim1']) + self.assertItemsEqual(data.indexes, ret.indexes) ret = data.isel(time=slice(2), dim1=0, dim2=slice(5)) self.assertEqual({'time': 2, 'dim2': 5, 'dim3': 10}, ret.dims) self.assertItemsEqual(data.data_vars, ret.data_vars) self.assertItemsEqual(data.coords, ret.coords) - self.assertItemsEqual(data.indexes, list(ret.indexes) + ['dim1']) + self.assertItemsEqual(data.indexes, ret.indexes) ret = data.isel(time=0, dim1=0, dim2=slice(5)) self.assertItemsEqual({'dim2': 5, 'dim3': 10}, ret.dims) self.assertItemsEqual(data.data_vars, ret.data_vars) self.assertItemsEqual(data.coords, ret.coords) - self.assertItemsEqual(data.indexes, - list(ret.indexes) + ['dim1', 'time']) + self.assertItemsEqual(data.indexes, list(ret.indexes) + ['time']) def test_sel(self): data = create_test_data() @@ -800,12 +800,12 @@ def test_isel_points(self): actual = data.isel_points(dim1=pdim1, dim2=pdim2, 
dim3=pdim3, dim='test_coord') - assert 'test_coord' in actual.coords + assert 'test_coord' in actual.dims assert actual.coords['test_coord'].shape == (len(pdim1), ) actual = data.isel_points(dim1=pdim1, dim2=pdim2) - assert 'points' in actual.coords - np.testing.assert_array_equal(pdim1, actual['dim1']) + assert 'points' in actual.dims + np.testing.assert_array_equal(data['dim2'][pdim2], actual['dim2']) # test that the order of the indexers doesn't matter self.assertDatasetIdentical(data.isel_points(dim1=pdim1, dim2=pdim2), @@ -845,7 +845,7 @@ def test_isel_points(self): dim=stations['station']) assert 'station' in actual.coords assert 'station' in actual.dims - self.assertDataArrayIdentical(actual['station'].drop(['dim1', 'dim2']), + self.assertDataArrayIdentical(actual['station'].drop(['dim2']), stations['station']) # make sure we get the default 'points' coordinate when a list is passed @@ -879,9 +879,7 @@ def test_sel_points(self): self.assertDatasetIdentical(expected, actual) data = Dataset({'foo': (('x', 'y'), np.arange(9).reshape(3, 3))}) - expected = Dataset({'foo': ('points', [0, 4, 8])}, - {'x': ('points', range(3)), - 'y': ('points', range(3))}) + expected = Dataset({'foo': ('points', [0, 4, 8])}) actual = data.sel_points(x=[0.1, 1.1, 2.5], y=[0, 1.2, 2.0], method='pad') self.assertDatasetIdentical(expected, actual) @@ -994,18 +992,27 @@ def test_reindex(self): data = create_test_data() self.assertDatasetIdentical(data, data.reindex()) - expected = data.isel(dim1=slice(10)) - actual = data.reindex(dim1=data['dim1'][:10]) + expected = data.assign_coords(dim1=data['dim1']) + actual = data.reindex(dim1=data['dim1']) self.assertDatasetIdentical(actual, expected) - actual = data.reindex(dim1=data['dim1'][:10].values) + actual = data.reindex(dim1=data['dim1'].values) self.assertDatasetIdentical(actual, expected) - actual = data.reindex(dim1=data['dim1'][:10].to_index()) + actual = data.reindex(dim1=data['dim1'].to_index()) + self.assertDatasetIdentical(actual, expected) + + with self.assertRaisesRegexp( + ValueError, 'cannot reindex or align along dimension'): + data.reindex(dim1=data['dim1'][:5]) + + expected = data.isel(dim2=slice(5)) + actual = data.reindex(dim2=data['dim2'][:5]) self.assertDatasetIdentical(actual, expected) # test dict-like argument - actual = data.reindex({'dim1': data['dim1'][:10]}) + actual = data.reindex({'dim2': data['dim2']}) + expected = data self.assertDatasetIdentical(actual, expected) with self.assertRaisesRegexp(ValueError, 'cannot specify both'): data.reindex({'x': 0}, x=0) @@ -1017,20 +1024,22 @@ def test_reindex(self): data.reindex(invalid=0) # out of order - expected = data.sel(dim1=data['dim1'][:10:-1]) - actual = data.reindex(dim1=data['dim1'][:10:-1]) + expected = data.sel(dim2=data['dim2'][:5:-1]) + actual = data.reindex(dim2=data['dim2'][:5:-1]) self.assertDatasetIdentical(actual, expected) # regression test for #279 - expected = Dataset({'x': ('time', np.random.randn(5))}) + expected = Dataset({'x': ('time', np.random.randn(5))}, + {'time': range(5)}) time2 = DataArray(np.arange(5), dims="time2") actual = expected.reindex(time=time2) self.assertDatasetIdentical(actual, expected) # another regression test - ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}) - expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2))), - 'x': [0, 1, 3]}) + ds = Dataset({'foo': (['x', 'y'], np.zeros((3, 4)))}, + {'x': range(3), 'y': range(4)}) + expected = Dataset({'foo': (['x', 'y'], np.zeros((3, 2)))}, + {'x': [0, 1, 3], 'y': [0, 1]}) expected['foo'][-1] = 
np.nan actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) self.assertDatasetIdentical(expected, actual) @@ -1042,7 +1051,7 @@ def test_reindex_variables_copied(self): assert reindexed_data.variables[k] is not data.variables[k] def test_reindex_method(self): - ds = Dataset({'x': ('y', [10, 20])}) + ds = Dataset({'x': ('y', [10, 20]), 'y': [0, 1]}) y = [-0.5, 0.5, 1.5] actual = ds.reindex(y=y, method='backfill') expected = Dataset({'x': ('y', [10, 20, np.nan]), 'y': y}) @@ -1254,11 +1263,6 @@ def test_drop_variables(self): actual = data.drop(['time']) self.assertDatasetIdentical(expected, actual) - expected = Dataset(dict((k, data[k]) for - k in ['dim2', 'dim3', 'time', 'numbers'])) - actual = data.drop('dim1') - self.assertDatasetIdentical(expected, actual) - with self.assertRaisesRegexp(ValueError, 'cannot be found'): data.drop('not_found_here') @@ -1266,10 +1270,6 @@ def test_drop_index_labels(self): data = Dataset({'A': (['x', 'y'], np.random.randn(2, 3)), 'x': ['a', 'b']}) - actual = data.drop(1, 'y') - expected = data.isel(y=[0, 2]) - self.assertDatasetIdentical(expected, actual) - actual = data.drop(['a'], 'x') expected = data.isel(x=[1]) self.assertDatasetIdentical(expected, actual) @@ -1282,6 +1282,10 @@ def test_drop_index_labels(self): # not contained in axis data.drop(['c'], dim='x') + with self.assertRaisesRegexp( + ValueError, 'does not have coordinate labels'): + data.drop(1, 'y') + def test_copy(self): data = create_test_data() @@ -1401,6 +1405,7 @@ def test_unstack(self): names=['x', 'y']) ds = Dataset({'b': ('z', [0, 1, 2, 3]), 'z': index}) expected = Dataset({'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'x': [0, 1], 'y': ['a', 'b']}) actual = ds.unstack('z') self.assertDatasetIdentical(actual, expected) @@ -1415,6 +1420,7 @@ def test_unstack_errors(self): def test_stack_unstack(self): ds = Dataset({'a': ('x', [0, 1]), 'b': (('x', 'y'), [[0, 1], [2, 3]]), + 'x': [0, 1], 'y': ['a', 'b']}) actual = ds.stack(z=['x', 'y']).unstack('z') assert actual.broadcast_equals(ds) @@ -1446,9 +1452,10 @@ def test_update(self): self.assertDatasetIdentical(expected, actual) def test_update_auto_align(self): - ds = Dataset({'x': ('t', [3, 4])}) + ds = Dataset({'x': ('t', [3, 4])}, {'t': [0, 1]}) - expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan, 5])}) + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan, 5])}, + {'t': [0, 1]}) actual = ds.copy() other = {'y': ('t', [5]), 't': [1]} with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): @@ -1459,7 +1466,8 @@ def test_update_auto_align(self): actual = ds.copy() other = Dataset({'y': ('t', [5]), 't': [100]}) actual.update(other) - expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan] * 2)}) + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan] * 2)}, + {'t': [0, 1]}) self.assertDatasetIdentical(expected, actual) def test_getitem(self): @@ -1593,7 +1601,7 @@ def test_setitem(self): with self.assertRaisesRegexp(ValueError, 'already exists as a scalar'): data1['newvar'] = ('scalar', [3, 4, 5]) # can't resize a used dimension - with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): + with self.assertRaisesRegexp(ValueError, 'arguments without labels'): data1['dim1'] = data1['dim1'][:5] # override an existing value data1['A'] = 3 * data2['A'] @@ -1605,6 +1613,7 @@ def test_setitem(self): def test_setitem_pandas(self): ds = self.make_example_math_dataset() + ds['x'] = np.arange(3) ds_copy = ds.copy() ds_copy['bar'] = ds['bar'].to_pandas() @@ -1618,26 +1627,27 @@ def test_setitem_auto_align(self): 
self.assertDatasetIdentical(ds, expected) ds['y'] = DataArray(range(3), dims='y') - expected = Dataset({'x': ('y', range(3))}) + expected = Dataset({'x': ('y', range(3))}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) - ds['x'] = DataArray([1, 2], dims='y') - expected = Dataset({'x': ('y', [1, 2, np.nan])}) + ds['x'] = DataArray([1, 2], coords=[('y', [0, 1])]) + expected = Dataset({'x': ('y', [1, 2, np.nan])}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) ds['x'] = 42 expected = Dataset({'x': 42, 'y': range(3)}) self.assertDatasetIdentical(ds, expected) - ds['x'] = DataArray([4, 5, 6, 7], dims='y') - expected = Dataset({'x': ('y', [4, 5, 6])}) + ds['x'] = DataArray([4, 5, 6, 7], coords=[('y', [0, 1, 2, 3])]) + expected = Dataset({'x': ('y', [4, 5, 6])}, {'y': range(3)}) self.assertDatasetIdentical(ds, expected) def test_setitem_align_new_indexes(self): ds = Dataset({'foo': ('x', [1, 2, 3])}, {'x': [0, 1, 2]}) ds['bar'] = DataArray([2, 3, 4], [('x', [1, 2, 3])]) expected = Dataset({'foo': ('x', [1, 2, 3]), - 'bar': ('x', [np.nan, 2, 3])}) + 'bar': ('x', [np.nan, 2, 3])}, + {'x': [0, 1, 2]}) self.assertDatasetIdentical(ds, expected) def test_assign(self): @@ -1649,11 +1659,11 @@ def test_assign(self): self.assertDatasetIdentical(ds, Dataset()) actual = actual.assign(y = lambda ds: ds.x ** 2) - expected = Dataset({'y': ('x', [0, 1, 4])}) + expected = Dataset({'y': ('x', [0, 1, 4]), 'x': [0, 1, 2]}) self.assertDatasetIdentical(actual, expected) actual = actual.assign_coords(z = 2) - expected = Dataset({'y': ('x', [0, 1, 4])}, {'z': 2}) + expected = Dataset({'y': ('x', [0, 1, 4])}, {'z': 2, 'x': [0, 1, 2]}) self.assertDatasetIdentical(actual, expected) ds = Dataset({'a': ('x', range(3))}, {'b': ('x', ['A'] * 2 + ['B'])}) @@ -1678,8 +1688,8 @@ def test_assign_multiindex_level(self): def test_setitem_original_non_unique_index(self): # regression test for GH943 original = Dataset({'data': ('x', np.arange(5))}, - coords={'x': [0, 1, 2, 0, 1]}) - expected = Dataset({'data': ('x', np.arange(5))}) + coords={'x': [0, 1, 2, 0, 1]}) + expected = Dataset({'data': ('x', np.arange(5))}, {'x': range(5)}) actual = original.copy() actual['x'] = list(range(5)) @@ -1715,10 +1725,9 @@ def test_delitem(self): self.assertItemsEqual(data, all_items) del data['var1'] self.assertItemsEqual(data, all_items - set(['var1'])) - del data['dim1'] - self.assertItemsEqual(data, set(['time', 'dim2', 'dim3', 'numbers'])) - self.assertNotIn('dim1', data.dims) - self.assertNotIn('dim1', data.coords) + del data['numbers'] + self.assertItemsEqual(data, all_items - set(['var1', 'numbers'])) + self.assertNotIn('numbers', data.coords) def test_squeeze(self): data = Dataset({'foo': (['x', 'y', 'z'], [[[1], [2]]])}) @@ -1736,7 +1745,8 @@ def get_args(v): def test_groupby(self): data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}, {'x': ('x', list('abc')), - 'c': ('x', [0, 1, 0])}) + 'c': ('x', [0, 1, 0]), + 'y': range(5)}) groupby = data.groupby('x') self.assertEqual(len(groupby), 3) expected_groups = {'a': 0, 'b': 1, 'c': 2} @@ -1757,11 +1767,11 @@ def test_groupby_returns_new_type(self): data = Dataset({'z': (['x', 'y'], np.random.randn(3, 5))}) actual = data.groupby('x').apply(lambda ds: ds['z']) - expected = data['z'] + expected = data['z'].assign_coords(x=data.x) self.assertDataArrayIdentical(expected, actual) actual = data['z'].groupby('x').apply(lambda x: x.to_dataset()) - expected = data + expected = data.assign_coords(x=data.x) self.assertDatasetIdentical(expected, actual) def 
test_groupby_iter(self): @@ -1788,6 +1798,7 @@ def test_groupby_reduce(self): 'letters': ('y', ['a', 'a', 'b', 'b'])}) expected = data.mean('y') + expected['x'] = [0, 1, 2] expected['yonly'] = expected['yonly'].variable.expand_dims({'x': 3}) actual = data.groupby('x').mean() self.assertDatasetAllClose(expected, actual) @@ -1807,6 +1818,7 @@ def test_groupby_math(self): reorder_dims = lambda x: x.transpose('dim1', 'dim2', 'dim3', 'time') ds = create_test_data() + ds['dim1'] = ds['dim1'] for squeeze in [True, False]: grouped = ds.groupby('dim1', squeeze=squeeze) @@ -1929,7 +1941,7 @@ def test_to_array(self): ds = Dataset(OrderedDict([('a', 1), ('b', ('x', [1, 2, 3]))]), coords={'c': 42}, attrs={'Conventions': 'None'}) data = [[1, 1, 1], [1, 2, 3]] - coords = {'x': range(3), 'c': 42, 'variable': ['a', 'b']} + coords = {'c': 42, 'variable': ['a', 'b']} dims = ('variable', 'x') expected = DataArray(data, coords, dims, attrs=ds.attrs) actual = ds.to_array() @@ -1971,23 +1983,24 @@ def test_to_and_from_dataframe(self): self.assertTrue(expected.equals(actual)) # check roundtrip - self.assertDatasetIdentical(ds, Dataset.from_dataframe(actual)) + self.assertDatasetIdentical(ds.assign_coords(x=[0, 1]), + Dataset.from_dataframe(actual)) # check pathological cases df = pd.DataFrame([1]) actual = Dataset.from_dataframe(df) - expected = Dataset({0: ('index', [1])}) + expected = Dataset({0: ('index', [1])}, {'index': [0]}) self.assertDatasetIdentical(expected, actual) df = pd.DataFrame() actual = Dataset.from_dataframe(df) - expected = Dataset(coords={'index':[]}) + expected = Dataset(coords={'index': []}) self.assertDatasetIdentical(expected, actual) # GH697 df = pd.DataFrame({'A' : []}) actual = Dataset.from_dataframe(df) - expected = Dataset({'A': DataArray([], dims=('index',))}) + expected = Dataset({'A': DataArray([], dims=('index',))}, {'index': []}) self.assertDatasetIdentical(expected, actual) # regression test for GH278 @@ -2213,11 +2226,12 @@ def test_dropna(self): ds.dropna('a', how=None) def test_fillna(self): - ds = Dataset({'a': ('x', [np.nan, 1, np.nan, 3])}) + ds = Dataset({'a': ('x', [np.nan, 1, np.nan, 3])}, + {'x': [0, 1, 2, 3]}) # fill with -1 actual = ds.fillna(-1) - expected = Dataset({'a': ('x', [-1, 1, -1, 3])}) + expected = Dataset({'a': ('x', [-1, 1, -1, 3])}, {'x': [0, 1, 2, 3]}) self.assertDatasetIdentical(expected, actual) actual = ds.fillna({'a': -1}) @@ -2231,7 +2245,7 @@ def test_fillna(self): self.assertDatasetIdentical(expected, actual) # fill with range(4) - b = DataArray(range(4), dims='x') + b = DataArray(range(4), coords=[('x', range(4))]) actual = ds.fillna(b) expected = b.rename('a').to_dataset() self.assertDatasetIdentical(expected, actual) @@ -2248,7 +2262,8 @@ def test_fillna(self): # okay to only include some data variables ds['b'] = np.nan actual = ds.fillna({'a': -1}) - expected = Dataset({'a': ('x', [-1, 1, -1, 3]), 'b': np.nan}) + expected = Dataset({'a': ('x', [-1, 1, -1, 3]), 'b': np.nan}, + {'x': [0, 1, 2, 3]}) self.assertDatasetIdentical(expected, actual) # but new data variables is not okay @@ -2264,7 +2279,7 @@ def test_fillna(self): self.assertDatasetIdentical(expected, result) # groupby - expected = Dataset({'a': ('x', range(4))}) + expected = Dataset({'a': ('x', range(4))}, {'x': [0, 1, 2, 3]}) for target in [ds, expected]: target.coords['b'] = ('x', [0, 0, 1, 1]) actual = ds.groupby('b').fillna(DataArray([0, 2], dims='b')) @@ -2281,12 +2296,12 @@ def test_fillna(self): self.assertEqual(actual.a.name, 'a') self.assertEqual(actual.a.attrs, 
ds.a.attrs) - da = DataArray(range(5), name='a', attrs={'attr':'da'}) + da = DataArray(range(5), name='a', attrs={'attr': 'da'}) actual = da.fillna(1) self.assertEqual(actual.name, 'a') self.assertEqual(actual.attrs, da.attrs) - ds = Dataset({'a': da}, attrs={'attr':'ds'}) + ds = Dataset({'a': da}, attrs={'attr': 'ds'}) actual = ds.fillna({'a': 1}) self.assertEqual(actual.attrs, ds.attrs) self.assertEqual(actual.a.name, 'a') @@ -2662,14 +2677,13 @@ def test_dataset_dataset_math(self): def test_dataset_math_auto_align(self): ds = self.make_example_math_dataset() - subset = ds.isel(x=slice(2), y=[1, 3]) + subset = ds.isel(y=[1, 3]) expected = 2 * subset actual = ds + subset self.assertDatasetIdentical(expected, actual) - - actual = ds.isel(x=slice(1)) + ds.isel(x=slice(1, None)) - expected = ds.drop(ds.x, dim='x') + actual = ds.isel(y=slice(1)) + ds.isel(y=slice(1, None)) + expected = 2 * ds.drop(ds.y, dim='y') self.assertDatasetEqual(actual, expected) actual = ds + ds[['bar']] @@ -2685,12 +2699,11 @@ def test_dataset_math_auto_align(self): # maybe unary arithmetic with empty datasets should raise instead? self.assertDatasetIdentical(Dataset() + 1, Dataset()) - for other in [ds.isel(x=slice(2)), ds.bar.isel(x=slice(0))]: - actual = ds.copy(deep=True) - other = ds.isel(x=slice(2)) - actual += other - expected = ds + other.reindex_like(ds) - self.assertDatasetIdentical(expected, actual) + actual = ds.copy(deep=True) + other = ds.isel(y=slice(2)) + actual += other + expected = ds + other.reindex_like(ds) + self.assertDatasetIdentical(expected, actual) def test_dataset_math_errors(self): ds = self.make_example_math_dataset() @@ -2770,12 +2783,10 @@ def test_dataset_diff_n1(self): actual = ds.diff('dim2') expected = dict() expected['var1'] = DataArray(np.diff(ds['var1'].values, axis=1), - [ds['dim1'].values, - ds['dim2'].values[1:]], + {'dim2': ds['dim2'].values[1:]}, ['dim1', 'dim2']) expected['var2'] = DataArray(np.diff(ds['var2'].values, axis=1), - [ds['dim1'].values, - ds['dim2'].values[1:]], + {'dim2': ds['dim2'].values[1:]}, ['dim1', 'dim2']) expected['var3'] = ds['var3'] expected = Dataset(expected, coords={'time': ds['time'].values}) @@ -2787,12 +2798,10 @@ def test_dataset_diff_n2(self): actual = ds.diff('dim2', n=2) expected = dict() expected['var1'] = DataArray(np.diff(ds['var1'].values, axis=1, n=2), - [ds['dim1'].values, - ds['dim2'].values[2:]], + {'dim2': ds['dim2'].values[2:]}, ['dim1', 'dim2']) expected['var2'] = DataArray(np.diff(ds['var2'].values, axis=1, n=2), - [ds['dim1'].values, - ds['dim2'].values[2:]], + {'dim2': ds['dim2'].values[2:]}, ['dim1', 'dim2']) expected['var3'] = ds['var3'] expected = Dataset(expected, coords={'time': ds['time'].values}) @@ -2900,7 +2909,7 @@ def data_set(seed=None): def test_dir_expected_attrs(data_set): some_expected_attrs = {'pipe', 'mean', 'isnull', 'var1', - 'dim1', 'numbers'} + 'dim2', 'numbers'} result = dir(data_set) assert set(result) >= some_expected_attrs
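
The following sketch is not part of the patch; it only illustrates the user-facing behavior described in the whats-new entry at the top of this diff, assuming an xarray build with this change applied. The variable names (``ds``, ``da``, ``roundtripped``) are illustrative::

    # Illustrative only: behavior once index coordinates are optional.
    import xarray as xr

    ds = xr.Dataset({'a': ('x', [1, 2, 3])})
    assert 'x' not in ds.coords      # no default index coordinate is created
    assert 'x' not in ds.indexes

    # pandas always requires an index, so the DataFrame roundtrip gains a
    # default integer 'x' coordinate instead of returning the original dataset.
    roundtripped = xr.Dataset.from_dataframe(ds.to_dataframe())
    assert 'x' in roundtripped.coords

    # reindexing a dimension that has no labels onto a different size now raises
    try:
        ds.reindex(x=[0, 1])
    except ValueError:
        pass  # e.g. "cannot reindex or align along dimension ..."

    # deleting a coordinate that matches a dimension name is now allowed
    da = xr.DataArray([[1, 2], [3, 4]], coords={'x': [10, 20]}, dims=['x', 'y'])
    del da.coords['x']
    assert 'x' not in da.coords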
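
Several of the changes above (``Indexes.get`` in coordinates.py and the ``new_sizes.get(dim, size)`` lookup in ``reindex_variables``) rely on the same fallback rule: if a dimension has an index coordinate, use its index; otherwise build a default integer index from the dimension size. The standalone sketch below restates that rule in simplified form; it stores plain ``pandas.Index`` objects rather than xarray variables, and the helper name ``default_index`` is hypothetical, not part of the patch::

    # Standalone sketch, not the patched code: the default-index fallback rule.
    from collections import OrderedDict

    import pandas as pd

    def default_index(dims, indexes, key):
        """Return the index for ``key``, or a default integer index."""
        if key not in dims:
            raise KeyError(key)
        if key in indexes:
            return indexes[key]
        return pd.Index(range(dims[key]), name=key)

    dims = OrderedDict([('x', 3), ('y', 2)])
    indexes = {'y': pd.Index(['a', 'b'], name='y')}

    print(default_index(dims, indexes, 'x'))  # default: Index([0, 1, 2], name='x')
    print(default_index(dims, indexes, 'y'))  # existing: Index(['a', 'b'], name='y')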