From 4a6237abad55bf5c9b54a5b93311338f0d613c40 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 20:03:06 +0900 Subject: [PATCH 01/10] Added fill_value for unstack --- doc/whats-new.rst | 3 +++ xarray/core/dataarray.py | 7 ++++++- xarray/core/dataset.py | 15 +++++++++++---- xarray/core/variable.py | 2 +- xarray/tests/test_dataset.py | 17 +++++++++++++++++ 5 files changed, 38 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c835fbeff45..6bf495713fe 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -39,6 +39,9 @@ Breaking changes New Features ~~~~~~~~~~~~ +- Added the ``fill_value`` option to :py:meth:`~xarray.DataArray.unstack` and + :py:meth:`~xarray.Dataset.unstack` (:issue:`3518`). + By `Keisuke Fujii `_. - Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data gap that will be filled by interpolation. By `Deepak Cherian `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index b27a61d530b..9ea175bc991 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1726,7 +1726,10 @@ def stack( return self._from_temp_dataset(ds) def unstack( - self, dim: Union[Hashable, Sequence[Hashable], None] = None + self, + dim: Union[Hashable, Sequence[Hashable], None] = None, + fill_value: Any = dtypes.NA, + sparse: bool = False, ) -> "DataArray": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -1739,6 +1742,8 @@ def unstack( dim : hashable or sequence of hashable, optional Dimension(s) over which to unstack. By default unstacks all MultiIndexes. + fill_value: value to be filled. By default, np.nan + sparse: use sparse if True. Returns ------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 3a83b477681..87a0eda0803 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3333,7 +3333,7 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": index = self.get_index(dim) index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3342,7 +3342,7 @@ def _unstack_once(self, dim: Hashable) -> "Dataset": if index.equals(full_idx): obj = self else: - obj = self.reindex({dim: full_idx}, copy=False) + obj = self.reindex({dim: full_idx}, copy=False, fill_value=fill_value) new_dim_names = index.names new_dim_sizes = [lev.size for lev in index.levels] @@ -3368,7 +3368,12 @@ def _unstack_once(self, dim: Hashable) -> "Dataset": variables, coord_names=coord_names, indexes=indexes ) - def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": + def unstack( + self, + dim: Union[Hashable, Iterable[Hashable]] = None, + fill_value: Any = dtypes.NA, + sparse: bool = False, + ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into multiple new dimensions. @@ -3380,6 +3385,8 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": dim : Hashable or iterable of Hashable, optional Dimension(s) over which to unstack. By default unstacks all MultiIndexes. + fill_value: value to be filled. By default, np.nan + sparse: use sparse if True. 
Returns ------- @@ -3417,7 +3424,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset": result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim) + result = result._unstack_once(dim, fill_value, sparse) return result def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e630dc4b457..6582aba5d6e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1341,7 +1341,7 @@ def _stack_once(self, dims, new_dim): def stack(self, dimensions=None, **dimensions_kwargs): """ - Stack any number of existing dimensions into a single new dimension. + Stack any nimber of existing dimensions into a single new dimension. New dimensions will be added at the end, and the order of the data along each new dimension will be in contiguous (C) order. diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 780843f2e61..051b44c64aa 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -2794,6 +2794,23 @@ def test_unstack_errors(self): with raises_regex(ValueError, "do not have a MultiIndex"): ds.unstack("x") + def test_unstack_fill_value(self): + ds = xr.Dataset( + {"var": (("x",), np.arange(6))}, + coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, + ) + # make ds incomplete + ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) + # test fill_value + actual = ds.unstack("index", fill_value=-1) + expected = ds.unstack("index").fillna(-1).astype(np.int) + assert actual["var"].dtype == np.int + assert actual.equals(expected) + + actual = ds["var"].unstack("index", fill_value=-1) + expected = ds["var"].unstack("index").fillna(-1).astype(np.int) + assert actual.equals(expected) + def test_stack_unstack_fast(self): ds = Dataset( { From e7b470d87772f7f8f7c3aae68e3e4421f0e5cccf Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 20:14:27 +0900 Subject: [PATCH 02/10] remove sparse option and fix unintended changes --- xarray/core/dataarray.py | 2 -- xarray/core/dataset.py | 6 ++---- xarray/core/variable.py | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 9ea175bc991..f613a2f1d82 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1729,7 +1729,6 @@ def unstack( self, dim: Union[Hashable, Sequence[Hashable], None] = None, fill_value: Any = dtypes.NA, - sparse: bool = False, ) -> "DataArray": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -1743,7 +1742,6 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan - sparse: use sparse if True. 
Returns ------- diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 87a0eda0803..371e0d6bf26 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3333,7 +3333,7 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value, sparse: bool) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": index = self.get_index(dim) index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3372,7 +3372,6 @@ def unstack( self, dim: Union[Hashable, Iterable[Hashable]] = None, fill_value: Any = dtypes.NA, - sparse: bool = False, ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -3386,7 +3385,6 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan - sparse: use sparse if True. Returns ------- @@ -3424,7 +3422,7 @@ def unstack( result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim, fill_value, sparse) + result = result._unstack_once(dim, fill_value) return result def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 6582aba5d6e..e630dc4b457 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1341,7 +1341,7 @@ def _stack_once(self, dims, new_dim): def stack(self, dimensions=None, **dimensions_kwargs): """ - Stack any nimber of existing dimensions into a single new dimension. + Stack any number of existing dimensions into a single new dimension. New dimensions will be added at the end, and the order of the data along each new dimension will be in contiguous (C) order. From 1df4a3ce56263f5b1ce238f06da070898000443a Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 20:23:16 +0900 Subject: [PATCH 03/10] a bug fix --- xarray/core/dataarray.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index f613a2f1d82..23342fc5e0d 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1773,7 +1773,7 @@ def unstack( -------- DataArray.stack """ - ds = self._to_temp_dataset().unstack(dim) + ds = self._to_temp_dataset().unstack(dim, fill_value) return self._from_temp_dataset(ds) def to_unstacked_dataset(self, dim, level=0): From 6a66831897e35f8867a41f7da0c5d33862204a14 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 23:16:17 +0900 Subject: [PATCH 04/10] Added sparse option to unstack and reindex --- doc/whats-new.rst | 5 ++++- xarray/core/alignment.py | 5 +++++ xarray/core/dataarray.py | 4 +++- xarray/core/dataset.py | 13 +++++++++--- xarray/core/duck_array_ops.py | 7 ++++++- xarray/core/variable.py | 33 +++++++++++++++++++++++++++++++ xarray/tests/test_dataset.py | 37 +++++++++++++++++++++++++++++++++++ xarray/tests/test_variable.py | 12 ++++++++++++ 8 files changed, 110 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6bf495713fe..8069bb7e6fa 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,7 +38,10 @@ Breaking changes New Features ~~~~~~~~~~~~ - +- Added the ``sparse`` option to :py:meth:`~xarray.DataArray.unstack`, + :py:meth:`~xarray.Dataset.unstack`, :py:meth:`~xarray.DataArray.reindex`, + :py:meth:`~xarray.Dataset.reindex` (:issue:`3518`). + By `Keisuke Fujii `_. 
- Added the ``fill_value`` option to :py:meth:`~xarray.DataArray.unstack` and :py:meth:`~xarray.Dataset.unstack` (:issue:`3518`). By `Keisuke Fujii `_. diff --git a/xarray/core/alignment.py b/xarray/core/alignment.py index 41ff5a3b32d..749de6c13e2 100644 --- a/xarray/core/alignment.py +++ b/xarray/core/alignment.py @@ -466,6 +466,7 @@ def reindex_variables( tolerance: Any = None, copy: bool = True, fill_value: Optional[Any] = dtypes.NA, + sparse: bool = False, ) -> Tuple[Dict[Hashable, Variable], Dict[Hashable, pd.Index]]: """Conform a dictionary of aligned variables onto a new set of variables, filling in missing values with NaN. @@ -503,6 +504,8 @@ def reindex_variables( the input. In either case, new xarray objects are always returned. fill_value : scalar, optional Value to use for newly missing values + sparse: bool, optional + Use an sparse-array Returns ------- @@ -571,6 +574,8 @@ def reindex_variables( for name, var in variables.items(): if name not in indexers: + if sparse: + var = var._as_sparse(fill_value=fill_value) key = tuple( slice(None) if d in unchanged_dims else int_indexers.get(d, slice(None)) for d in var.dims diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 23342fc5e0d..1ed4b5566d7 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -1729,6 +1729,7 @@ def unstack( self, dim: Union[Hashable, Sequence[Hashable], None] = None, fill_value: Any = dtypes.NA, + sparse: bool = False, ) -> "DataArray": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -1742,6 +1743,7 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan + sparse: use sparse-array if True Returns ------- @@ -1773,7 +1775,7 @@ def unstack( -------- DataArray.stack """ - ds = self._to_temp_dataset().unstack(dim, fill_value) + ds = self._to_temp_dataset().unstack(dim, fill_value, sparse) return self._from_temp_dataset(ds) def to_unstacked_dataset(self, dim, level=0): diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 371e0d6bf26..e6c732c5a26 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2254,6 +2254,7 @@ def reindex( tolerance: Number = None, copy: bool = True, fill_value: Any = dtypes.NA, + sparse: bool = False, **indexers_kwargs: Any, ) -> "Dataset": """Conform this object onto a new set of indexes, filling in @@ -2286,6 +2287,7 @@ def reindex( the input. In either case, a new xarray object is always returned. fill_value : scalar, optional Value to use for newly missing values + sparse: use sparse-array. By default, False **indexers_kwarg : {dim: indexer, ...}, optional Keyword arguments in the same form as ``indexers``. One of indexers or indexers_kwargs must be provided. 
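For readers who have not used pydata/sparse before, here is a minimal sketch (plain ``numpy`` plus the ``sparse`` package, independent of this patch) of the two properties the sparse reindex path relies on: ``sparse.as_coo`` accepts an explicit ``fill_value``, and entries equal to that fill value are simply not stored, which is what keeps a mostly-missing reindex result small::

    import numpy as np
    import sparse

    # A reindex onto a longer index conceptually produces the original values
    # plus fill_value in every newly introduced position.
    filled = np.array([10, 20, 30, -1, -1, -1])

    coo = sparse.as_coo(filled, fill_value=-1)
    print(coo.nnz)        # 3, only the three real values are stored
    print(coo.density)    # 0.5
    print(coo.todense())  # array([10, 20, 30, -1, -1, -1])

The ``fill_value`` forwarded to ``Variable._as_sparse`` in ``reindex_variables`` plays exactly this role.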
@@ -2444,6 +2446,7 @@ def reindex( tolerance, copy=copy, fill_value=fill_value, + sparse=sparse ) coord_names = set(self._coord_names) coord_names.update(indexers) @@ -3333,7 +3336,7 @@ def ensure_stackable(val): return data_array - def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": + def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset": index = self.get_index(dim) index = index.remove_unused_levels() full_idx = pd.MultiIndex.from_product(index.levels, names=index.names) @@ -3342,7 +3345,9 @@ def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset": if index.equals(full_idx): obj = self else: - obj = self.reindex({dim: full_idx}, copy=False, fill_value=fill_value) + obj = self.reindex( + {dim: full_idx}, copy=False, fill_value=fill_value, sparse=sparse + ) new_dim_names = index.names new_dim_sizes = [lev.size for lev in index.levels] @@ -3372,6 +3377,7 @@ def unstack( self, dim: Union[Hashable, Iterable[Hashable]] = None, fill_value: Any = dtypes.NA, + sparse: bool = False ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into @@ -3385,6 +3391,7 @@ def unstack( Dimension(s) over which to unstack. By default unstacks all MultiIndexes. fill_value: value to be filled. By default, np.nan + sparse: use sparse-array if True Returns ------- @@ -3422,7 +3429,7 @@ def unstack( result = self.copy(deep=False) for dim in dims: - result = result._unstack_once(dim, fill_value) + result = result._unstack_once(dim, fill_value, sparse) return result def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset": diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 71e79335c3d..af7d1a4cdce 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,12 +13,14 @@ from . import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast -from .pycompat import dask_array_type +from .pycompat import dask_array_type, sparse_array_type try: import dask.array as dask_array + import sparse except ImportError: dask_array = None # type: ignore + sparse = None # type: ignore def _dask_or_eager_func( @@ -251,6 +253,9 @@ def count(data, axis=None): def where(condition, x, y): """Three argument where() with better dtype promotion rules.""" + # sparse support + if isinstance(x, sparse_array_type) or isinstance(y, sparse_array_type): + return sparse.where(condition, x, y) return _where(condition, *as_shared_dtype([x, y])) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index e630dc4b457..7e4f9750824 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -993,6 +993,31 @@ def chunk(self, chunks=None, name=None, lock=False): return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) + def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): + """ + use sparse-array as backend. + """ + import sparse + # TODO what to do if dask-backended? 
+ if fill_value is dtypes.NA: + dtype, fill_value = dtypes.maybe_promote(self.dtype) + else: + dtype = self.dtype + + if sparse_format is _default: + sparse_format = 'coo' + as_sparse = getattr(sparse, 'as_{}'.format(sparse_format.lower())) + data = as_sparse(self.data.astype(dtype), fill_value=fill_value) + return self._replace(data=data) + + def _to_dense(self): + """ + Change backend from sparse to np.array + """ + if hasattr(self._data, 'todense'): + return self._replace(data=self._data.todense()) + return self.copy(deep=False) + def isel( self: VariableType, indexers: Mapping[Hashable, Any] = None, @@ -2021,6 +2046,14 @@ def chunk(self, chunks=None, name=None, lock=False): # Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() return self.copy(deep=False) + def _as_sparse(self, sparse_format=_default, fill_value=_default): + # Dummy + return self.copy(deep=False) + + def _to_dense(self): + # Dummy + return self.copy(deep=False) + def _finalize_indexing_result(self, dims, data): if getattr(data, "ndim", 0) != 1: # returns Variable rather than IndexVariable if multi-dimensional diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 051b44c64aa..b25c2e86704 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1748,6 +1748,25 @@ def test_reindex(self): actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) assert_identical(expected, actual) + @requires_sparse + def test_reindex_sparse(self): + data = create_test_data() + dim3 = list("abdeghijk") + actual = data.reindex(dim3=dim3, sparse=True) + expected = data.reindex(dim3=dim3, sparse=False) + for k, v in data.data_vars.items(): + np.testing.assert_equal( + actual[k].data.todense(), expected[k].data + ) + + data['var3'] = data['var3'].astype(int) + actual = data.reindex(dim3=dim3, sparse=True, fill_value=-10) + expected = data.reindex(dim3=dim3, sparse=False, fill_value=-10) + for k, v in data.data_vars.items(): + np.testing.assert_equal( + actual[k].data.todense(), expected[k].data + ) + def test_reindex_warning(self): data = create_test_data() @@ -2811,6 +2830,24 @@ def test_unstack_fill_value(self): expected = ds["var"].unstack("index").fillna(-1).astype(np.int) assert actual.equals(expected) + @requires_sparse + def test_unstack_fill_value(self): + ds = xr.Dataset( + {"var": (("x",), np.arange(6))}, + coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, + ) + # make ds incomplete + ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) + # test fill_value + actual = ds.unstack("index", sparse=True) + expected = ds.unstack("index") + assert actual['var'].variable._to_dense().equals( + expected['var'].variable) + + actual = ds["var"].unstack("index", sparse=True) + expected = ds["var"].unstack("index") + assert actual.variable._to_dense().equals(expected.variable) + def test_stack_unstack_fast(self): ds = Dataset( { diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d92a68729b5..8c847d05d97 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -33,6 +33,7 @@ assert_identical, raises_regex, requires_dask, + requires_sparse, source_ndarray, ) @@ -1862,6 +1863,17 @@ def test_getitem_with_mask_nd_indexer(self): ) +@requires_sparse +class TestVariableWithSparse(): + # TODO inherit VariableSubclassobjects to cover more tests + + def test_as_sparse(self): + data = np.arange(12).reshape(3, 4) + var = Variable(('x', 'y'), data)._as_sparse(fill_value=-1) + actual = var._to_dense() + assert_identical(var, actual) + + 
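A quick sketch of what the new private helpers do, assuming this branch is installed and pydata/sparse is available (``_as_sparse`` and ``_to_dense`` are internal and may change)::

    import numpy as np
    import xarray as xr

    var = xr.Variable(("x", "y"), np.arange(12).reshape(3, 4))

    # An explicit fill_value keeps the integer dtype; the data becomes a sparse COO array.
    svar = var._as_sparse(fill_value=-1)
    print(type(svar.data))       # sparse COO array
    print(svar.data.fill_value)  # -1

    # The default fill_value (NA) promotes int64 to float64 so NaN can represent the fill.
    print(var._as_sparse().dtype)  # float64

    # _to_dense() converts back to a numpy-backed Variable.
    assert var.equals(svar._to_dense())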
class TestIndexVariable(VariableSubclassobjects): cls = staticmethod(IndexVariable) From 3a369a1471876603721b8d826894164962e3d354 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 23:28:29 +0900 Subject: [PATCH 05/10] black --- xarray/core/dataset.py | 4 ++-- xarray/core/variable.py | 7 ++++--- xarray/tests/test_dataset.py | 13 ++++--------- xarray/tests/test_variable.py | 4 ++-- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e6c732c5a26..b324ec09614 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2446,7 +2446,7 @@ def reindex( tolerance, copy=copy, fill_value=fill_value, - sparse=sparse + sparse=sparse, ) coord_names = set(self._coord_names) coord_names.update(indexers) @@ -3377,7 +3377,7 @@ def unstack( self, dim: Union[Hashable, Iterable[Hashable]] = None, fill_value: Any = dtypes.NA, - sparse: bool = False + sparse: bool = False, ) -> "Dataset": """ Unstack existing dimensions corresponding to MultiIndexes into diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 7e4f9750824..f3f18204953 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -998,6 +998,7 @@ def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): use sparse-array as backend. """ import sparse + # TODO what to do if dask-backended? if fill_value is dtypes.NA: dtype, fill_value = dtypes.maybe_promote(self.dtype) @@ -1005,8 +1006,8 @@ def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): dtype = self.dtype if sparse_format is _default: - sparse_format = 'coo' - as_sparse = getattr(sparse, 'as_{}'.format(sparse_format.lower())) + sparse_format = "coo" + as_sparse = getattr(sparse, "as_{}".format(sparse_format.lower())) data = as_sparse(self.data.astype(dtype), fill_value=fill_value) return self._replace(data=data) @@ -1014,7 +1015,7 @@ def _to_dense(self): """ Change backend from sparse to np.array """ - if hasattr(self._data, 'todense'): + if hasattr(self._data, "todense"): return self._replace(data=self._data.todense()) return self.copy(deep=False) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b25c2e86704..304b45bcbb6 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1755,17 +1755,13 @@ def test_reindex_sparse(self): actual = data.reindex(dim3=dim3, sparse=True) expected = data.reindex(dim3=dim3, sparse=False) for k, v in data.data_vars.items(): - np.testing.assert_equal( - actual[k].data.todense(), expected[k].data - ) + np.testing.assert_equal(actual[k].data.todense(), expected[k].data) - data['var3'] = data['var3'].astype(int) + data["var3"] = data["var3"].astype(int) actual = data.reindex(dim3=dim3, sparse=True, fill_value=-10) expected = data.reindex(dim3=dim3, sparse=False, fill_value=-10) for k, v in data.data_vars.items(): - np.testing.assert_equal( - actual[k].data.todense(), expected[k].data - ) + np.testing.assert_equal(actual[k].data.todense(), expected[k].data) def test_reindex_warning(self): data = create_test_data() @@ -2841,8 +2837,7 @@ def test_unstack_fill_value(self): # test fill_value actual = ds.unstack("index", sparse=True) expected = ds.unstack("index") - assert actual['var'].variable._to_dense().equals( - expected['var'].variable) + assert actual["var"].variable._to_dense().equals(expected["var"].variable) actual = ds["var"].unstack("index", sparse=True) expected = ds["var"].unstack("index") diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 
8c847d05d97..ee8d54e567e 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1864,12 +1864,12 @@ def test_getitem_with_mask_nd_indexer(self): @requires_sparse -class TestVariableWithSparse(): +class TestVariableWithSparse: # TODO inherit VariableSubclassobjects to cover more tests def test_as_sparse(self): data = np.arange(12).reshape(3, 4) - var = Variable(('x', 'y'), data)._as_sparse(fill_value=-1) + var = Variable(("x", "y"), data)._as_sparse(fill_value=-1) actual = var._to_dense() assert_identical(var, actual) From 179cc1f85233f68ee08415337eaae85cbdd1d99e Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 23:46:24 +0900 Subject: [PATCH 06/10] More tests --- xarray/tests/test_dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index f36a6535b80..69f8d401d7a 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1756,12 +1756,14 @@ def test_reindex_sparse(self): expected = data.reindex(dim3=dim3, sparse=False) for k, v in data.data_vars.items(): np.testing.assert_equal(actual[k].data.todense(), expected[k].data) + assert actual['var3'].data.density < 1.0 data["var3"] = data["var3"].astype(int) actual = data.reindex(dim3=dim3, sparse=True, fill_value=-10) expected = data.reindex(dim3=dim3, sparse=False, fill_value=-10) for k, v in data.data_vars.items(): np.testing.assert_equal(actual[k].data.todense(), expected[k].data) + assert actual['var3'].data.density < 1.0 def test_reindex_warning(self): data = create_test_data() @@ -2827,7 +2829,7 @@ def test_unstack_fill_value(self): assert actual.equals(expected) @requires_sparse - def test_unstack_fill_value(self): + def test_unstack_sparse(self): ds = xr.Dataset( {"var": (("x",), np.arange(6))}, coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, @@ -2838,10 +2840,12 @@ def test_unstack_fill_value(self): actual = ds.unstack("index", sparse=True) expected = ds.unstack("index") assert actual["var"].variable._to_dense().equals(expected["var"].variable) + assert actual["var"].data.density < 1.0 actual = ds["var"].unstack("index", sparse=True) expected = ds["var"].unstack("index") assert actual.variable._to_dense().equals(expected.variable) + assert actual.data.density < 1.0 def test_stack_unstack_fast(self): ds = Dataset( From 5d8ab27fc6c0debf3fa36d2dd05d4c12120d1380 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sat, 16 Nov 2019 23:53:25 +0900 Subject: [PATCH 07/10] black --- xarray/tests/test_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 69f8d401d7a..bc0d60371fd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1756,14 +1756,14 @@ def test_reindex_sparse(self): expected = data.reindex(dim3=dim3, sparse=False) for k, v in data.data_vars.items(): np.testing.assert_equal(actual[k].data.todense(), expected[k].data) - assert actual['var3'].data.density < 1.0 + assert actual["var3"].data.density < 1.0 data["var3"] = data["var3"].astype(int) actual = data.reindex(dim3=dim3, sparse=True, fill_value=-10) expected = data.reindex(dim3=dim3, sparse=False, fill_value=-10) for k, v in data.data_vars.items(): np.testing.assert_equal(actual[k].data.todense(), expected[k].data) - assert actual['var3'].data.density < 1.0 + assert actual["var3"].data.density < 1.0 def test_reindex_warning(self): data = create_test_data() From 
13ad6837254770e4e9f654a86f76b4f8fe11b939 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Sun, 17 Nov 2019 15:11:19 +0900 Subject: [PATCH 08/10] Remove sparse option from reindex --- xarray/core/dataset.py | 26 ++++++++++++++++++++++++-- xarray/core/duck_array_ops.py | 8 ++++++-- xarray/core/variable.py | 8 ++++++-- xarray/tests/test_dataset.py | 17 ----------------- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index b324ec09614..71288757cb7 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2254,7 +2254,6 @@ def reindex( tolerance: Number = None, copy: bool = True, fill_value: Any = dtypes.NA, - sparse: bool = False, **indexers_kwargs: Any, ) -> "Dataset": """Conform this object onto a new set of indexes, filling in @@ -2430,6 +2429,29 @@ def reindex( the original and desired indexes. If you do want to fill in the `NaN` values present in the original dataset, use the :py:meth:`~Dataset.fillna()` method. + """ + return self._reindex( + indexers, + method, + tolerance, + copy, + fill_value, + sparse=False, + **indexers_kwargs, + ) + + def _reindex( + self, + indexers: Mapping[Hashable, Any] = None, + method: str = None, + tolerance: Number = None, + copy: bool = True, + fill_value: Any = dtypes.NA, + sparse: bool = False, + **indexers_kwargs: Any, + ) -> "Dataset": + """ + same to _reindex but support sparse option """ indexers = utils.either_dict_or_kwargs(indexers, indexers_kwargs, "reindex") @@ -3345,7 +3367,7 @@ def _unstack_once(self, dim: Hashable, fill_value, sparse) -> "Dataset": if index.equals(full_idx): obj = self else: - obj = self.reindex( + obj = self._reindex( {dim: full_idx}, copy=False, fill_value=fill_value, sparse=sparse ) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index af7d1a4cdce..63c24ea65a2 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -17,9 +17,12 @@ try: import dask.array as dask_array - import sparse except ImportError: dask_array = None # type: ignore + +try: + import sparse +except ImportError: sparse = None # type: ignore @@ -253,7 +256,8 @@ def count(data, axis=None): def where(condition, x, y): """Three argument where() with better dtype promotion rules.""" - # sparse support + # TODO sparse is not working with np.result_type and x.astype(copy=False) + # The following two lines may be removed after they are supported. 
if isinstance(x, sparse_array_type) or isinstance(y, sparse_array_type): return sparse.where(condition, x, y) return _where(condition, *as_shared_dtype([x, y])) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f3f18204953..55e8f64d56c 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1003,11 +1003,15 @@ def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): if fill_value is dtypes.NA: dtype, fill_value = dtypes.maybe_promote(self.dtype) else: - dtype = self.dtype + dtype = dtypes.result_type(self.dtype, fill_value) if sparse_format is _default: sparse_format = "coo" - as_sparse = getattr(sparse, "as_{}".format(sparse_format.lower())) + try: + as_sparse = getattr(sparse, "as_{}".format(sparse_format.lower())) + except AttributeError: + raise ValueError("{} is not a valid sparse format".format(sparse_format)) + data = as_sparse(self.data.astype(dtype), fill_value=fill_value) return self._replace(data=data) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index bc0d60371fd..b09203f91a2 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -1748,23 +1748,6 @@ def test_reindex(self): actual = ds.reindex(x=[0, 1, 3], y=[0, 1]) assert_identical(expected, actual) - @requires_sparse - def test_reindex_sparse(self): - data = create_test_data() - dim3 = list("abdeghijk") - actual = data.reindex(dim3=dim3, sparse=True) - expected = data.reindex(dim3=dim3, sparse=False) - for k, v in data.data_vars.items(): - np.testing.assert_equal(actual[k].data.todense(), expected[k].data) - assert actual["var3"].data.density < 1.0 - - data["var3"] = data["var3"].astype(int) - actual = data.reindex(dim3=dim3, sparse=True, fill_value=-10) - expected = data.reindex(dim3=dim3, sparse=False, fill_value=-10) - for k, v in data.data_vars.items(): - np.testing.assert_equal(actual[k].data.todense(), expected[k].data) - assert actual["var3"].data.density < 1.0 - def test_reindex_warning(self): data = create_test_data() From ac41ef82e301542d30be11114cf18adcbc3c5433 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 18 Nov 2019 07:30:13 +0900 Subject: [PATCH 09/10] try __array_function__ where --- xarray/core/duck_array_ops.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 63c24ea65a2..3619f6d3a91 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -20,11 +20,6 @@ except ImportError: dask_array = None # type: ignore -try: - import sparse -except ImportError: - sparse = None # type: ignore - def _dask_or_eager_func( name, @@ -256,10 +251,6 @@ def count(data, axis=None): def where(condition, x, y): """Three argument where() with better dtype promotion rules.""" - # TODO sparse is not working with np.result_type and x.astype(copy=False) - # The following two lines may be removed after they are supported. - if isinstance(x, sparse_array_type) or isinstance(y, sparse_array_type): - return sparse.where(condition, x, y) return _where(condition, *as_shared_dtype([x, y])) From 92ce6cdbfc0ff59a1963933bdb46612908ab4de2 Mon Sep 17 00:00:00 2001 From: keisukefujii Date: Mon, 18 Nov 2019 18:01:30 +0900 Subject: [PATCH 10/10] flake8 --- xarray/core/duck_array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index 3619f6d3a91..71e79335c3d 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -13,7 +13,7 @@ from . 
import dask_array_ops, dtypes, npcompat, nputils from .nputils import nanfirst, nanlast -from .pycompat import dask_array_type, sparse_array_type +from .pycompat import dask_array_type try: import dask.array as dask_array
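Taken together, the series gives the following user-facing behaviour; the sketch below mirrors the new tests (exact integer dtypes assume a 64-bit platform)::

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"var": (("x",), np.arange(6))},
        coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)},
    )
    # Drop some rows so the stacked index has missing (x, y) combinations.
    ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"])

    # Default behaviour: missing combinations become NaN, promoting ints to float64.
    print(ds.unstack("index")["var"].dtype)                    # float64

    # fill_value lets the integer dtype survive the unstack.
    print(ds.unstack("index", fill_value=-1)["var"].dtype)     # int64

    # sparse=True backs the result with sparse.COO instead of materialising the fill.
    print(type(ds.unstack("index", sparse=True)["var"].data))  # sparse COO array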