-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
sparse option to reindex and unstack #3542
Changes from 9 commits
4a6237a
e7b470d
1df4a3c
6a66831
3a369a1
6fe30e2
179cc1f
5d8ab27
13ad683
ac41ef8
92ce6cd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -993,6 +993,36 @@ def chunk(self, chunks=None, name=None, lock=False): | |
|
||
return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) | ||
|
||
def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Currently, this is a private method. |
||
""" | ||
use sparse-array as backend. | ||
""" | ||
import sparse | ||
|
||
# TODO what to do if dask-backended? | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hopefully sparse will raise an error if you try to convert a dask array into a sparse array! If not, we should do that ourselves. In the long term, the best solution would be to convert a dask array from dense chunks to sparse chunks. |
||
if fill_value is dtypes.NA: | ||
dtype, fill_value = dtypes.maybe_promote(self.dtype) | ||
else: | ||
dtype = dtypes.result_type(self.dtype, fill_value) | ||
|
||
if sparse_format is _default: | ||
sparse_format = "coo" | ||
try: | ||
as_sparse = getattr(sparse, "as_{}".format(sparse_format.lower())) | ||
except AttributeError: | ||
raise ValueError("{} is not a valid sparse format".format(sparse_format)) | ||
|
||
data = as_sparse(self.data.astype(dtype), fill_value=fill_value) | ||
return self._replace(data=data) | ||
|
||
def _to_dense(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also private, as is […]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should make these public in […]. Can be left for a future PR though :) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
""" | ||
Change backend from sparse to np.array | ||
""" | ||
if hasattr(self._data, "todense"): | ||
return self._replace(data=self._data.todense()) | ||
return self.copy(deep=False) | ||
|
||
def isel( | ||
self: VariableType, | ||
indexers: Mapping[Hashable, Any] = None, | ||
|
@@ -2021,6 +2051,14 @@ def chunk(self, chunks=None, name=None, lock=False): | |
# Dummy - do not chunk. This method is invoked e.g. by Dataset.chunk() | ||
return self.copy(deep=False) | ||
|
||
def _as_sparse(self, sparse_format=_default, fill_value=_default): | ||
# Dummy | ||
return self.copy(deep=False) | ||
|
||
def _to_dense(self): | ||
# Dummy | ||
return self.copy(deep=False) | ||
|
||
def _finalize_indexing_result(self, dims, data): | ||
if getattr(data, "ndim", 0) != 1: | ||
# returns Variable rather than IndexVariable if multi-dimensional | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2811,6 +2811,25 @@ def test_unstack_fill_value(self): | |
expected = ds["var"].unstack("index").fillna(-1).astype(np.int) | ||
assert actual.equals(expected) | ||
|
||
@requires_sparse | ||
def test_unstack_sparse(self): | ||
ds = xr.Dataset( | ||
{"var": (("x",), np.arange(6))}, | ||
coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)}, | ||
) | ||
# make ds incomplete | ||
ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"]) | ||
# test fill_value | ||
actual = ds.unstack("index", sparse=True) | ||
expected = ds.unstack("index") | ||
assert actual["var"].variable._to_dense().equals(expected["var"].variable) | ||
assert actual["var"].data.density < 1.0 | ||
|
||
actual = ds["var"].unstack("index", sparse=True) | ||
expected = ds["var"].unstack("index") | ||
assert actual.variable._to_dense().equals(expected.variable) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we test whether […]? |
||
assert actual.data.density < 1.0 | ||
|
||
def test_stack_unstack_fast(self): | ||
ds = Dataset( | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
assert_identical, | ||
raises_regex, | ||
requires_dask, | ||
requires_sparse, | ||
source_ndarray, | ||
) | ||
|
||
|
@@ -1862,6 +1863,17 @@ def test_getitem_with_mask_nd_indexer(self): | |
) | ||
|
||
|
||
@requires_sparse | ||
class TestVariableWithSparse: | ||
# TODO inherit VariableSubclassobjects to cover more tests | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
||
def test_as_sparse(self): | ||
data = np.arange(12).reshape(3, 4) | ||
var = Variable(("x", "y"), data)._as_sparse(fill_value=-1) | ||
actual = var._to_dense() | ||
assert_identical(var, actual) | ||
|
||
|
||
class TestIndexVariable(VariableSubclassobjects): | ||
cls = staticmethod(IndexVariable) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am a little surprised this is necessary. Does `sparse` not support `__array_function__` for `np.where`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Well, yes. `sparse` does not seem to work with `np.result_type` and `astype(copy=False)`.
I'll add a TODO here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have the latest version of sparse installed?
When I test this on my machine, it works:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. You are right.
I was running with sparse 0.7.0. With 0.8.0, it works.