-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BUG: Series.combine() fails with ExtensionArray inside of Series #21183
Changes from 1 commit
7469ca9
bbb6640
339b23a
61a09e7
d862e83
4c925fc
27480ac
f96372e
677fe18
9fceee7
1010cb5
aceea9f
79506ac
0e4720b
2a21117
e08f832
d3ed2c7
4ca28b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2185,18 +2185,34 @@ def _binop(self, other, func, level=None, fill_value=None): | |
|
||
this_vals, other_vals = ops.fill_binop(this.values, other.values, | ||
fill_value) | ||
|
||
with np.errstate(all='ignore'): | ||
result = func(this_vals, other_vals) | ||
name = ops.get_op_result_name(self, other) | ||
|
||
if is_extension_array_dtype(this) or is_extension_array_dtype(other): | ||
try: | ||
result = func(this_vals, other_vals) | ||
except TypeError: | ||
result = NotImplemented | ||
except Exception as e: | ||
raise e | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this last There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
|
||
if result is NotImplemented: | ||
result = [func(a, b) for a, b in zip(this_vals, other_vals)] | ||
if is_extension_array_dtype(this): | ||
excons = type(this_vals)._from_sequence | ||
else: | ||
excons = type(other_vals)._from_sequence | ||
result = excons(result) | ||
else: | ||
with np.errstate(all='ignore'): | ||
result = func(this_vals, other_vals) | ||
result = self._constructor(result, index=new_index, name=name) | ||
result = result.__finalize__(self) | ||
if name is None: | ||
# When name is None, __finalize__ overwrites current name | ||
result.name = None | ||
return result | ||
|
||
def combine(self, other, func, fill_value=np.nan): | ||
def combine(self, other, func, fill_value=None): | ||
""" | ||
Perform elementwise binary operation on two Series using given function | ||
with optional fill value when an index is missing from one Series or | ||
|
@@ -2208,6 +2224,9 @@ def combine(self, other, func, fill_value=np.nan): | |
func : function | ||
Function that takes two scalars as inputs and return a scalar | ||
fill_value : scalar value | ||
The default specifies to use np.nan unless self is | ||
backed by ExtensionArray, in which case the ExtensionArray | ||
na_value is used. | ||
|
||
Returns | ||
------- | ||
|
@@ -2227,20 +2246,33 @@ def combine(self, other, func, fill_value=np.nan): | |
Series.combine_first : Combine Series values, choosing the calling | ||
Series's values first | ||
""" | ||
self_is_ext = is_extension_array_dtype(self) | ||
if fill_value is None: | ||
if self_is_ext: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I made that change. |
||
fill_value = self.dtype.na_value | ||
else: | ||
fill_value = np.nan | ||
if isinstance(other, Series): | ||
new_index = self.index.union(other.index) | ||
new_name = ops.get_op_result_name(self, other) | ||
new_values = np.empty(len(new_index), dtype=self.dtype) | ||
new_values = [] | ||
for i, idx in enumerate(new_index): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
lv = self.get(idx, fill_value) | ||
rv = other.get(idx, fill_value) | ||
with np.errstate(all='ignore'): | ||
new_values[i] = func(lv, rv) | ||
new_values.append(func(lv, rv)) | ||
else: | ||
new_index = self.index | ||
with np.errstate(all='ignore'): | ||
new_values = func(self._values, other) | ||
if not self_is_ext: | ||
with np.errstate(all='ignore'): | ||
new_values = func(self._values, other) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also don't really understand (but this is related to the current implementation, not your changes) why we don't do it element-wise here (no loop over the values as is the case if For me, this seems like a bug in the current implementation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche You're correct. I created a new issue #21248. I will fix that here. |
||
else: | ||
new_values = [func(lv, other) for lv in self._values] | ||
new_name = self.name | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you put a comment on what is going on here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
if (self_is_ext and self.values.is_sequence_of_dtype(new_values)): | ||
new_values = self._values._from_sequence(new_values) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Under what conditions is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks. How important is it to allow coercion of output type? The previous code certainly considered dtype-preserving functions to be the expected case, since the pre-allocated Without having studied the uses of Anyway, my aversion is to having to perform a full scan of the data just to determine the dtype. That's what types are for :) Can we ask the user to provide an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider the following (using the implementation in this PR):
Note that with the implementation as in this PR, we get a Series of dtype The previous behavior would produce The implementation is already doing this element-by-element, so we are doing a full scan of both the left and right arrays. This is an extra scan on the result. We can add an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would rather not do this here at all, preferring instead to dispatch to the EA itself for a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of introducing the I would prefer not to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jorisvandenbossche My most recent commit has this change you suggested and removes In terms of whether |
||
|
||
return self._constructor(new_values, index=new_index, name=new_name) | ||
|
||
def combine_first(self, other): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,9 @@ | |
|
||
import pytest | ||
import numpy as np | ||
import pandas as pd | ||
|
||
import pandas.util.testing as tm | ||
|
||
from pandas.api.types import CategoricalDtype | ||
from pandas import Categorical | ||
|
@@ -157,3 +160,13 @@ def test_value_counts(self, all_data, dropna): | |
|
||
class TestCasting(base.BaseCastingTests): | ||
pass | ||
|
||
|
||
def test_combine(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this be moved to be within the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
orig_data1 = make_data() | ||
orig_data2 = make_data() | ||
s1 = pd.Series(Categorical(orig_data1, ordered=True)) | ||
s2 = pd.Series(Categorical(orig_data2, ordered=True)) | ||
result = s1.combine(s2, lambda x1, x2: x1 <= x2) | ||
expected = pd.Series([a <= b for (a, b) in zip(orig_data1, orig_data2)]) | ||
tm.assert_series_equal(result, expected) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -41,7 +41,7 @@ def win_types(request): | |
return request.param | ||
|
||
|
||
@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian', 'slepian']) | ||
@pytest.fixture(params=['kaiser', 'gaussian', 'general_gaussian']) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Merge conflict? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, something went wrong with git. I didn't touch that code, but did a rebase, and I think it pulled that in. |
||
def win_types_special(request): | ||
return request.param | ||
|
||
|
@@ -1079,8 +1079,7 @@ def test_cmov_window_special(self, win_types_special): | |
kwds = { | ||
'kaiser': {'beta': 1.}, | ||
'gaussian': {'std': 1.}, | ||
'general_gaussian': {'power': 2., 'width': 2.}, | ||
'slepian': {'width': 0.5}} | ||
'general_gaussian': {'power': 2., 'width': 2.}} | ||
|
||
vals = np.array([6.95, 15.21, 4.72, 9.12, 13.81, 13.49, 16.68, 9.48, | ||
10.63, 14.48]) | ||
|
@@ -1090,8 +1089,6 @@ def test_cmov_window_special(self, win_types_special): | |
13.65671, 12.01002, np.nan, np.nan], | ||
'general_gaussian': [np.nan, np.nan, 9.85011, 10.71589, 11.73161, | ||
13.08516, 12.95111, 12.74577, np.nan, np.nan], | ||
'slepian': [np.nan, np.nan, 9.81073, 10.89359, 11.70284, 12.88331, | ||
12.96079, 12.77008, np.nan, np.nan], | ||
'kaiser': [np.nan, np.nan, 9.86851, 11.02969, 11.65161, 12.75129, | ||
12.90702, 12.83757, np.nan, np.nan] | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,7 +30,7 @@ | |
is_categorical_dtype, | ||
is_interval_dtype, | ||
is_sequence, | ||
is_list_like) | ||
is_list_like, is_extension_array_dtype) | ||
from pandas.io.formats.printing import pprint_thing | ||
from pandas.core.algorithms import take_1d | ||
import pandas.core.common as com | ||
|
@@ -1118,10 +1118,12 @@ def assert_extension_array_equal(left, right): | |
right_na = right.isna() | ||
assert_numpy_array_equal(left_na, right_na) | ||
|
||
left_valid = left[~left_na].astype(object) | ||
right_valid = right[~right_na].astype(object) | ||
if len(left_na) > 0 and len(right_na) > 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the changes here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If
fails. There was some test case that was failing without this change (but I don't remember which one, and it may have been in the ops stuff). I will leave this change out and see if this fix for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given that this works
then we should ensure that Series[extension_array] works too. If you find a reproducible example, could you make a new issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I took this code out of this PR. It's not needed here. When we get to the ops stuff, I'll have to reintroduce it, and figure out the boundary case that required this particular case. |
||
|
||
assert_numpy_array_equal(left_valid, right_valid) | ||
left_valid = left[~left_na].astype(object) | ||
right_valid = right[~right_na].astype(object) | ||
|
||
assert_numpy_array_equal(left_valid, right_valid) | ||
|
||
|
||
# This could be refactored to use the NDFrame.equals method | ||
|
@@ -1224,6 +1226,9 @@ def assert_series_equal(left, right, check_dtype=True, | |
left = pd.IntervalIndex(left) | ||
right = pd.IntervalIndex(right) | ||
assert_index_equal(left, right, obj='{obj}.index'.format(obj=obj)) | ||
elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was left over from the ops stuff, so I have removed it. |
||
is_extension_array_dtype(right) and not is_categorical_dtype(right)): | ||
return assert_extension_array_equal(left.values, right.values) | ||
|
||
else: | ||
_testing.assert_almost_equal(left.get_values(), right.get_values(), | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed