From ecf3def42d7c17a866bfa2f3a84fe7014ade56e9 Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Mon, 30 Apr 2018 18:00:44 -0400 Subject: [PATCH 1/2] Support operators for ExtensionArray --- pandas/core/arrays/base.py | 21 ++++ pandas/core/indexes/base.py | 22 +++-- pandas/core/ops.py | 96 ++++++++++++++++++- pandas/core/series.py | 48 ++++++++-- pandas/tests/extension/base/getitem.py | 12 +++ .../extension/category/test_categorical.py | 13 +++ .../tests/extension/decimal/test_decimal.py | 59 ++++++++++++ pandas/tests/extension/json/array.py | 17 ++++ pandas/tests/extension/json/test_json.py | 42 ++++++++ pandas/tests/series/test_operators.py | 6 +- pandas/util/testing.py | 13 ++- 11 files changed, 327 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1922801c30719..14382f59028b4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -53,6 +53,13 @@ class ExtensionArray(object): * factorize / _values_for_factorize * argsort / _values_for_argsort + For logical operators, the default is to return a Series of boolean. + However, if the underlying ExtensionDtype overrides the logical + operators, then the implementer may want to have an ExtensionArray + subclass contain the result. This can be done by changing the property + _logical_result from its default value of None to the _from_sequence + method of the ExtensionArray subclass. + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is @@ -567,6 +574,9 @@ def copy(self, deep=False): """ raise AbstractMethodError(self) + # See documentation above + _logical_result = None + # ------------------------------------------------------------------------ # Block-related methods # ------------------------------------------------------------------------ @@ -610,3 +620,14 @@ def _ndarray_values(self): used for interacting with our indexers. """ return np.array(self) + + # ------------------------------------------------------------------------ + # Utilities for use by subclasses + # ------------------------------------------------------------------------ + def is_sequence_of_dtype(self, seq): + """ + Given a sequence, determine whether all members have the appropriate + type for this instance of an ExtensionArray + """ + thistype = self.dtype.type + return all([isinstance(i, thistype) for i in seq]) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2ceec1592d49b..6f96b78a9dbc6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3068,13 +3068,23 @@ def get_value(self, series, key): # if we have something that is Index-like, then # use this, e.g. DatetimeIndex s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - try: - return s[key] - except (IndexError, ValueError): + if is_scalar(key): + if isinstance(s, Index): + try: + return s[key] + except (IndexError, ValueError): - # invalid type as an indexer - pass + # invalid type as an indexer + pass + elif isinstance(s, ExtensionArray): + try: + # This should call the ExtensionArray __getitem__ + iloc = self.get_loc(key) + return s[iloc] + except (IndexError, ValueError): + + # invalid type as an indexer + pass s = com._values_from_object(series) k = com._values_from_object(key) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index e14f82906cd06..2939ab1c021b7 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -6,6 +6,7 @@ # necessary to enforce truediv in Python 2.X from __future__ import division import operator +import inspect import numpy as np import pandas as pd @@ -30,7 +31,7 @@ is_bool_dtype, is_list_like, is_scalar, - _ensure_object) + _ensure_object, is_extension_array_dtype) from pandas.core.dtypes.cast import ( maybe_upcast_putmask, find_common_type, construct_1d_object_array_from_listlike) @@ -990,6 +991,93 @@ def _construct_divmod_result(left, result, index, name, dtype): ) +def dispatch_to_extension_op(left, right, op_name=None, is_logical=False): + """ + Assume that left is a Series backed by an ExtensionArray, + apply the operator defined by op_name. + """ + + method = getattr(left.values, op_name, None) + deflen = len(left) + excons = type(left.values)._from_sequence + exclass = type(left.values) + testseq = left.values + + if is_logical: + if exclass._logical_result is not None: + excons = exclass._logical_result + else: + excons = None # Indicates boolean + + # The idea here is as follows. First we see if the op is + # defined in the ExtensionArray subclass, and returns a + # result that is not NotImplemented. If so, we use that + # result. If that fails, then we try an + # element by element operator, invoking the operator + # on each element + + # First see if the extension array object supports the op + res = NotImplemented + if method is not None and inspect.ismethod(method): + rvalues = right + if is_extension_array_dtype(right) and isinstance(right, ABCSeries): + rvalues = right.values + try: + res = method(rvalues) + except TypeError: + pass + except Exception as e: + raise e + + def convert_values(parm): + if is_extension_array_dtype(parm): + ovalues = parm.values + elif is_list_like(parm): + ovalues = parm + else: # Assume its an object + ovalues = [parm] * deflen + return ovalues + + if res is NotImplemented: + # Try it on each element. Support operation to another + # ExtensionArray, or something that is list like, or + # a single object. This allows a result of an operator + # to be an object or any type + lvalues = convert_values(left) + rvalues = convert_values(right) + + # Get the method for each object. + def callfunc(a, b): + f = getattr(a, op_name, None) + if f is not None: + return f(b) + else: + return NotImplemented + res = [callfunc(a, b) for (a, b) in zip(lvalues, rvalues)] + + # We can't use (NotImplemented in res) because the + # results might be objects that have overridden __eq__ + if any([isinstance(r, type(NotImplemented)) for r in res]): + msg = "invalid operation {opn} between {one} and {two}" + raise TypeError(msg.format(opn=op_name, + one=type(lvalues), + two=type(rvalues))) + + # At this point we have the result + # always return a full value series here + res_values = com._values_from_object(res) + if excons is not None: + if testseq.is_sequence_of_dtype(res_values): + # Convert to the ExtensionArray type if each result is of that + # type. If _logical_result was not None, this will then use + # the function set there to return an appropriate result + res_values = excons(res_values) + + res_name = get_op_result_name(left, right) + return left._constructor(res_values, index=left.index, + name=res_name) + + def _arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid @@ -1058,6 +1146,9 @@ def wrapper(left, right): raise TypeError("{typ} cannot perform the operation " "{op}".format(typ=type(left).__name__, op=str_rep)) + elif is_extension_array_dtype(left): + return dispatch_to_extension_op(left, right, op_name) + lvalues = left.values rvalues = right if isinstance(rvalues, ABCSeries): @@ -1208,6 +1299,9 @@ def wrapper(self, other, axis=None): return self._constructor(res_values, index=self.index, name=res_name) + elif is_extension_array_dtype(self): + return dispatch_to_extension_op(self, other, op_name, True) + elif isinstance(other, ABCSeries): # By this point we have checked that self._indexed_same(other) res_values = na_op(self.values, other.values) diff --git a/pandas/core/series.py b/pandas/core/series.py index a14f3299e11e9..3ef431ed39761 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2174,10 +2174,26 @@ def _binop(self, other, func, level=None, fill_value=None): this_vals, other_vals = ops.fill_binop(this.values, other.values, fill_value) - - with np.errstate(all='ignore'): - result = func(this_vals, other_vals) name = ops.get_op_result_name(self, other) + + if is_extension_array_dtype(this) or is_extension_array_dtype(other): + try: + result = func(this_vals, other_vals) + except TypeError: + result = NotImplemented + except Exception as e: + raise e + + if result is NotImplemented: + result = [func(a, b) for a, b in zip(this_vals, other_vals)] + if is_extension_array_dtype(this): + excons = type(this_vals)._from_sequence + else: + excons = type(other_vals)._from_sequence + result = excons(result) + else: + with np.errstate(all='ignore'): + result = func(this_vals, other_vals) result = self._constructor(result, index=new_index, name=name) result = result.__finalize__(self) if name is None: @@ -2185,7 +2201,7 @@ def _binop(self, other, func, level=None, fill_value=None): result.name = None return result - def combine(self, other, func, fill_value=np.nan): + def combine(self, other, func, fill_value=None): """ Perform elementwise binary operation on two Series using given function with optional fill value when an index is missing from one Series or @@ -2197,6 +2213,9 @@ def combine(self, other, func, fill_value=np.nan): func : function Function that takes two scalars as inputs and return a scalar fill_value : scalar value + The default specifies to use np.nan unless self is + backed by ExtensionArray, in which case the ExtensionArray + na_value is used. Returns ------- @@ -2216,20 +2235,33 @@ def combine(self, other, func, fill_value=np.nan): Series.combine_first : Combine Series values, choosing the calling Series's values first """ + self_is_ext = is_extension_array_dtype(self) + if fill_value is None: + if self_is_ext: + fill_value = self.dtype.na_value + else: + fill_value = np.nan if isinstance(other, Series): new_index = self.index.union(other.index) new_name = ops.get_op_result_name(self, other) - new_values = np.empty(len(new_index), dtype=self.dtype) + new_values = [] for i, idx in enumerate(new_index): lv = self.get(idx, fill_value) rv = other.get(idx, fill_value) with np.errstate(all='ignore'): - new_values[i] = func(lv, rv) + new_values.append(func(lv, rv)) else: new_index = self.index - with np.errstate(all='ignore'): - new_values = func(self._values, other) + if not self_is_ext: + with np.errstate(all='ignore'): + new_values = func(self._values, other) + else: + new_values = [func(lv, other) for lv in self._values] new_name = self.name + + if (self_is_ext and self.values.is_sequence_of_dtype(new_values)): + new_values = self._values._from_sequence(new_values) + return self._constructor(new_values, index=new_index, name=new_name) def combine_first(self, other): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5c9ede1079079..238ab81e009df 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -117,6 +117,18 @@ def test_getitem_slice(self, data): result = data[slice(1)] # scalar assert isinstance(result, type(data)) + def test_get(self, data): + # GH 20882 + s = pd.Series(data, index=[2 * i for i in range(len(data))]) + assert s.get(4) == s.iloc[2] + + result = s.get([4, 6]) + expected = s.iloc[[2, 3]] + self.assert_series_equal(result, expected) + + s = pd.Series(data[:6], index=list('abcdef')) + assert s.get('c') == s.iloc[2] + def test_take_sequence(self, data): result = pd.Series(data)[[0, 1, 3]] assert result.iloc[0] == data[0] diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 530a4e7a22a7a..3e9a97cfae402 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -2,6 +2,9 @@ import pytest import numpy as np +import pandas as pd + +import pandas.util.testing as tm from pandas.api.types import CategoricalDtype from pandas import Categorical @@ -157,3 +160,13 @@ def test_value_counts(self, all_data, dropna): class TestCasting(base.BaseCastingTests): pass + + +def test_combine(): + orig_data1 = make_data() + orig_data2 = make_data() + s1 = pd.Series(Categorical(orig_data1, ordered=True)) + s2 = pd.Series(Categorical(orig_data2, ordered=True)) + result = s1.combine(s2, lambda x1, x2: x1 <= x2) + expected = pd.Series([a <= b for (a, b) in zip(orig_data1, orig_data2)]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 1f8cf0264f62f..32690595bcd2b 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -7,6 +7,9 @@ from pandas.tests.extension import base +from pandas.tests.series.test_operators import TestSeriesOperators +from pandas.util._decorators import cache_readonly + from .array import DecimalDtype, DecimalArray, make_data @@ -183,3 +186,59 @@ def test_dataframe_constructor_with_different_dtype_raises(): xpr = "Cannot coerce extension array to dtype 'int64'. " with tm.assert_raises_regex(ValueError, xpr): pd.DataFrame({"A": arr}, dtype='int64') + + +def test_addition(data): + s = pd.Series(data) + result = s + 10 + expected = pd.Series(DecimalArray([i + 10 for i in data])) + tm.assert_series_equal(result, expected) + + result = 10 + s + tm.assert_series_equal(result, expected) + + result = s + s + expected = pd.Series(DecimalArray([i + i for i in data])) + tm.assert_series_equal(result, expected) + + result = s + list(data) + tm.assert_series_equal(result, expected) + + result = list(data) + s + tm.assert_series_equal(result, expected) + + result = (s <= 10) + expected = pd.Series([i <= 10 for i in data]) + tm.assert_series_equal(result, expected) + +_ts = pd.Series(DecimalArray(make_data())) + + +class TestOperator(BaseDecimal, TestSeriesOperators): + @cache_readonly + def ts(self): + ts = _ts.copy() + ts.name = 'ts' + return ts + + def test_operators(self): + def absfunc(v): + if isinstance(v, pd.Series): + vals = v.values + return pd.Series(vals._from_sequence([abs(i) for i in vals])) + else: + return abs(v) + context = decimal.getcontext() + divbyzerotrap = context.traps[decimal.DivisionByZero] + invalidoptrap = context.traps[decimal.InvalidOperation] + context.traps[decimal.DivisionByZero] = 0 + context.traps[decimal.InvalidOperation] = 0 + super(TestOperator, self).test_operators(absfunc) + context.traps[decimal.DivisionByZero] = divbyzerotrap + context.traps[decimal.InvalidOperation] = invalidoptrap + + def test_operators_corner(self): + pytest.skip("Cannot add empty Series of float64 to DecimalArray") + + def test_divmod(self): + pytest.skip("divmod not appropriate for Decimal type") diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 88bb66f38b35c..334336700e79d 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -169,6 +169,23 @@ def _values_for_argsort(self): frozen = [()] + list(tuple(x.items()) for x in self) return np.array(frozen, dtype=object)[1:] + def __add__(self, other): + def merge_two_dicts(x, y): + z = x.copy() + z.update(y) + return z + + if isinstance(other, type(self)): + seq = [merge_two_dicts(a, b) + for (a, b) in zip(self.data, other.data)] + elif isinstance(other, self.dtype.type): + seq = [merge_two_dicts(a, other) + for a in self.data] + else: + raise TypeError("Cannot add JSONArray and type ", type(other)) + + return self._from_sequence(seq) + def make_data(): # TODO: Use a regular dict. See _NDFrameIndexer._setitem_with_indexer diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index b7ac8033f3f6d..3987fd7004730 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -230,3 +230,45 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): super(TestGroupby, self).test_groupby_extension_agg( as_index, data_for_grouping ) + + +class TestSomeOps(BaseJSON): + def test_some_ops(self): + # Just testing that addition through the JSONArray works + # and then subtraction raises a TypeError + d1 = make_data() + d2 = make_data() + s1 = pd.Series(JSONArray(d1)) + s2 = pd.Series(JSONArray(d2)) + result = s1 + s2 + + def merge_two_dicts(x, y): + z = x.copy() + z.update(y) + return z + + expected = pd.Series(JSONArray([merge_two_dicts(a, b) + for (a, b) in zip(d1, d2)])) + self.assert_series_equal(result, expected) + + toadd = s2.iloc[5] + result = s1 + toadd + expected = pd.Series(JSONArray([merge_two_dicts(a, toadd) + for a in d1])) + self.assert_series_equal(result, expected) + + with pytest.raises(TypeError): + # __add__ is implemented, but __radd__ is not + result = toadd + s1 + + with pytest.raises(TypeError): + # Cannot add a constant + result = s1 + 29 + + with pytest.raises(TypeError): + # __sub__ is not implemented + result = s1 - toadd + + with pytest.raises(TypeError): + # __sub__ is not implemented + result = s1 - s2 diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index f90fcce973f00..8e27b4a2e421d 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -1198,11 +1198,11 @@ def test_neg(self): def test_invert(self): assert_series_equal(-(self.series < 0), ~(self.series < 0)) - def test_operators(self): + def test_operators(self, absfunc=np.abs): def _check_op(series, other, op, pos_only=False, check_dtype=True): - left = np.abs(series) if pos_only else series - right = np.abs(other) if pos_only else other + left = absfunc(series) if pos_only else series + right = absfunc(other) if pos_only else other cython_or_numpy = op(left, right) python = left.combine(right, op) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index e1484a9c1b390..1a7fc7940f56e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -30,7 +30,7 @@ is_categorical_dtype, is_interval_dtype, is_sequence, - is_list_like) + is_list_like, is_extension_array_dtype) from pandas.io.formats.printing import pprint_thing from pandas.core.algorithms import take_1d import pandas.core.common as com @@ -1113,10 +1113,12 @@ def assert_extension_array_equal(left, right): right_na = right.isna() assert_numpy_array_equal(left_na, right_na) - left_valid = left[~left_na].astype(object) - right_valid = right[~right_na].astype(object) + if len(left_na) > 0 and len(right_na) > 0: - assert_numpy_array_equal(left_valid, right_valid) + left_valid = left[~left_na].astype(object) + right_valid = right[~right_na].astype(object) + + assert_numpy_array_equal(left_valid, right_valid) # This could be refactored to use the NDFrame.equals method @@ -1219,6 +1221,9 @@ def assert_series_equal(left, right, check_dtype=True, left = pd.IntervalIndex(left) right = pd.IntervalIndex(right) assert_index_equal(left, right, obj='{obj}.index'.format(obj=obj)) + elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and + is_extension_array_dtype(right) and not is_categorical_dtype(right)): + return assert_extension_array_equal(left.values, right.values) else: _testing.assert_almost_equal(left.get_values(), right.get_values(), From 40ac877c99e67ed4ecf1afbcb8c0e3b1568dc14b Mon Sep 17 00:00:00 2001 From: Dr-Irv Date: Tue, 1 May 2018 12:18:00 -0400 Subject: [PATCH 2/2] fix lint issues --- pandas/core/arrays/base.py | 2 +- pandas/core/ops.py | 2 +- pandas/tests/extension/decimal/test_decimal.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 14382f59028b4..e65d02bdbcf14 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -630,4 +630,4 @@ def is_sequence_of_dtype(self, seq): type for this instance of an ExtensionArray """ thistype = self.dtype.type - return all([isinstance(i, thistype) for i in seq]) + return all(isinstance(i, thistype) for i in seq) diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 2939ab1c021b7..294c591de6959 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -1057,7 +1057,7 @@ def callfunc(a, b): # We can't use (NotImplemented in res) because the # results might be objects that have overridden __eq__ - if any([isinstance(r, type(NotImplemented)) for r in res]): + if any(isinstance(r, type(NotImplemented)) for r in res): msg = "invalid operation {opn} between {one} and {two}" raise TypeError(msg.format(opn=op_name, one=type(lvalues), diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 32690595bcd2b..06da37300d24f 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -211,6 +211,7 @@ def test_addition(data): expected = pd.Series([i <= 10 for i in data]) tm.assert_series_equal(result, expected) + _ts = pd.Series(DecimalArray(make_data()))