From 9e8d78277185bf907d5ee2dd4f7dd661ab438f7f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 31 Mar 2016 04:28:22 +0900 Subject: [PATCH] CLN: Move boxing logic to BlockManager --- doc/source/whatsnew/v0.18.1.txt | 8 +- pandas/core/common.py | 10 - pandas/core/frame.py | 52 +---- pandas/core/internals.py | 121 +++++++--- pandas/core/series.py | 91 ++------ pandas/tests/frame/test_analytics.py | 212 +---------------- pandas/tests/frame/test_apply.py | 18 ++ pandas/tests/frame/test_quantile.py | 319 ++++++++++++++++++++++++++ pandas/tests/series/test_analytics.py | 262 +-------------------- pandas/tests/series/test_apply.py | 257 +++++++++++++++++++++ pandas/tests/series/test_misc_api.py | 36 +++ pandas/tests/series/test_quantile.py | 178 ++++++++++++++ pandas/tseries/base.py | 5 + 13 files changed, 942 insertions(+), 627 deletions(-) create mode 100644 pandas/tests/frame/test_quantile.py create mode 100644 pandas/tests/series/test_apply.py create mode 100644 pandas/tests/series/test_quantile.py diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index f20b961455ba7..5af0b97173da2 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -204,10 +204,14 @@ Bug Fixes - Bug in ``concat`` raises ``AttributeError`` when input data contains tz-aware datetime and timedelta (:issue:`12620`) - - - Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`) - Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`) - Bug in ``.describe()`` resets categorical columns information (:issue:`11558`) - Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`) - ``pd.read_excel()`` now accepts path objects (e.g. 
``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`) + + + + +- Bug in ``.quantile`` with interpolation may coerce to ``float`` unexpectedly (:issue:`12772`) +- Bug in ``.quantile`` with empty Series may return scalar rather than empty Series (:issue:`12772`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 379e59394b6f5..6de6da4afedc8 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2377,16 +2377,6 @@ def needs_i8_conversion(arr_or_dtype): is_datetime64tz_dtype(arr_or_dtype)) -def i8_boxer(arr_or_dtype): - """ return the scalar boxer for the dtype """ - if (is_datetime64_dtype(arr_or_dtype) or - is_datetime64tz_dtype(arr_or_dtype)): - return lib.Timestamp - elif is_timedelta64_dtype(arr_or_dtype): - return lambda x: lib.Timedelta(x, unit='ns') - raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype)) - - def is_numeric_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, (np.number, np.bool_)) and diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a504f91705733..af03f1a17ea75 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -40,7 +40,6 @@ from pandas.core.categorical import Categorical import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval -from numpy import percentile as _quantile from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) from pandas import compat @@ -63,7 +62,6 @@ import pandas.algos as _algos from pandas.core.config import get_option -from pandas import _np_version_under1p9 # --------------------------------------------------------------------- # Docstring templates @@ -4227,10 +4225,7 @@ def applymap(self, func): # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): - if com.needs_i8_conversion(x): - f = com.i8_boxer(x) - x = 
lib.map_infer(_values_from_object(x), f) - return lib.map_infer(_values_from_object(x), func) + return lib.map_infer(x.asobject, func) return self.apply(infer) @@ -4974,55 +4969,26 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, 0.1 1.3 3.7 0.5 2.5 55.0 """ - self._check_percentile(q) - per = np.asarray(q) * 100 - - if not com.is_list_like(per): - per = [per] + if not com.is_list_like(q): q = [q] squeeze = True else: squeeze = False - if _np_version_under1p9: - if interpolation != 'linear': - raise ValueError("Interpolation methods other than linear " - "are not supported in numpy < 1.9") - - def f(arr, per, interpolation): - if arr._is_datelike_mixed_type: - values = _values_from_object(arr).view('i8') - else: - values = arr.astype(float) - values = values[notnull(values)] - if len(values) == 0: - return NA - else: - if _np_version_under1p9: - return _quantile(values, per) - else: - return _quantile(values, per, interpolation=interpolation) - data = self._get_numeric_data() if numeric_only else self - axis = self._get_axis_number(axis) + def _quantile(series): + res = series.quantile(q, interpolation=interpolation) + return series.name, res + if axis == 1: data = data.T - # need to know which cols are timestamp going in so that we can - # map timestamp over them after getting the quantile. 
- is_dt_col = data.dtypes.map(com.is_datetime64_dtype) - is_dt_col = is_dt_col[is_dt_col].index - - quantiles = [[f(vals, x, interpolation) for x in per] - for (_, vals) in data.iteritems()] - - result = self._constructor(quantiles, index=data._info_axis, - columns=q).T - if len(is_dt_col) > 0: - result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp) + # unable to use DataFrame.apply, because data may be empty + result = dict(_quantile(s) for (_, s) in data.iteritems()) + result = self._constructor(result, columns=data.columns) if squeeze: if result.shape == (1, 1): result = result.T.iloc[:, 0] # don't want scalar diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a31bd347e674a..1b29ececa984a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -6,6 +6,8 @@ from collections import defaultdict import numpy as np +from numpy import percentile as _quantile + from pandas.core.base import PandasObject from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE, @@ -131,6 +133,8 @@ def get_values(self, dtype=None): return an internal format, currently just the ndarray this is often overriden to handle to_dense like operations """ + if com.is_object_dtype(dtype): + return self.values.astype(object) return self.values def to_dense(self): @@ -141,6 +145,10 @@ def to_object_block(self, mgr): values = self.get_values(dtype=object) return self.make_block(values, klass=ObjectBlock) + @property + def _na_value(self): + return np.nan + @property def fill_value(self): return np.nan @@ -1247,6 +1255,19 @@ def equals(self, other): return False return array_equivalent(self.values, other.values) + def quantile(self, values, qs, **kwargs): + if len(values) == 0: + if com.is_list_like(qs): + return np.array([self.fill_value]) + else: + return self._na_value + + if com.is_list_like(qs): + values = [_quantile(values, x * 100, **kwargs) for x in qs] + return np.array(values) + else: + return _quantile(values, qs * 100, 
**kwargs) + class NonConsolidatableMixIn(object): """ hold methods for the nonconsolidatable blocks """ @@ -1455,15 +1476,55 @@ def should_store(self, value): return com.is_integer_dtype(value) and value.dtype == self.dtype -class TimeDeltaBlock(IntBlock): +class DatetimeLikeBlockMixin(object): + + @property + def _na_value(self): + return tslib.NaT + + @property + def fill_value(self): + return tslib.iNaT + + def _try_operate(self, values): + """ return a version to operate on """ + return values.view('i8') + + def get_values(self, dtype=None): + """ + return object dtype as boxed values, such as Timestamps/Timedelta + """ + if com.is_object_dtype(dtype): + return lib.map_infer(self.values.ravel(), + self._box_func).reshape(self.values.shape) + return self.values + + def quantile(self, values, qs, **kwargs): + values = values.view('i8') + mask = values == self.fill_value + if mask.any(): + values = values[~mask] + result = Block.quantile(self, values, qs, **kwargs) + + if com.is_datetime64tz_dtype(self): + # TODO: temporary logic to avoid GH 12619 and GH 12772, + # which affect DatetimeTZBlock._try_coerce_result for np.ndarray + if isinstance(result, np.ndarray) and values.ndim > 0: + result = self._holder(result, tz='UTC') + result = result.tz_convert(self.values.tz) + return result + return self._try_coerce_result(result) + + +class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () is_timedelta = True _can_hold_na = True is_numeric = False @property - def fill_value(self): - return tslib.iNaT + def _box_func(self): + return lambda x: tslib.Timedelta(x, unit='ns') def fillna(self, value, **kwargs): @@ -1516,10 +1577,6 @@ def _try_coerce_args(self, values, other): return values, values_mask, other, other_mask - def _try_operate(self, values): - """ return a version to operate on """ - return values.view('i8') - def _try_coerce_result(self, result): """ reverse of try_coerce_args / try_operate """ if isinstance(result, np.ndarray): @@ -1527,8 +1584,8 @@ 
def _try_coerce_result(self, result): if result.dtype.kind in ['i', 'f', 'O']: result = result.astype('m8[ns]') result[mask] = tslib.iNaT - elif isinstance(result, np.integer): - result = lib.Timedelta(result) + elif isinstance(result, (np.integer, np.float)): + result = self._box_func(result) return result def should_store(self, value): @@ -1558,13 +1615,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, dtype=object) return rvalues - def get_values(self, dtype=None): - # return object dtypes as Timedelta - if dtype == object: - return lib.map_infer(self.values.ravel(), - lib.Timedelta).reshape(self.values.shape) - return self.values - class BoolBlock(NumericBlock): __slots__ = () @@ -1954,7 +2004,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs): return values.reshape(1, len(values)) -class DatetimeBlock(Block): +class DatetimeBlock(DatetimeLikeBlockMixin, Block): __slots__ = () is_datetime = True _can_hold_na = True @@ -1998,10 +2048,6 @@ def _try_cast(self, element): except: return element - def _try_operate(self, values): - """ return a version to operate on """ - return values.view('i8') - def _try_coerce_args(self, values, other): """ Coerce values and other to dtype 'i8'. 
NaN and NaT convert to @@ -2029,7 +2075,7 @@ def _try_coerce_args(self, values, other): other = tslib.iNaT other_mask = True elif isinstance(other, (datetime, np.datetime64, date)): - other = lib.Timestamp(other) + other = self._box_func(other) if getattr(other, 'tz') is not None: raise TypeError("cannot coerce a Timestamp with a tz on a " "naive Block") @@ -2056,13 +2102,13 @@ def _try_coerce_result(self, result): if isinstance(result, np.ndarray): if result.dtype.kind in ['i', 'f', 'O']: result = result.astype('M8[ns]') - elif isinstance(result, (np.integer, np.datetime64)): - result = lib.Timestamp(result) + elif isinstance(result, (np.integer, np.float, np.datetime64)): + result = self._box_func(result) return result @property - def fill_value(self): - return tslib.iNaT + def _box_func(self): + return tslib.Timestamp def to_native_types(self, slicer=None, na_rep=None, date_format=None, quoting=None, **kwargs): @@ -2098,13 +2144,6 @@ def set(self, locs, values, check=False): self.values[locs] = values - def get_values(self, dtype=None): - # return object dtype as Timestamps - if dtype == object: - return lib.map_infer( - self.values.ravel(), lib.Timestamp).reshape(self.values.shape) - return self.values - class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ @@ -2145,7 +2184,7 @@ def external_values(self): def get_values(self, dtype=None): # return object dtype as Timestamps with the zones - if dtype == object: + if com.is_object_dtype(dtype): f = lambda x: lib.Timestamp(x, tz=self.values.tz) return lib.map_infer( self.values.ravel(), f).reshape(self.values.shape) @@ -2228,10 +2267,14 @@ def _try_coerce_result(self, result): if isinstance(result, np.ndarray): result = self._holder(result, tz=self.values.tz) - elif isinstance(result, (np.integer, np.datetime64)): + elif isinstance(result, (np.integer, np.float, np.datetime64)): result = lib.Timestamp(result, tz=self.values.tz) return result + 
@property + def _box_func(self): + return lambda x: tslib.Timestamp(x, tz=self.dtype.tz) + def shift(self, periods, axis=0, mgr=None): """ shift the block by periods """ @@ -3852,6 +3895,14 @@ def get_values(self): """ return a dense type view """ return np.array(self._block.to_dense(), copy=False) + @property + def asobject(self): + """ + return an object dtype array. datetime/timedelta like values are boxed + to Timestamp/Timedelta instances. + """ + return self._block.get_values(dtype=object) + @property def itemsize(self): return self._block.values.itemsize diff --git a/pandas/core/series.py b/pandas/core/series.py index cc58b32de999a..ce0600b9329ca 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -17,8 +17,8 @@ _default_index, _maybe_upcast, _asarray_tuplesafe, _infer_dtype_from_scalar, is_list_like, _values_from_object, - is_categorical_dtype, needs_i8_conversion, - i8_boxer, _possibly_cast_to_datetime, + is_categorical_dtype, + _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, _try_sort, is_internal_type, is_datetimetz, _maybe_match_name, ABCSparseArray, @@ -56,7 +56,6 @@ import pandas.tslib as tslib import pandas.index as _index -from numpy import percentile as _quantile from pandas.core.config import get_option from pandas import _np_version_under1p9 @@ -374,6 +373,15 @@ def get_values(self): """ same as values (but handles sparseness conversions); is a view """ return self._data.get_values() + @property + def asobject(self): + """ + return object dtype array which contains boxed values + + *this is an internal non-public method* + """ + return self._data.asobject + # ops def ravel(self, order='C'): """ @@ -1050,9 +1058,8 @@ def _get_repr(self, name=False, header=True, index=True, length=True, def __iter__(self): """ provide iteration over the values of the Series box values if necessary """ - if needs_i8_conversion(self.dtype): - boxer = i8_boxer(self) - return (boxer(x) for x in self._values) + if 
com.is_datetimelike(self): + return (_maybe_box_datetimelike(x) for x in self._values) else: return iter(self._values) @@ -1343,21 +1350,20 @@ def quantile(self, q=0.5, interpolation='linear'): raise ValueError("Interpolation methods other than linear " "are not supported in numpy < 1.9.") - def multi(values, qs, **kwargs): - if com.is_list_like(qs): - values = [_quantile(values, x * 100, **kwargs) for x in qs] - # let empty result to be Float64Index - qs = Float64Index(qs) - return self._constructor(values, index=qs, name=self.name) - else: - return _quantile(values, qs * 100, **kwargs) - kwargs = dict() if not _np_version_under1p9: kwargs.update({'interpolation': interpolation}) - return self._maybe_box(lambda values: multi(values, q, **kwargs), - dropna=True) + result = self._data._block.quantile(self.dropna()._values, + q, **kwargs) + + if com.is_list_like(result): + # explicitly use Float64Index to coerce empty result to float dtype + index = Float64Index(q) + return self._constructor(result, index=index, name=self.name) + else: + # scalar + return result def corr(self, other, method='pearson', min_periods=None): """ @@ -2061,10 +2067,7 @@ def map(self, arg, na_action=None): y : Series same index as caller """ - values = self._values - if needs_i8_conversion(values.dtype): - boxer = i8_boxer(values) - values = lib.map_infer(values, boxer) + values = self.asobject if na_action == 'ignore': mask = isnull(values) @@ -2194,12 +2197,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): if isinstance(f, np.ufunc): return f(self) - values = _values_from_object(self) - if needs_i8_conversion(values.dtype): - boxer = i8_boxer(values) - values = lib.map_infer(values, boxer) - - mapped = lib.map_infer(values, f, convert=convert_dtype) + mapped = lib.map_infer(self.asobject, f, convert=convert_dtype) if len(mapped) and isinstance(mapped[0], Series): from pandas.core.frame import DataFrame return DataFrame(mapped.tolist(), index=self.index) @@ -2229,45 +2227,6 
@@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, numeric_only=numeric_only, filter_type=filter_type, **kwds) - def _maybe_box(self, func, dropna=False): - """ - evaluate a function with possible input/output conversion if we are i8 - - Parameters - ---------- - dropna : bool, default False - whether to drop values if necessary - - """ - if dropna: - values = self.dropna()._values - else: - values = self._values - - if needs_i8_conversion(self): - boxer = i8_boxer(self) - - if len(values) == 0: - return boxer(tslib.iNaT) - - values = values.view('i8') - result = func(values) - - if com.is_list_like(result): - result = result.map(boxer) - else: - result = boxer(result) - - else: - - # let the function return nan if appropriate - if dropna: - if len(values) == 0: - return np.nan - result = func(values) - - return result - def _reindex_indexer(self, new_index, indexer, copy): if indexer is None: if copy: diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 74682c506c769..a395c667188eb 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -13,7 +13,7 @@ from pandas.compat import lrange from pandas import (compat, isnull, notnull, DataFrame, Series, - MultiIndex, date_range, Timestamp, _np_version_under1p11) + MultiIndex, date_range, Timestamp) import pandas as pd import pandas.core.common as com import pandas.core.nanops as nanops @@ -25,8 +25,6 @@ assertRaisesRegexp) import pandas.util.testing as tm -from pandas import _np_version_under1p9 - from pandas.tests.frame.common import TestData @@ -503,214 +501,6 @@ def test_numeric_only_flag(self): self.assertRaises(TypeError, lambda: getattr(df2, meth) (axis=1, numeric_only=False)) - def test_quantile(self): - from numpy import percentile - - q = self.tsframe.quantile(0.1, axis=0) - self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) - q = self.tsframe.quantile(0.9, axis=1) - q = self.intframe.quantile(0.1) - 
self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) - - # test degenerate case - q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) - assert(np.isnan(q['x']) and np.isnan(q['y'])) - - # non-numeric exclusion - df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) - rs = df.quantile(0.5) - xp = df.median() - assert_series_equal(rs, xp) - - # axis - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1) - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) - assert_series_equal(result, expected) - - result = df.quantile([.5, .75], axis=1) - expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75], - 3: [3.5, 3.75]}, index=[0.5, 0.75]) - assert_frame_equal(result, expected, check_index_type=True) - - # We may want to break API in the future to change this - # so that we exclude non-numeric along the same axis - # See GH #7312 - df = DataFrame([[1, 2, 3], - ['a', 'b', 4]]) - result = df.quantile(.5, axis=1) - expected = Series([3., 4.], index=[0, 1]) - assert_series_equal(result, expected) - - def test_quantile_axis_parameter(self): - # GH 9543/9544 - - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - - result = df.quantile(.5, axis=0) - - expected = Series([2., 3.], index=["A", "B"]) - assert_series_equal(result, expected) - - expected = df.quantile(.5, axis="index") - assert_series_equal(result, expected) - - result = df.quantile(.5, axis=1) - - expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) - assert_series_equal(result, expected) - - result = df.quantile(.5, axis="columns") - assert_series_equal(result, expected) - - self.assertRaises(ValueError, df.quantile, 0.1, axis=-1) - self.assertRaises(ValueError, df.quantile, 0.1, axis="column") - - def test_quantile_interpolation(self): - # GH #10174 - if _np_version_under1p9: - raise nose.SkipTest("Numpy version under 1.9") - - from numpy import percentile - - # interpolation = linear (default case) - q = 
self.tsframe.quantile(0.1, axis=0, interpolation='linear') - self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) - q = self.intframe.quantile(0.1) - self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) - - # test with and without interpolation keyword - q1 = self.intframe.quantile(0.1) - self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) - assert_series_equal(q, q1) - - # interpolation method other than default linear - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - result = df.quantile(.5, axis=1, interpolation='nearest') - expected = Series([1., 2., 3.], index=[1, 2, 3]) - assert_series_equal(result, expected) - - # axis - result = df.quantile([.5, .75], axis=1, interpolation='lower') - expected = DataFrame({1: [1., 1.], 2: [2., 2.], - 3: [3., 3.]}, index=[0.5, 0.75]) - assert_frame_equal(result, expected) - - # test degenerate case - df = DataFrame({'x': [], 'y': []}) - q = df.quantile(0.1, axis=0, interpolation='higher') - assert(np.isnan(q['x']) and np.isnan(q['y'])) - - # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5], interpolation='midpoint') - - # https://github.com/numpy/numpy/issues/7163 - if _np_version_under1p11: - expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]], - index=[.25, .5], columns=['a', 'b', 'c']) - else: - expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], - index=[.25, .5], columns=['a', 'b', 'c']) - assert_frame_equal(result, expected) - - def test_quantile_interpolation_np_lt_1p9(self): - # GH #10174 - if not _np_version_under1p9: - raise nose.SkipTest("Numpy version is greater than 1.9") - - from numpy import percentile - - # interpolation = linear (default case) - q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') - self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) - q = self.intframe.quantile(0.1) - self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) - - # test with and 
without interpolation keyword - q1 = self.intframe.quantile(0.1) - self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) - assert_series_equal(q, q1) - - # interpolation method other than default linear - expErrMsg = "Interpolation methods other than linear" - df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) - with assertRaisesRegexp(ValueError, expErrMsg): - df.quantile(.5, axis=1, interpolation='nearest') - - with assertRaisesRegexp(ValueError, expErrMsg): - df.quantile([.5, .75], axis=1, interpolation='lower') - - # test degenerate case - df = DataFrame({'x': [], 'y': []}) - with assertRaisesRegexp(ValueError, expErrMsg): - q = df.quantile(0.1, axis=0, interpolation='higher') - - # multi - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - with assertRaisesRegexp(ValueError, expErrMsg): - df.quantile([.25, .5], interpolation='midpoint') - - def test_quantile_multi(self): - df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], - columns=['a', 'b', 'c']) - result = df.quantile([.25, .5]) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=['a', 'b', 'c']) - assert_frame_equal(result, expected) - - # axis = 1 - result = df.quantile([.25, .5], axis=1) - expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], - index=[.25, .5], columns=[0, 1, 2]) - - # empty - result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) - expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, - index=[.1, .9]) - assert_frame_equal(result, expected) - - def test_quantile_datetime(self): - df = DataFrame({'a': pd.to_datetime(['2010', '2011']), 'b': [0, 5]}) - - # exclude datetime - result = df.quantile(.5) - expected = Series([2.5], index=['b']) - - # datetime - result = df.quantile(.5, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], - index=['a', 'b']) - assert_series_equal(result, expected) - - # datetime w/ multi - result = df.quantile([.5], 
numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], - index=[.5], columns=['a', 'b']) - assert_frame_equal(result, expected) - - # axis = 1 - df['c'] = pd.to_datetime(['2011', '2012']) - result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) - expected = Series([Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')], - index=[0, 1]) - assert_series_equal(result, expected) - - result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) - expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), - Timestamp('2011-07-02 12:00:00')]], - index=[0.5], columns=[0, 1]) - assert_frame_equal(result, expected) - - def test_quantile_invalid(self): - msg = 'percentiles should all be in the interval \\[0, 1\\]' - for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: - with tm.assertRaisesRegexp(ValueError, msg): - self.tsframe.quantile(invalid) - def test_cumsum(self): self.tsframe.ix[5:10, 0] = nan self.tsframe.ix[10:15, 1] = nan diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 3312e83bae419..2b619b84a5994 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -403,6 +403,24 @@ def test_applymap(self): for f in ['datetime', 'timedelta']: self.assertEqual(result.loc[0, f], str(df.loc[0, f])) + def test_applymap_box(self): + # ufunc will not be boxed. 
Same test cases as the test_map_box + df = pd.DataFrame({'a': [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02')], + 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')], + 'c': [pd.Timedelta('1 days'), + pd.Timedelta('2 days')], + 'd': [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')]}) + + res = df.applymap(lambda x: '{0}'.format(x.__class__.__name__)) + exp = pd.DataFrame({'a': ['Timestamp', 'Timestamp'], + 'b': ['Timestamp', 'Timestamp'], + 'c': ['Timedelta', 'Timedelta'], + 'd': ['Period', 'Period']}) + tm.assert_frame_equal(res, exp) + # See gh-12244 def test_apply_non_numpy_dtype(self): df = DataFrame({'dt': pd.date_range( diff --git a/pandas/tests/frame/test_quantile.py b/pandas/tests/frame/test_quantile.py new file mode 100644 index 0000000000000..8ef46423d0d50 --- /dev/null +++ b/pandas/tests/frame/test_quantile.py @@ -0,0 +1,319 @@ +# -*- coding: utf-8 -*- + +from __future__ import print_function + + +import nose +import numpy as np + +from pandas import (DataFrame, Series, Timestamp, _np_version_under1p11) +import pandas as pd + +from pandas.util.testing import (assert_series_equal, + assert_frame_equal, + assertRaisesRegexp) + +import pandas.util.testing as tm +from pandas import _np_version_under1p9 + +from pandas.tests.frame.common import TestData + + +class TestDataFrameQuantile(tm.TestCase, TestData): + + _multiprocess_can_split_ = True + + def test_quantile(self): + from numpy import percentile + + q = self.tsframe.quantile(0.1, axis=0) + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.tsframe.quantile(0.9, axis=1) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + # test degenerate case + q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) + assert(np.isnan(q['x']) and np.isnan(q['y'])) + + # non-numeric exclusion + df = DataFrame({'col1': ['A', 'A', 'B', 'B'], 'col2': [1, 2, 3, 4]}) + rs = 
df.quantile(0.5) + xp = df.median() + assert_series_equal(rs, xp) + + # axis + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(.5, axis=1) + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = df.quantile([.5, .75], axis=1) + expected = DataFrame({1: [1.5, 1.75], 2: [2.5, 2.75], + 3: [3.5, 3.75]}, index=[0.5, 0.75]) + assert_frame_equal(result, expected, check_index_type=True) + + # We may want to break API in the future to change this + # so that we exclude non-numeric along the same axis + # See GH #7312 + df = DataFrame([[1, 2, 3], + ['a', 'b', 4]]) + result = df.quantile(.5, axis=1) + expected = Series([3., 4.], index=[0, 1]) + assert_series_equal(result, expected) + + def test_quantile_axis_parameter(self): + # GH 9543/9544 + + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + + result = df.quantile(.5, axis=0) + + expected = Series([2., 3.], index=["A", "B"]) + assert_series_equal(result, expected) + + expected = df.quantile(.5, axis="index") + assert_series_equal(result, expected) + + result = df.quantile(.5, axis=1) + + expected = Series([1.5, 2.5, 3.5], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = df.quantile(.5, axis="columns") + assert_series_equal(result, expected) + + self.assertRaises(ValueError, df.quantile, 0.1, axis=-1) + self.assertRaises(ValueError, df.quantile, 0.1, axis="column") + + def test_quantile_interpolation(self): + # GH #10174 + if _np_version_under1p9: + raise nose.SkipTest("Numpy version under 1.9") + + from numpy import percentile + + # interpolation = linear (default case) + q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + # test with and without interpolation keyword + q1 = self.intframe.quantile(0.1) + 
self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) + assert_series_equal(q, q1) + + # interpolation method other than default linear + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + result = df.quantile(.5, axis=1, interpolation='nearest') + expected = Series([1, 2, 3], index=[1, 2, 3]) + assert_series_equal(result, expected) + # cross-check interpolation=nearest results in original dtype + exp = np.percentile(np.array([[1, 2, 3], [2, 3, 4]]), .5, + axis=0, interpolation='nearest') + expected = Series(exp, index=[1, 2, 3]) + assert_series_equal(result, expected) + + # float + df = DataFrame({"A": [1., 2., 3.], "B": [2., 3., 4.]}, index=[1, 2, 3]) + result = df.quantile(.5, axis=1, interpolation='nearest') + expected = Series([1., 2., 3.], index=[1, 2, 3]) + assert_series_equal(result, expected) + exp = np.percentile(np.array([[1., 2., 3.], [2., 3., 4.]]), .5, + axis=0, interpolation='nearest') + expected = Series(exp, index=[1, 2, 3]) + assert_series_equal(result, expected) + + # axis + result = df.quantile([.5, .75], axis=1, interpolation='lower') + expected = DataFrame({1: [1., 1.], 2: [2., 2.], + 3: [3., 3.]}, index=[0.5, 0.75]) + assert_frame_equal(result, expected) + + # test degenerate case + df = DataFrame({'x': [], 'y': []}) + q = df.quantile(0.1, axis=0, interpolation='higher') + assert(np.isnan(q['x']) and np.isnan(q['y'])) + + # multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + result = df.quantile([.25, .5], interpolation='midpoint') + + # https://github.com/numpy/numpy/issues/7163 + if _np_version_under1p11: + expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]], + index=[.25, .5], columns=['a', 'b', 'c']) + else: + expected = DataFrame([[1.5, 1.5, 1.5], [2.0, 2.0, 2.0]], + index=[.25, .5], columns=['a', 'b', 'c']) + assert_frame_equal(result, expected) + + def test_quantile_interpolation_np_lt_1p9(self): + # GH #10174 + if not _np_version_under1p9: + raise nose.SkipTest("Numpy 
version is greater than 1.9") + + from numpy import percentile + + # interpolation = linear (default case) + q = self.tsframe.quantile(0.1, axis=0, interpolation='linear') + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) + + # test with and without interpolation keyword + q1 = self.intframe.quantile(0.1) + self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10)) + assert_series_equal(q, q1) + + # interpolation method other than default linear + expErrMsg = "Interpolation methods other than linear" + df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3]) + with assertRaisesRegexp(ValueError, expErrMsg): + df.quantile(.5, axis=1, interpolation='nearest') + + with assertRaisesRegexp(ValueError, expErrMsg): + df.quantile([.5, .75], axis=1, interpolation='lower') + + # test degenerate case + df = DataFrame({'x': [], 'y': []}) + with assertRaisesRegexp(ValueError, expErrMsg): + q = df.quantile(0.1, axis=0, interpolation='higher') + + # multi + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + with assertRaisesRegexp(ValueError, expErrMsg): + df.quantile([.25, .5], interpolation='midpoint') + + def test_quantile_multi(self): + df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]], + columns=['a', 'b', 'c']) + result = df.quantile([.25, .5]) + expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], + index=[.25, .5], columns=['a', 'b', 'c']) + assert_frame_equal(result, expected) + + # axis = 1 + result = df.quantile([.25, .5], axis=1) + expected = DataFrame([[1.5, 1.5, 1.5], [2., 2., 2.]], + index=[.25, .5], columns=[0, 1, 2]) + + # empty + result = DataFrame({'x': [], 'y': []}).quantile([0.1, .9], axis=0) + expected = DataFrame({'x': [np.nan, np.nan], 'y': [np.nan, np.nan]}, + index=[.1, .9]) + assert_frame_equal(result, expected) + + def test_quantile_datetime(self): + df = DataFrame({'a': pd.to_datetime(['2010', 
'2011']), 'b': [0, 5]}) + + # exclude datetime + result = df.quantile(.5) + expected = Series([2.5], index=['b']) + + # datetime + result = df.quantile(.5, numeric_only=False) + expected = Series([Timestamp('2010-07-02 12:00:00'), 2.5], + index=['a', 'b']) + assert_series_equal(result, expected) + + # datetime w/ multi + result = df.quantile([.5], numeric_only=False) + expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), 2.5]], + index=[.5], columns=['a', 'b']) + assert_frame_equal(result, expected) + + # axis = 1 + df['c'] = pd.to_datetime(['2011', '2012']) + result = df[['a', 'c']].quantile(.5, axis=1, numeric_only=False) + expected = Series([Timestamp('2010-07-02 12:00:00'), + Timestamp('2011-07-02 12:00:00')], + index=[0, 1]) + assert_series_equal(result, expected) + + result = df[['a', 'c']].quantile([.5], axis=1, numeric_only=False) + expected = DataFrame([[Timestamp('2010-07-02 12:00:00'), + Timestamp('2011-07-02 12:00:00')]], + index=[0.5], columns=[0, 1]) + assert_frame_equal(result, expected) + + def test_quantile_invalid(self): + msg = 'percentiles should all be in the interval \\[0, 1\\]' + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with tm.assertRaisesRegexp(ValueError, msg): + self.tsframe.quantile(invalid) + + def test_quantile_box(self): + df = DataFrame({'A': [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03')], + 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern')], + 'C': [pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days')]}) + res = df.quantile(0.5, numeric_only=False) + # when squeezed, result.name is explicitly reset + exp = pd.Series([pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timedelta('2 days')], + name=None, index=['A', 'B', 'C']) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = 
pd.DataFrame([[pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timedelta('2 days')]], + index=[0.5], columns=['A', 'B', 'C']) + tm.assert_frame_equal(res, exp) + + # DatetimeBlock may be consolidated and contain NaT in different loc + df = DataFrame({'A': [pd.Timestamp('2011-01-01'), + pd.NaT, + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03')], + 'a': [pd.Timestamp('2011-01-01'), + pd.Timestamp('2011-01-02'), + pd.NaT, + pd.Timestamp('2011-01-03')], + 'B': [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.NaT, + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern')], + 'b': [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.NaT, + pd.Timestamp('2011-01-03', tz='US/Eastern')], + 'C': [pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days'), + pd.NaT], + 'c': [pd.NaT, + pd.Timedelta('1 days'), + pd.Timedelta('2 days'), + pd.Timedelta('3 days')]}, + columns=list('AaBbCc')) + + res = df.quantile(0.5, numeric_only=False) + exp = pd.Series([pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timedelta('2 days'), + pd.Timedelta('2 days')], + name=None, index=list('AaBbCc')) + tm.assert_series_equal(res, exp) + + res = df.quantile([0.5], numeric_only=False) + exp = pd.DataFrame([[pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timedelta('2 days'), + pd.Timedelta('2 days')]], + index=[0.5], columns=list('AaBbCc')) + tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 1d15a5552a13a..2edd8b752aeff 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -11,8 +11,8 @@ import numpy as np import pandas as pd 
-from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, - date_range, _np_version_under1p9) +from pandas import (Series, DataFrame, isnull, notnull, bdate_range, + date_range) from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta @@ -542,100 +542,6 @@ def test_prod_numpy16_bug(self): result = s.prod() self.assertNotIsInstance(result, Series) - def test_quantile(self): - from numpy import percentile - - q = self.ts.quantile(0.1) - self.assertEqual(q, percentile(self.ts.valid(), 10)) - - q = self.ts.quantile(0.9) - self.assertEqual(q, percentile(self.ts.valid(), 90)) - - # object dtype - q = Series(self.ts, dtype=object).quantile(0.9) - self.assertEqual(q, percentile(self.ts.valid(), 90)) - - # datetime64[ns] dtype - dts = self.ts.index.to_series() - q = dts.quantile(.2) - self.assertEqual(q, Timestamp('2000-01-10 19:12:00')) - - # timedelta64[ns] dtype - tds = dts.diff() - q = tds.quantile(.25) - self.assertEqual(q, pd.to_timedelta('24:00:00')) - - # GH7661 - result = Series([np.timedelta64('NaT')]).sum() - self.assertTrue(result is pd.NaT) - - msg = 'percentiles should all be in the interval \\[0, 1\\]' - for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: - with tm.assertRaisesRegexp(ValueError, msg): - self.ts.quantile(invalid) - - def test_quantile_multi(self): - from numpy import percentile - - qs = [.1, .9] - result = self.ts.quantile(qs) - expected = pd.Series([percentile(self.ts.valid(), 10), - percentile(self.ts.valid(), 90)], - index=qs, name=self.ts.name) - assert_series_equal(result, expected) - - dts = self.ts.index.to_series() - dts.name = 'xxx' - result = dts.quantile((.2, .2)) - expected = Series([Timestamp('2000-01-10 19:12:00'), - Timestamp('2000-01-10 19:12:00')], - index=[.2, .2], name='xxx') - assert_series_equal(result, expected) - - result = self.ts.quantile([]) - expected = pd.Series([], name=self.ts.name, index=Index( - [], dtype=float)) - 
assert_series_equal(result, expected) - - def test_quantile_interpolation(self): - # GH #10174 - if _np_version_under1p9: - raise nose.SkipTest("Numpy version is under 1.9") - - from numpy import percentile - - # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation='linear') - self.assertEqual(q, percentile(self.ts.valid(), 10)) - q1 = self.ts.quantile(0.1) - self.assertEqual(q1, percentile(self.ts.valid(), 10)) - - # test with and without interpolation keyword - self.assertEqual(q, q1) - - def test_quantile_interpolation_np_lt_1p9(self): - # GH #10174 - if not _np_version_under1p9: - raise nose.SkipTest("Numpy version is greater than 1.9") - - from numpy import percentile - - # interpolation = linear (default case) - q = self.ts.quantile(0.1, interpolation='linear') - self.assertEqual(q, percentile(self.ts.valid(), 10)) - q1 = self.ts.quantile(0.1) - self.assertEqual(q1, percentile(self.ts.valid(), 10)) - - # interpolation other than linear - expErrMsg = "Interpolation methods other than " - with tm.assertRaisesRegexp(ValueError, expErrMsg): - self.ts.quantile(0.9, interpolation='nearest') - - # object dtype - with tm.assertRaisesRegexp(ValueError, expErrMsg): - q = Series(self.ts, dtype=object).quantile(0.7, - interpolation='higher') - def test_all_any(self): ts = tm.makeTimeSeries() bool_series = ts > 0 @@ -1367,11 +1273,6 @@ def test_ptp(self): with self.assertRaises(NotImplementedError): s.ptp(numeric_only=True) - def test_datetime_timedelta_quantiles(self): - # covers #9694 - self.assertTrue(pd.isnull(Series([], dtype='M8[ns]').quantile(.5))) - self.assertTrue(pd.isnull(Series([], dtype='m8[ns]').quantile(.5))) - def test_empty_timeseries_redections_return_nat(self): # covers #11245 for dtype in ('m8[ns]', 'm8[ns]', 'M8[ns]', 'M8[ns, UTC]'): @@ -1875,165 +1776,6 @@ def test_sortlevel(self): res = s.sortlevel(['A', 'B'], sort_remaining=False) assert_series_equal(s, res) - def test_map(self): - index, data = tm.getMixedTypeDict() - - 
source = Series(data['B'], index=data['C']) - target = Series(data['C'][:4], index=data['D'][:4]) - - merged = target.map(source) - - for k, v in compat.iteritems(merged): - self.assertEqual(v, source[target[k]]) - - # input could be a dict - merged = target.map(source.to_dict()) - - for k, v in compat.iteritems(merged): - self.assertEqual(v, source[target[k]]) - - # function - result = self.ts.map(lambda x: x * 2) - self.assert_numpy_array_equal(result, self.ts * 2) - - # GH 10324 - a = Series([1, 2, 3, 4]) - b = Series(["even", "odd", "even", "odd"], dtype="category") - c = Series(["even", "odd", "even", "odd"]) - - exp = Series(["odd", "even", "odd", np.nan], dtype="category") - self.assert_series_equal(a.map(b), exp) - exp = Series(["odd", "even", "odd", np.nan]) - self.assert_series_equal(a.map(c), exp) - - a = Series(['a', 'b', 'c', 'd']) - b = Series([1, 2, 3, 4], - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e'])) - - exp = Series([np.nan, 1, 2, 3]) - self.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, 1, 2, 3]) - self.assert_series_equal(a.map(c), exp) - - a = Series(['a', 'b', 'c', 'd']) - b = Series(['B', 'C', 'D', 'E'], dtype='category', - index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) - c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) - - exp = Series([np.nan, 'B', 'C', 'D'], dtype='category') - self.assert_series_equal(a.map(b), exp) - exp = Series([np.nan, 'B', 'C', 'D']) - self.assert_series_equal(a.map(c), exp) - - def test_map_compat(self): - # related GH 8024 - s = Series([True, True, False], index=[1, 2, 3]) - result = s.map({True: 'foo', False: 'bar'}) - expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3]) - assert_series_equal(result, expected) - - def test_map_int(self): - left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) - right = Series({1: 11, 2: 22, 3: 33}) - - self.assertEqual(left.dtype, np.float_) - 
self.assertTrue(issubclass(right.dtype.type, np.integer)) - - merged = left.map(right) - self.assertEqual(merged.dtype, np.float_) - self.assertTrue(isnull(merged['d'])) - self.assertTrue(not isnull(merged['c'])) - - def test_map_type_inference(self): - s = Series(lrange(3)) - s2 = s.map(lambda x: np.where(x == 0, 0, 1)) - self.assertTrue(issubclass(s2.dtype.type, np.integer)) - - def test_map_decimal(self): - from decimal import Decimal - - result = self.series.map(lambda x: Decimal(str(x))) - self.assertEqual(result.dtype, np.object_) - tm.assertIsInstance(result[0], Decimal) - - def test_map_na_exclusion(self): - s = Series([1.5, np.nan, 3, np.nan, 5]) - - result = s.map(lambda x: x * 2, na_action='ignore') - exp = s * 2 - assert_series_equal(result, exp) - - def test_map_dict_with_tuple_keys(self): - ''' - Due to new MultiIndex-ing behaviour in v0.14.0, - dicts with tuple keys passed to map were being - converted to a multi-index, preventing tuple values - from being mapped properly. 
- ''' - df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) - label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} - df['labels'] = df['a'].map(label_mappings) - df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) - # All labels should be filled now - tm.assert_series_equal(df['labels'], df['expected_labels'], - check_names=False) - - def test_apply(self): - assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) - - # elementwise-apply - import math - assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) - - # how to handle Series result, #2316 - result = self.ts.apply(lambda x: Series( - [x, x ** 2], index=['x', 'x^2'])) - expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) - tm.assert_frame_equal(result, expected) - - # empty series - s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) - # check all metadata (GH 9322) - self.assertIsNot(s, rs) - self.assertIs(s.index, rs.index) - self.assertEqual(s.dtype, rs.dtype) - self.assertEqual(s.name, rs.name) - - # index but no data - s = Series(index=[1, 2, 3]) - rs = s.apply(lambda x: x) - tm.assert_series_equal(s, rs) - - def test_apply_same_length_inference_bug(self): - s = Series([1, 2]) - f = lambda x: (x, x + 1) - - result = s.apply(f) - expected = s.map(f) - assert_series_equal(result, expected) - - s = Series([1, 2, 3]) - result = s.apply(f) - expected = s.map(f) - assert_series_equal(result, expected) - - def test_apply_dont_convert_dtype(self): - s = Series(np.random.randn(10)) - - f = lambda x: x if x > 0 else np.nan - result = s.apply(f, convert_dtype=False) - self.assertEqual(result.dtype, object) - - def test_apply_args(self): - s = Series(['foo,bar']) - - result = s.apply(str.split, args=(',', )) - self.assertEqual(result[0], ['foo', 'bar']) - tm.assertIsInstance(result[0], list) - def test_shift_int(self): ts = self.ts.astype(int) shifted = ts.shift(1) diff --git 
a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py new file mode 100644 index 0000000000000..87369a0e6ef90 --- /dev/null +++ b/pandas/tests/series/test_apply.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import numpy as np +import pandas as pd + +from pandas import (Index, Series, DataFrame, isnull) +from pandas.compat import lrange +from pandas import compat +from pandas.util.testing import assert_series_equal +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesApply(TestData, tm.TestCase): + + _multiprocess_can_split_ = True + + def test_apply(self): + assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) + + # elementwise-apply + import math + assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) + + # how to handle Series result, #2316 + result = self.ts.apply(lambda x: Series( + [x, x ** 2], index=['x', 'x^2'])) + expected = DataFrame({'x': self.ts, 'x^2': self.ts ** 2}) + tm.assert_frame_equal(result, expected) + + # empty series + s = Series(dtype=object, name='foo', index=pd.Index([], name='bar')) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + # check all metadata (GH 9322) + self.assertIsNot(s, rs) + self.assertIs(s.index, rs.index) + self.assertEqual(s.dtype, rs.dtype) + self.assertEqual(s.name, rs.name) + + # index but no data + s = Series(index=[1, 2, 3]) + rs = s.apply(lambda x: x) + tm.assert_series_equal(s, rs) + + def test_apply_same_length_inference_bug(self): + s = Series([1, 2]) + f = lambda x: (x, x + 1) + + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + def test_apply_dont_convert_dtype(self): + s = Series(np.random.randn(10)) + + f = lambda x: x if x > 0 else np.nan + result = s.apply(f, convert_dtype=False) + self.assertEqual(result.dtype, object) + + def test_apply_args(self): + s 
= Series(['foo,bar']) + + result = s.apply(str.split, args=(',', )) + self.assertEqual(result[0], ['foo', 'bar']) + tm.assertIsInstance(result[0], list) + + def test_apply_box(self): + # ufunc will not be boxed. Same test cases as the test_map_box + vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns]') + # boxed value must be Timestamp instance + res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, + x.day, x.tz)) + exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + tm.assert_series_equal(res, exp) + + vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns, US/Eastern]') + res = s.apply(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, + x.day, x.tz)) + exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'timedelta64[ns]') + res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) + exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + tm.assert_series_equal(res, exp) + + # period (object dtype, not boxed) + vals = [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'object') + res = s.apply(lambda x: '{0}_{1}'.format(x.__class__.__name__, + x.freqstr)) + exp = pd.Series(['Period_M', 'Period_M']) + tm.assert_series_equal(res, exp) + + +class TestSeriesMap(TestData, tm.TestCase): + + _multiprocess_can_split_ = True + + def test_map(self): + index, data = tm.getMixedTypeDict() + + source = Series(data['B'], index=data['C']) + target = Series(data['C'][:4], index=data['D'][:4]) + + merged = target.map(source) + + for k, v in compat.iteritems(merged): + self.assertEqual(v, source[target[k]]) + + # input 
could be a dict + merged = target.map(source.to_dict()) + + for k, v in compat.iteritems(merged): + self.assertEqual(v, source[target[k]]) + + # function + result = self.ts.map(lambda x: x * 2) + self.assert_numpy_array_equal(result, self.ts * 2) + + # GH 10324 + a = Series([1, 2, 3, 4]) + b = Series(["even", "odd", "even", "odd"], dtype="category") + c = Series(["even", "odd", "even", "odd"]) + + exp = Series(["odd", "even", "odd", np.nan], dtype="category") + self.assert_series_equal(a.map(b), exp) + exp = Series(["odd", "even", "odd", np.nan]) + self.assert_series_equal(a.map(c), exp) + + a = Series(['a', 'b', 'c', 'd']) + b = Series([1, 2, 3, 4], + index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) + c = Series([1, 2, 3, 4], index=Index(['b', 'c', 'd', 'e'])) + + exp = Series([np.nan, 1, 2, 3]) + self.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, 1, 2, 3]) + self.assert_series_equal(a.map(c), exp) + + a = Series(['a', 'b', 'c', 'd']) + b = Series(['B', 'C', 'D', 'E'], dtype='category', + index=pd.CategoricalIndex(['b', 'c', 'd', 'e'])) + c = Series(['B', 'C', 'D', 'E'], index=Index(['b', 'c', 'd', 'e'])) + + exp = Series([np.nan, 'B', 'C', 'D'], dtype='category') + self.assert_series_equal(a.map(b), exp) + exp = Series([np.nan, 'B', 'C', 'D']) + self.assert_series_equal(a.map(c), exp) + + def test_map_compat(self): + # related GH 8024 + s = Series([True, True, False], index=[1, 2, 3]) + result = s.map({True: 'foo', False: 'bar'}) + expected = Series(['foo', 'foo', 'bar'], index=[1, 2, 3]) + assert_series_equal(result, expected) + + def test_map_int(self): + left = Series({'a': 1., 'b': 2., 'c': 3., 'd': 4}) + right = Series({1: 11, 2: 22, 3: 33}) + + self.assertEqual(left.dtype, np.float_) + self.assertTrue(issubclass(right.dtype.type, np.integer)) + + merged = left.map(right) + self.assertEqual(merged.dtype, np.float_) + self.assertTrue(isnull(merged['d'])) + self.assertTrue(not isnull(merged['c'])) + + def test_map_type_inference(self): + s = 
Series(lrange(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + self.assertTrue(issubclass(s2.dtype.type, np.integer)) + + def test_map_decimal(self): + from decimal import Decimal + + result = self.series.map(lambda x: Decimal(str(x))) + self.assertEqual(result.dtype, np.object_) + tm.assertIsInstance(result[0], Decimal) + + def test_map_na_exclusion(self): + s = Series([1.5, np.nan, 3, np.nan, 5]) + + result = s.map(lambda x: x * 2, na_action='ignore') + exp = s * 2 + assert_series_equal(result, exp) + + def test_map_dict_with_tuple_keys(self): + ''' + Due to new MultiIndex-ing behaviour in v0.14.0, + dicts with tuple keys passed to map were being + converted to a multi-index, preventing tuple values + from being mapped properly. + ''' + df = pd.DataFrame({'a': [(1, ), (2, ), (3, 4), (5, 6)]}) + label_mappings = {(1, ): 'A', (2, ): 'B', (3, 4): 'A', (5, 6): 'B'} + df['labels'] = df['a'].map(label_mappings) + df['expected_labels'] = pd.Series(['A', 'B', 'A', 'B'], index=df.index) + # All labels should be filled now + tm.assert_series_equal(df['labels'], df['expected_labels'], + check_names=False) + + def test_map_box(self): + vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns]') + # boxed value must be Timestamp instance + res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, + x.day, x.tz)) + exp = pd.Series(['Timestamp_1_None', 'Timestamp_2_None']) + tm.assert_series_equal(res, exp) + + vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns, US/Eastern]') + res = s.map(lambda x: '{0}_{1}_{2}'.format(x.__class__.__name__, + x.day, x.tz)) + exp = pd.Series(['Timestamp_1_US/Eastern', 'Timestamp_2_US/Eastern']) + tm.assert_series_equal(res, exp) + + # timedelta + vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 
'timedelta64[ns]') + res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, x.days)) + exp = pd.Series(['Timedelta_1', 'Timedelta_2']) + tm.assert_series_equal(res, exp) + + # period (object dtype, not boxed) + vals = [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'object') + res = s.map(lambda x: '{0}_{1}'.format(x.__class__.__name__, + x.freqstr)) + exp = pd.Series(['Period_M', 'Period_M']) + tm.assert_series_equal(res, exp) diff --git a/pandas/tests/series/test_misc_api.py b/pandas/tests/series/test_misc_api.py index acf002f316513..ffb360c5871c7 100644 --- a/pandas/tests/series/test_misc_api.py +++ b/pandas/tests/series/test_misc_api.py @@ -163,6 +163,42 @@ def test_iter(self): for i, val in enumerate(self.ts): self.assertEqual(val, self.ts[i]) + def test_iter_box(self): + vals = [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns]') + for res, exp in zip(s, vals): + self.assertIsInstance(res, pd.Timestamp) + self.assertEqual(res, exp) + self.assertIsNone(res.tz) + + vals = [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'datetime64[ns, US/Eastern]') + for res, exp in zip(s, vals): + self.assertIsInstance(res, pd.Timestamp) + self.assertEqual(res, exp) + self.assertEqual(res.tz, exp.tz) + + # timedelta + vals = [pd.Timedelta('1 days'), pd.Timedelta('2 days')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'timedelta64[ns]') + for res, exp in zip(s, vals): + self.assertIsInstance(res, pd.Timedelta) + self.assertEqual(res, exp) + + # period (object dtype, not boxed) + vals = [pd.Period('2011-01-01', freq='M'), + pd.Period('2011-01-02', freq='M')] + s = pd.Series(vals) + self.assertEqual(s.dtype, 'object') + for res, exp in zip(s, vals): + self.assertIsInstance(res, pd.Period) + self.assertEqual(res, exp) + 
self.assertEqual(res.freq, 'M') + def test_keys(self): # HACK: By doing this in two stages, we avoid 2to3 wrapping the call # to .keys() in a list() diff --git a/pandas/tests/series/test_quantile.py b/pandas/tests/series/test_quantile.py new file mode 100644 index 0000000000000..f538fa4e90401 --- /dev/null +++ b/pandas/tests/series/test_quantile.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import nose +import numpy as np +import pandas as pd + +from pandas import (Index, Series, _np_version_under1p9) +from pandas.tseries.index import Timestamp +import pandas.core.common as com +import pandas.util.testing as tm + +from .common import TestData + + +class TestSeriesQuantile(TestData, tm.TestCase): + + def test_quantile(self): + from numpy import percentile + + q = self.ts.quantile(0.1) + self.assertEqual(q, percentile(self.ts.valid(), 10)) + + q = self.ts.quantile(0.9) + self.assertEqual(q, percentile(self.ts.valid(), 90)) + + # object dtype + q = Series(self.ts, dtype=object).quantile(0.9) + self.assertEqual(q, percentile(self.ts.valid(), 90)) + + # datetime64[ns] dtype + dts = self.ts.index.to_series() + q = dts.quantile(.2) + self.assertEqual(q, Timestamp('2000-01-10 19:12:00')) + + # timedelta64[ns] dtype + tds = dts.diff() + q = tds.quantile(.25) + self.assertEqual(q, pd.to_timedelta('24:00:00')) + + # GH7661 + result = Series([np.timedelta64('NaT')]).sum() + self.assertTrue(result is pd.NaT) + + msg = 'percentiles should all be in the interval \\[0, 1\\]' + for invalid in [-1, 2, [0.5, -1], [0.5, 2]]: + with tm.assertRaisesRegexp(ValueError, msg): + self.ts.quantile(invalid) + + def test_quantile_multi(self): + from numpy import percentile + + qs = [.1, .9] + result = self.ts.quantile(qs) + expected = pd.Series([percentile(self.ts.valid(), 10), + percentile(self.ts.valid(), 90)], + index=qs, name=self.ts.name) + tm.assert_series_equal(result, expected) + + dts = self.ts.index.to_series() + dts.name = 'xxx' + result = 
dts.quantile((.2, .2)) + expected = Series([Timestamp('2000-01-10 19:12:00'), + Timestamp('2000-01-10 19:12:00')], + index=[.2, .2], name='xxx') + tm.assert_series_equal(result, expected) + + result = self.ts.quantile([]) + expected = pd.Series([], name=self.ts.name, index=Index( + [], dtype=float)) + tm.assert_series_equal(result, expected) + + def test_quantile_interpolation(self): + # GH #10174 + if _np_version_under1p9: + raise nose.SkipTest("Numpy version is under 1.9") + + from numpy import percentile + + # interpolation = linear (default case) + q = self.ts.quantile(0.1, interpolation='linear') + self.assertEqual(q, percentile(self.ts.valid(), 10)) + q1 = self.ts.quantile(0.1) + self.assertEqual(q1, percentile(self.ts.valid(), 10)) + + # test with and without interpolation keyword + self.assertEqual(q, q1) + + def test_quantile_interpolation_dtype(self): + # GH #10174 + if _np_version_under1p9: + raise nose.SkipTest("Numpy version is under 1.9") + + from numpy import percentile + + # interpolation = linear (default case) + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='lower') + self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) + self.assertTrue(com.is_integer(q)) + + q = pd.Series([1, 3, 4]).quantile(0.5, interpolation='higher') + self.assertEqual(q, percentile(np.array([1, 3, 4]), 50)) + self.assertTrue(com.is_integer(q)) + + def test_quantile_interpolation_np_lt_1p9(self): + # GH #10174 + if not _np_version_under1p9: + raise nose.SkipTest("Numpy version is greater than 1.9") + + from numpy import percentile + + # interpolation = linear (default case) + q = self.ts.quantile(0.1, interpolation='linear') + self.assertEqual(q, percentile(self.ts.valid(), 10)) + q1 = self.ts.quantile(0.1) + self.assertEqual(q1, percentile(self.ts.valid(), 10)) + + # interpolation other than linear + expErrMsg = "Interpolation methods other than " + with tm.assertRaisesRegexp(ValueError, expErrMsg): + self.ts.quantile(0.9, interpolation='nearest') + + # object 
dtype + with tm.assertRaisesRegexp(ValueError, expErrMsg): + q = Series(self.ts, dtype=object).quantile(0.7, + interpolation='higher') + + def test_quantile_nan(self): + cases = [Series([]), Series([np.nan, np.nan])] + + for s in cases: + res = s.quantile(0.5) + self.assertTrue(np.isnan(res)) + + res = s.quantile([0.5]) + tm.assert_series_equal(res, pd.Series([np.nan], index=[0.5])) + + res = s.quantile([0.2, 0.3]) + tm.assert_series_equal(res, pd.Series([np.nan, np.nan], + index=[0.2, 0.3])) + + def test_quantile_box(self): + cases = [[pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03')], + [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern')], + [pd.Timedelta('1 days'), pd.Timedelta('2 days'), + pd.Timedelta('3 days')], + # NaT + [pd.Timestamp('2011-01-01'), pd.Timestamp('2011-01-02'), + pd.Timestamp('2011-01-03'), pd.NaT], + [pd.Timestamp('2011-01-01', tz='US/Eastern'), + pd.Timestamp('2011-01-02', tz='US/Eastern'), + pd.Timestamp('2011-01-03', tz='US/Eastern'), pd.NaT], + [pd.Timedelta('1 days'), pd.Timedelta('2 days'), + pd.Timedelta('3 days'), pd.NaT]] + + for case in cases: + s = pd.Series(case, name='XXX') + res = s.quantile(0.5) + self.assertEqual(res, case[1]) + + res = s.quantile([0.5]) + exp = pd.Series([case[1]], index=[0.5], name='XXX') + tm.assert_series_equal(res, exp) + + def test_datetime_timedelta_quantiles(self): + # covers #9694 + self.assertTrue(pd.isnull(Series([], dtype='M8[ns]').quantile(.5))) + self.assertTrue(pd.isnull(Series([], dtype='m8[ns]').quantile(.5))) + + def test_quantile_nat(self): + res = Series([pd.NaT, pd.NaT]).quantile(0.5) + self.assertTrue(res is pd.NaT) + + res = Series([pd.NaT, pd.NaT]).quantile([0.5]) + tm.assert_series_equal(res, pd.Series([pd.NaT], index=[0.5])) diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py index 48e17fd84a3b2..6e7b0ac9bade8 100644 --- a/pandas/tseries/base.py +++ 
b/pandas/tseries/base.py @@ -292,6 +292,11 @@ def hasnans(self): @property def asobject(self): + """ + return object Index which contains boxed values + + *this is an internal non-public method* + """ from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object)