Skip to content

Commit

Permalink
CLN: Move boxing logic to BlockManager
Browse files Browse the repository at this point in the history
  • Loading branch information
sinhrks committed Apr 3, 2016
1 parent 101d81d commit 9e8d782
Show file tree
Hide file tree
Showing 13 changed files with 942 additions and 627 deletions.
8 changes: 6 additions & 2 deletions doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -204,10 +204,14 @@ Bug Fixes
- Bug in ``concat`` where ``AttributeError`` was raised when input data contains tz-aware datetime and timedelta (:issue:`12620`)




- Bug in ``pivot_table`` when ``margins=True`` and ``dropna=True`` where nulls still contributed to margin count (:issue:`12577`)
- Bug in ``Series.name`` when ``name`` attribute can be a hashable type (:issue:`12610`)
- Bug in ``.describe()`` where categorical column information was reset (:issue:`11558`)
- Bug where ``loffset`` argument was not applied when calling ``resample().count()`` on a timeseries (:issue:`12725`)
- ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`)




- Bug in ``.quantile`` with interpolation may coerce to ``float`` unexpectedly (:issue:`12772`)
- Bug in ``.quantile`` with empty Series may return scalar rather than empty Series (:issue:`12772`)
10 changes: 0 additions & 10 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2377,16 +2377,6 @@ def needs_i8_conversion(arr_or_dtype):
is_datetime64tz_dtype(arr_or_dtype))


def i8_boxer(arr_or_dtype):
""" return the scalar boxer for the dtype """
if (is_datetime64_dtype(arr_or_dtype) or
is_datetime64tz_dtype(arr_or_dtype)):
return lib.Timestamp
elif is_timedelta64_dtype(arr_or_dtype):
return lambda x: lib.Timedelta(x, unit='ns')
raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype))


def is_numeric_dtype(arr_or_dtype):
tipo = _get_dtype_type(arr_or_dtype)
return (issubclass(tipo, (np.number, np.bool_)) and
Expand Down
52 changes: 9 additions & 43 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
from pandas.core.categorical import Categorical
import pandas.computation.expressions as expressions
from pandas.computation.eval import eval as _eval
from numpy import percentile as _quantile
from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u,
OrderedDict, raise_with_traceback)
from pandas import compat
Expand All @@ -63,7 +62,6 @@
import pandas.algos as _algos

from pandas.core.config import get_option
from pandas import _np_version_under1p9

# ---------------------------------------------------------------------
# Docstring templates
Expand Down Expand Up @@ -4227,10 +4225,7 @@ def applymap(self, func):

# if we have a dtype == 'M8[ns]', provide boxed values
def infer(x):
if com.needs_i8_conversion(x):
f = com.i8_boxer(x)
x = lib.map_infer(_values_from_object(x), f)
return lib.map_infer(_values_from_object(x), func)
return lib.map_infer(x.asobject, func)

return self.apply(infer)

Expand Down Expand Up @@ -4974,55 +4969,26 @@ def quantile(self, q=0.5, axis=0, numeric_only=True,
0.1 1.3 3.7
0.5 2.5 55.0
"""

self._check_percentile(q)
per = np.asarray(q) * 100

if not com.is_list_like(per):
per = [per]
if not com.is_list_like(q):
q = [q]
squeeze = True
else:
squeeze = False

if _np_version_under1p9:
if interpolation != 'linear':
raise ValueError("Interpolation methods other than linear "
"are not supported in numpy < 1.9")

def f(arr, per, interpolation):
if arr._is_datelike_mixed_type:
values = _values_from_object(arr).view('i8')
else:
values = arr.astype(float)
values = values[notnull(values)]
if len(values) == 0:
return NA
else:
if _np_version_under1p9:
return _quantile(values, per)
else:
return _quantile(values, per, interpolation=interpolation)

data = self._get_numeric_data() if numeric_only else self

axis = self._get_axis_number(axis)

def _quantile(series):
res = series.quantile(q, interpolation=interpolation)
return series.name, res

if axis == 1:
data = data.T

# need to know which cols are timestamp going in so that we can
# map timestamp over them after getting the quantile.
is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
is_dt_col = is_dt_col[is_dt_col].index

quantiles = [[f(vals, x, interpolation) for x in per]
for (_, vals) in data.iteritems()]

result = self._constructor(quantiles, index=data._info_axis,
columns=q).T
if len(is_dt_col) > 0:
result[is_dt_col] = result[is_dt_col].applymap(lib.Timestamp)
# unable to use DataFrame.apply, because data may be empty
result = dict(_quantile(s) for (_, s) in data.iteritems())
result = self._constructor(result, columns=data.columns)
if squeeze:
if result.shape == (1, 1):
result = result.T.iloc[:, 0] # don't want scalar
Expand Down
121 changes: 86 additions & 35 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from collections import defaultdict

import numpy as np
from numpy import percentile as _quantile

from pandas.core.base import PandasObject

from pandas.core.common import (_possibly_downcast_to_dtype, isnull, _NS_DTYPE,
Expand Down Expand Up @@ -131,6 +133,8 @@ def get_values(self, dtype=None):
return an internal format, currently just the ndarray
this is often overridden to handle to_dense like operations
"""
if com.is_object_dtype(dtype):
return self.values.astype(object)
return self.values

def to_dense(self):
Expand All @@ -141,6 +145,10 @@ def to_object_block(self, mgr):
values = self.get_values(dtype=object)
return self.make_block(values, klass=ObjectBlock)

@property
def _na_value(self):
return np.nan

@property
def fill_value(self):
return np.nan
Expand Down Expand Up @@ -1247,6 +1255,19 @@ def equals(self, other):
return False
return array_equivalent(self.values, other.values)

def quantile(self, values, qs, **kwargs):
if len(values) == 0:
if com.is_list_like(qs):
return np.array([self.fill_value])
else:
return self._na_value

if com.is_list_like(qs):
values = [_quantile(values, x * 100, **kwargs) for x in qs]
return np.array(values)
else:
return _quantile(values, qs * 100, **kwargs)


class NonConsolidatableMixIn(object):
""" hold methods for the nonconsolidatable blocks """
Expand Down Expand Up @@ -1455,15 +1476,55 @@ def should_store(self, value):
return com.is_integer_dtype(value) and value.dtype == self.dtype


class TimeDeltaBlock(IntBlock):
class DatetimeLikeBlockMixin(object):

@property
def _na_value(self):
return tslib.NaT

@property
def fill_value(self):
return tslib.iNaT

def _try_operate(self, values):
""" return a version to operate on """
return values.view('i8')

def get_values(self, dtype=None):
"""
return object dtype as boxed values, such as Timestamps/Timedelta
"""
if com.is_object_dtype(dtype):
return lib.map_infer(self.values.ravel(),
self._box_func).reshape(self.values.shape)
return self.values

def quantile(self, values, qs, **kwargs):
values = values.view('i8')
mask = values == self.fill_value
if mask.any():
values = values[~mask]
result = Block.quantile(self, values, qs, **kwargs)

if com.is_datetime64tz_dtype(self):
# TODO: temporary logic to avoid GH 12619 and GH 12772,
# which affect DatetimeTZBlock._try_coerce_result for np.ndarray
if isinstance(result, np.ndarray) and values.ndim > 0:
result = self._holder(result, tz='UTC')
result = result.tz_convert(self.values.tz)
return result
return self._try_coerce_result(result)


class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
__slots__ = ()
is_timedelta = True
_can_hold_na = True
is_numeric = False

@property
def fill_value(self):
return tslib.iNaT
def _box_func(self):
return lambda x: tslib.Timedelta(x, unit='ns')

def fillna(self, value, **kwargs):

Expand Down Expand Up @@ -1516,19 +1577,15 @@ def _try_coerce_args(self, values, other):

return values, values_mask, other, other_mask

def _try_operate(self, values):
""" return a version to operate on """
return values.view('i8')

def _try_coerce_result(self, result):
""" reverse of try_coerce_args / try_operate """
if isinstance(result, np.ndarray):
mask = isnull(result)
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('m8[ns]')
result[mask] = tslib.iNaT
elif isinstance(result, np.integer):
result = lib.Timedelta(result)
elif isinstance(result, (np.integer, np.float)):
result = self._box_func(result)
return result

def should_store(self, value):
Expand Down Expand Up @@ -1558,13 +1615,6 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None,
dtype=object)
return rvalues

def get_values(self, dtype=None):
# return object dtypes as Timedelta
if dtype == object:
return lib.map_infer(self.values.ravel(),
lib.Timedelta).reshape(self.values.shape)
return self.values


class BoolBlock(NumericBlock):
__slots__ = ()
Expand Down Expand Up @@ -1954,7 +2004,7 @@ def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
return values.reshape(1, len(values))


class DatetimeBlock(Block):
class DatetimeBlock(DatetimeLikeBlockMixin, Block):
__slots__ = ()
is_datetime = True
_can_hold_na = True
Expand Down Expand Up @@ -1998,10 +2048,6 @@ def _try_cast(self, element):
except:
return element

def _try_operate(self, values):
""" return a version to operate on """
return values.view('i8')

def _try_coerce_args(self, values, other):
"""
Coerce values and other to dtype 'i8'. NaN and NaT convert to
Expand Down Expand Up @@ -2029,7 +2075,7 @@ def _try_coerce_args(self, values, other):
other = tslib.iNaT
other_mask = True
elif isinstance(other, (datetime, np.datetime64, date)):
other = lib.Timestamp(other)
other = self._box_func(other)
if getattr(other, 'tz') is not None:
raise TypeError("cannot coerce a Timestamp with a tz on a "
"naive Block")
Expand All @@ -2056,13 +2102,13 @@ def _try_coerce_result(self, result):
if isinstance(result, np.ndarray):
if result.dtype.kind in ['i', 'f', 'O']:
result = result.astype('M8[ns]')
elif isinstance(result, (np.integer, np.datetime64)):
result = lib.Timestamp(result)
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = self._box_func(result)
return result

@property
def fill_value(self):
return tslib.iNaT
def _box_func(self):
return tslib.Timestamp

def to_native_types(self, slicer=None, na_rep=None, date_format=None,
quoting=None, **kwargs):
Expand Down Expand Up @@ -2098,13 +2144,6 @@ def set(self, locs, values, check=False):

self.values[locs] = values

def get_values(self, dtype=None):
# return object dtype as Timestamps
if dtype == object:
return lib.map_infer(
self.values.ravel(), lib.Timestamp).reshape(self.values.shape)
return self.values


class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock):
""" implement a datetime64 block with a tz attribute """
Expand Down Expand Up @@ -2145,7 +2184,7 @@ def external_values(self):

def get_values(self, dtype=None):
# return object dtype as Timestamps with the zones
if dtype == object:
if com.is_object_dtype(dtype):
f = lambda x: lib.Timestamp(x, tz=self.values.tz)
return lib.map_infer(
self.values.ravel(), f).reshape(self.values.shape)
Expand Down Expand Up @@ -2228,10 +2267,14 @@ def _try_coerce_result(self, result):

if isinstance(result, np.ndarray):
result = self._holder(result, tz=self.values.tz)
elif isinstance(result, (np.integer, np.datetime64)):
elif isinstance(result, (np.integer, np.float, np.datetime64)):
result = lib.Timestamp(result, tz=self.values.tz)
return result

@property
def _box_func(self):
return lambda x: tslib.Timestamp(x, tz=self.dtype.tz)

def shift(self, periods, axis=0, mgr=None):
""" shift the block by periods """

Expand Down Expand Up @@ -3852,6 +3895,14 @@ def get_values(self):
""" return a dense type view """
return np.array(self._block.to_dense(), copy=False)

@property
def asobject(self):
"""
return an object dtype array. datetime/timedelta like values are boxed
to Timestamp/Timedelta instances.
"""
return self._block.get_values(dtype=object)

@property
def itemsize(self):
return self._block.values.itemsize
Expand Down
Loading

0 comments on commit 9e8d782

Please sign in to comment.