Skip to content

Commit

Permalink
Merge pull request #3065 from jreback/replace
Browse files Browse the repository at this point in the history
BUG/ENH: guarantee blocks will upcast as needed, and split as needed
  • Loading branch information
jreback committed Mar 16, 2013
2 parents dde093e + 9e09328 commit fbfd16a
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 77 deletions.
2 changes: 1 addition & 1 deletion RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ pandas 0.11.0
doesn't have nans, then an int will be returned)
- backfill/pad/take/diff/ohlc will now support ``float32/int16/int8``
operations
- Integer block types will upcast as needed in where operations (GH2793_)
- Block types will upcast as needed in where/masking operations (GH2793_)
- Series now automatically will try to set the correct dtype based on passed
datetimelike objects (datetime/Timestamp)

Expand Down
12 changes: 12 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,13 @@ def mask_missing(arr, values_to_mask):
for x in nonna:
if mask is None:
mask = arr == x

# if x is a string and mask is not, then we get a scalar
# return value, which is not good
if not isinstance(mask,np.ndarray):
m = mask
mask = np.empty(arr.shape,dtype=np.bool)
mask.fill(m)
else:
mask = mask | (arr == x)

Expand Down Expand Up @@ -730,6 +737,11 @@ def _maybe_promote(dtype, fill_value=np.nan):
dtype = np.complex128
else:
dtype = np.object_

# in case we have a string that looked like a number
if issubclass(np.dtype(dtype).type, basestring):
dtype = np.object_

return dtype, fill_value


Expand Down
153 changes: 77 additions & 76 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from numpy import nan
import numpy as np

from pandas.core.common import _possibly_downcast_to_dtype
from pandas.core.common import _possibly_downcast_to_dtype, isnull
from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes
from pandas.core.indexing import _check_slice_bounds, _maybe_convert_indices
import pandas.core.common as com
Expand Down Expand Up @@ -260,32 +260,14 @@ def _try_cast_result(self, result):
return result

def replace(self, to_replace, value, inplace=False):
new_values = self.values if inplace else self.values.copy()
if self._can_hold_element(value):
value = self._try_cast(value)

if not isinstance(to_replace, (list, np.ndarray)):
if self._can_hold_element(to_replace):
to_replace = self._try_cast(to_replace)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)
else:
try:
to_replace = np.array(to_replace, dtype=self.dtype)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)
except Exception:
to_replace = np.array(to_replace, dtype=object)
for r in to_replace:
if self._can_hold_element(r):
r = self._try_cast(r)
msk = com.mask_missing(new_values, to_replace)
np.putmask(new_values, msk, value)

if inplace:
return self
else:
return make_block(new_values, self.items, self.ref_items)
""" replace the to_replace value with value, possible to create new blocks here
this is just a call to putmask """
mask = com.mask_missing(self.values, to_replace)
if not mask.any():
if inplace:
return [ self ]
return [ self.copy() ]
return self.putmask(mask, value, inplace=inplace)

def putmask(self, mask, new, inplace=False):
""" putmask the data to the block; it is possible that we may create a new dtype of block
Expand All @@ -309,19 +291,34 @@ def putmask(self, mask, new, inplace=False):

# maybe upcast me
elif mask.any():
# type of the new block
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
isinstance(new, float)):
typ = np.float64
else:
typ = np.object_

# we need to explicitly astype here to make a copy
new_values = new_values.astype(typ)
# need to go column by column
new_blocks = []
for i, item in enumerate(self.items):

# we create a new block type
np.putmask(new_values, mask, new)
return [ make_block(new_values, self.items, self.ref_items) ]
m = mask[i]

# need a new block
if m.any():

n = new[i] if isinstance(new, np.ndarray) else new

# type of the new block
dtype, _ = com._maybe_promote(np.array(n).dtype)

# we need to explicitly astype here to make a copy
nv = new_values[i].astype(dtype)

# we create a new block type
np.putmask(nv, m, n)

else:
nv = new_values[i] if inplace else new_values[i].copy()

nv = _block_shape(nv)
new_blocks.append(make_block(nv, [ item ], self.ref_items))

return new_blocks

if inplace:
return [ self ]
Expand Down Expand Up @@ -350,7 +347,7 @@ def interpolate(self, method='pad', axis=0, inplace=False,
if missing is None:
mask = None
else: # todo create faster fill func without masking
mask = _mask_missing(transf(values), missing)
mask = com.mask_missing(transf(values), missing)

if method == 'pad':
com.pad_2d(transf(values), limit=limit, mask=mask)
Expand Down Expand Up @@ -532,31 +529,14 @@ def create_block(result, items, transpose = True):
if len(result) == 1:
result = np.repeat(result,self.shape[1:])

result = result.reshape(((1,) + self.shape[1:]))
result = _block_shape(result,ndim=self.ndim,shape=self.shape[1:])
result_blocks.append(create_block(result, item, transpose = False))

return result_blocks
else:
result = func(cond,values,other)
return create_block(result, self.items)

def _mask_missing(array, missing_values):
    """
    Build a boolean mask marking the entries of ``array`` that equal any of
    ``missing_values``.

    Parameters
    ----------
    array : ndarray
        Values to test.
    missing_values : scalar, list or ndarray
        Candidate value(s) to look for; a scalar is wrapped in a list.

    Returns
    -------
    mask : result of the element-wise comparisons, or None
        ``None`` is returned when ``missing_values`` is empty (nothing was
        compared).
    """
    if not isinstance(missing_values, (list, np.ndarray)):
        missing_values = [missing_values]

    mask = None
    missing_values = np.array(missing_values, dtype=object)
    if com.isnull(missing_values).any():
        # nulls are matched via isnull rather than ``==``; the remaining
        # (non-null) candidates are handled by the loop below
        mask = com.isnull(array)
        missing_values = missing_values[com.notnull(missing_values)]

    for v in missing_values:
        # compare against the single candidate ``v`` -- comparing against the
        # whole ``missing_values`` array would broadcast element-by-element
        # instead of testing membership
        if mask is None:
            mask = array == v
        else:
            mask |= array == v
    return mask

class NumericBlock(Block):
is_numeric = True
_can_hold_na = True
Expand Down Expand Up @@ -659,7 +639,7 @@ def convert(self, convert_dates = True, convert_numeric = True, copy = True):
values = self.get(c)

values = com._possibly_convert_objects(values, convert_dates=convert_dates, convert_numeric=convert_numeric)
values = values.reshape(((1,) + values.shape))
values = _block_shape(values)
items = self.items.take([i])
newb = make_block(values, items, self.ref_items)
blocks.append(newb)
Expand Down Expand Up @@ -949,23 +929,37 @@ def replace(self, *args, **kwargs):

def replace_list(self, src_lst, dest_lst, inplace=False):
""" do a list replace """
if not inplace:
self = self.copy()

sset = set(src_lst)
if any([k in sset for k in dest_lst]):
masks = {}
for s in src_lst:
masks[s] = [b.values == s for b in self.blocks]

for s, d in zip(src_lst, dest_lst):
[b.putmask(masks[s][i], d, inplace=True) for i, b in
enumerate(self.blocks)]
else:
for s, d in zip(src_lst, dest_lst):
self.replace(s, d, inplace=True)

return self
# figure out our mask a-priori to avoid repeated replacements
values = self.as_matrix()
def comp(s):
if isnull(s):
return isnull(values)
return values == s
masks = [ comp(s) for i, s in enumerate(src_lst) ]

result_blocks = []
for blk in self.blocks:

# it's possible to get multiple result blocks here
# replace ALWAYS will return a list
rb = [ blk if inplace else blk.copy() ]
for i, d in enumerate(dest_lst):
new_rb = []
for b in rb:
# get our mask for this element, sized to this
# particular block
m = masks[i][b.ref_locs]
if m.any():
new_rb.extend(b.putmask(m, d, inplace=True))
else:
new_rb.append(b)
rb = new_rb
result_blocks.extend(rb)

bm = self.__class__(result_blocks, self.axes)
bm._consolidate_inplace()
return bm

def is_consolidated(self):
"""
Expand Down Expand Up @@ -1302,8 +1296,7 @@ def set(self, item, value):
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
"""
if value.ndim == self.ndim - 1:
value = value.reshape((1,) + value.shape)
value = _block_shape(value,self.ndim-1)
if value.shape[1:] != self.shape[1:]:
raise AssertionError('Shape of new values must be compatible '
'with manager shape')
Expand Down Expand Up @@ -1873,6 +1866,14 @@ def _merge_blocks(blocks, items):
return new_block.reindex_items_from(items)


def _block_shape(values, ndim=1, shape=None):
""" guarantee the shape of the values to be at least 1 d """
if values.ndim == ndim:
if shape is None:
shape = values.shape
values = values.reshape(tuple((1,) + shape))
return values

def _vstack(to_stack):
if all(x.dtype == _NS_DTYPE for x in to_stack):
# work around NumPy 1.6 bug
Expand Down
25 changes: 25 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -5596,6 +5596,31 @@ def test_replace_mixed(self):
assert_frame_equal(result, expected)
assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame)

# int block upcasting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64') })
result = df.replace(0, 0.5)
assert_frame_equal(result,expected)

df.replace(0, 0.5, inplace=True)
assert_frame_equal(df,expected)

# int block splitting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64'), 'C' : Series([1,2],dtype='int64') })
expected = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0.5,1],dtype='float64'), 'C' : Series([1,2],dtype='int64') })
result = df.replace(0, 0.5)
assert_frame_equal(result,expected)

# to object block upcasting
df = DataFrame({ 'A' : Series([1.0,2.0],dtype='float64'), 'B' : Series([0,1],dtype='int64') })
expected = DataFrame({ 'A' : Series([1,'foo'],dtype='object'), 'B' : Series([0,1],dtype='int64') })
result = df.replace(2, 'foo')
assert_frame_equal(result,expected)

expected = DataFrame({ 'A' : Series(['foo','bar'],dtype='object'), 'B' : Series([0,'foo'],dtype='object') })
result = df.replace([1,2], ['foo','bar'])
assert_frame_equal(result,expected)

def test_replace_interpolate(self):
padded = self.tsframe.replace(nan, method='pad')
assert_frame_equal(padded, self.tsframe.fillna(method='pad'))
Expand Down

0 comments on commit fbfd16a

Please sign in to comment.