ENH: Optimize take_*; improve non-NA fill_value support #2819

Merged: 2 commits, merged Feb 10, 2013
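The diff below renames take_2d to take_nd, reworks the dtype-specialized Cython takers, and threads a caller-chosen fill_value through the take paths. As a rough sketch of the intended semantics, modelled on the calls visible in the diff, indexer entries of -1 take the requested fill value instead of always becoming NaN; the exact take_nd signature at this revision (in particular the fill_value keyword) is an assumption, not confirmed by this page:

import numpy as np
import pandas.core.common as com

arr = np.array([[1, 2, 3],
                [4, 5, 6]], dtype=np.int64)
indexer = np.array([0, -1, 1], dtype=np.int64)   # -1 marks a missing row

# assumed call shape: rows flagged -1 get the requested fill_value (0 here)
# rather than NaN, which is what "non-NA fill_value support" refers to
result = com.take_nd(arr, indexer, axis=0, fill_value=0)
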
2 changes: 1 addition & 1 deletion pandas/algos.pyx
@@ -2018,7 +2018,7 @@ def group_median(ndarray[float64_t, ndim=2] out,
data = np.empty((K, N), dtype=np.float64)
ptr = <float64_t*> data.data

take_2d_axis1_float64(values.T, indexer, out=data)
take_2d_axis1_float64_float64(values.T, indexer, out=data)

for i in range(K):
# exclude NA group
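The renamed taker appears to encode both the input and output dtypes in its name (take_2d_axis1_<values dtype>_<out dtype>). A pure-NumPy sketch of what such a taker does follows; the real helper is generated Cython, and the -1/fill_value handling shown here is inferred from the fill_value plumbing elsewhere in this PR, not from this hunk:

import numpy as np

def take_2d_axis1_float64_float64_sketch(values, indexer, out, fill_value=np.nan):
    # copy the columns of `values` selected by `indexer` into the
    # preallocated `out` buffer; indexer entries of -1 get fill_value
    for j, idx in enumerate(indexer):
        if idx == -1:
            out[:, j] = fill_value
        else:
            out[:, j] = values[:, idx]
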
571 changes: 360 additions & 211 deletions pandas/core/common.py

Large diffs are not rendered by default.

33 changes: 15 additions & 18 deletions pandas/core/frame.py
@@ -2107,10 +2103,6 @@ def __setitem__(self, key, value):
def _boolean_set(self, key, value):
if key.values.dtype != np.bool_:
raise ValueError('Must pass DataFrame with boolean values only')

if self._is_mixed_type:
raise ValueError('Cannot do boolean setting on mixed-type frame')

self.where(-key, value, inplace=True)

def _set_item_multiple(self, keys, value):
@@ -2928,7 +2924,7 @@ def take(self, indices, axis=0):
new_columns = self.columns.take(indices)
return self.reindex(columns=new_columns)
else:
new_values = com.take_2d(self.values,
new_values = com.take_nd(self.values,
com._ensure_int64(indices),
axis=axis)
if axis == 0:
@@ -5229,16 +5225,19 @@ def where(self, cond, other=NA, inplace=False, try_cast=False, raise_on_error=Tr

Parameters
----------
cond: boolean DataFrame or array
other: scalar or DataFrame
inplace: perform the operation in place on the data
try_cast: try to cast the result back to the input type (if possible), defaults to False
raise_on_error: should I raise on invalid data types (e.g. trying to where on strings),
defaults to True
cond : boolean DataFrame or array
other : scalar or DataFrame
inplace : boolean, default False
Whether to perform the operation in place on the data
try_cast : boolean, default False
try to cast the result back to the input type (if possible),
raise_on_error : boolean, default True
Whether to raise on invalid data types (e.g. trying to where on
strings)

Returns
-------
wh: DataFrame
wh : DataFrame
"""
if not hasattr(cond, 'shape'):
raise ValueError('where requires an ndarray like object for its '
@@ -5263,18 +5262,16 @@
if isinstance(other, DataFrame):
_, other = self.align(other, join='left', fill_value=NA)
elif isinstance(other,np.ndarray):

if other.shape[0] != len(self.index) or other.shape[1] != len(self.columns):
raise ValueError('other must be the same shape as self when an ndarray')
other = DataFrame(other,self.index,self.columns)
if other.shape != self.shape:
raise ValueError('other must be the same shape as self '
'when an ndarray')
other = DataFrame(other, self.index, self.columns)

if inplace:

# we may have different type blocks come out of putmask, so reconstruct the block manager
self._data = self._data.putmask(cond,other,inplace=True)

else:

func = lambda values, others, conds: np.where(conds, values, others)
new_data = self._data.where(func, other, cond, raise_on_error=raise_on_error, try_cast=try_cast)

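For reference, a small usage sketch of the DataFrame.where signature documented above: entries where cond is True are kept, the rest are taken from other. The non-inplace path applies the np.where combine shown at the end of the hunk, block by block, while inplace=True routes through BlockManager.putmask instead.

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(3, 2) - 2, columns=['a', 'b'])
cond = df > 0

masked = df.where(cond, other=0)      # keep where cond is True, 0 elsewhere

# the in-place form may produce blocks of a different dtype, which is why
# the block manager is reconstructed in that branch
df.where(cond, other=0, inplace=True)
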
81 changes: 36 additions & 45 deletions pandas/core/internals.py
@@ -125,15 +125,9 @@ def reindex_axis(self, indexer, mask, needs_masking, axis=0,
"""
Reindex using pre-computed indexer information
"""
if self.values.size > 0:
new_values = com.take_fast(self.values, indexer, mask,
needs_masking, axis=axis,
fill_value=fill_value)
else:
shape = list(self.shape)
shape[axis] = len(indexer)
new_values = np.empty(shape)
new_values.fill(fill_value)
new_values = com.take_fast(self.values, indexer,
mask, needs_masking, axis=axis,
fill_value=fill_value)
return make_block(new_values, self.items, self.ref_items)

def reindex_items_from(self, new_ref_items, copy=True):
@@ -155,12 +149,9 @@ def reindex_items_from(self, new_ref_items, copy=True):
mask = indexer != -1
masked_idx = indexer[mask]

if self.values.ndim == 2:
new_values = com.take_2d(self.values, masked_idx, axis=0,
needs_masking=False)
else:
new_values = self.values.take(masked_idx, axis=0)

new_values = com.take_fast(self.values, masked_idx,
mask=None, needs_masking=False,
axis=0)
new_items = self.items.take(masked_idx)
return make_block(new_values, new_items, new_ref_items)

@@ -301,24 +292,23 @@ def putmask(self, mask, new, inplace=False):
new_values = self.values if inplace else self.values.copy()

# may need to align the new
if hasattr(new,'reindex_axis'):
axis = getattr(new,'_het_axis',0)
if hasattr(new, 'reindex_axis'):
axis = getattr(new, '_het_axis', 0)
new = new.reindex_axis(self.items, axis=axis, copy=False).values.T

# may need to align the mask
if hasattr(mask,'reindex_axis'):
axis = getattr(mask,'_het_axis',0)
if hasattr(mask, 'reindex_axis'):
axis = getattr(mask, '_het_axis', 0)
mask = mask.reindex_axis(self.items, axis=axis, copy=False).values.T

if self._can_hold_element(new):
new = self._try_cast(new)
np.putmask(new_values, mask, new)

# upcast me
else:

# type of the new block
if isinstance(new,np.ndarray) and issubclass(new.dtype,np.number) or issubclass(type(new),float):
if ((isinstance(new, np.ndarray) and issubclass(new.dtype, np.number)) or
isinstance(new, float)):
typ = float
else:
typ = object
@@ -369,9 +359,8 @@ def interpolate(self, method='pad', axis=0, inplace=False,
def take(self, indexer, axis=1, fill_value=np.nan):
if axis < 1:
raise AssertionError('axis must be at least 1, got %d' % axis)
new_values = com.take_fast(self.values, indexer, None,
None, axis=axis,
fill_value=fill_value)
new_values = com.take_fast(self.values, indexer, None, False,
axis=axis, fill_value=fill_value)
return make_block(new_values, self.items, self.ref_items)

def get_values(self, dtype):
@@ -401,22 +390,21 @@ def where(self, func, other, cond = None, raise_on_error = True, try_cast = Fals

Parameters
----------
func : how to combine self,other
func : how to combine self, other
other : a ndarray/object
cond : the condition to respect, optional
raise_on_error : if True, raise when I can't perform the function, False by default (and just return
the data that we had coming in)
raise_on_error : if True, raise when I can't perform the function,
False by default (and just return the data that we had coming in)

Returns
-------
a new block, the result of the func
"""

values = self.values

# see if we can align other
if hasattr(other,'reindex_axis'):
axis = getattr(other,'_het_axis',0)
if hasattr(other, 'reindex_axis'):
axis = getattr(other, '_het_axis', 0)
other = other.reindex_axis(self.items, axis=axis, copy=True).values

# make sure that we can broadcast
@@ -428,17 +416,20 @@

# see if we can align cond
if cond is not None:
if not hasattr(cond,'shape'):
raise ValueError("where must have a condition that is ndarray like")
if hasattr(cond,'reindex_axis'):
axis = getattr(cond,'_het_axis',0)
cond = cond.reindex_axis(self.items, axis=axis, copy=True).values
if not hasattr(cond, 'shape'):
raise ValueError('where must have a condition that is ndarray'
' like')
if hasattr(cond, 'reindex_axis'):
axis = getattr(cond, '_het_axis', 0)
cond = cond.reindex_axis(self.items, axis=axis,
copy=True).values
else:
cond = cond.values

# may need to undo transpose of values
if hasattr(values, 'ndim'):
if values.ndim != cond.ndim or values.shape == cond.shape[::-1]:
if (values.ndim != cond.ndim or
values.shape == cond.shape[::-1]):
values = values.T
is_transposed = not is_transposed

@@ -494,7 +485,7 @@ class FloatBlock(NumericBlock):

def _can_hold_element(self, element):
if isinstance(element, np.ndarray):
return issubclass(element.dtype.type, (np.floating,np.integer))
return issubclass(element.dtype.type, (np.floating, np.integer))
return isinstance(element, (float, int))

def _try_cast(self, element):
@@ -541,7 +532,8 @@ def _try_cast(self, element):
def _try_cast_result(self, result):
# this is quite restrictive to convert
try:
if isinstance(result, np.ndarray) and issubclass(result.dtype.type, np.floating):
if (isinstance(result, np.ndarray) and
issubclass(result.dtype.type, np.floating)):
if com.notnull(result).all():
new_result = result.astype(self.dtype)
if (new_result == result).all():
@@ -958,7 +950,8 @@ def _get_clean_block_types(self, type_list):
return type_list

def get_bool_data(self, copy=False, as_blocks=False):
return self.get_numeric_data(copy=copy, type_list=(BoolBlock,), as_blocks=as_blocks)
return self.get_numeric_data(copy=copy, type_list=(BoolBlock,),
as_blocks=as_blocks)

def get_slice(self, slobj, axis=0):
new_axes = list(self.axes)
@@ -1429,7 +1422,7 @@ def take(self, indexer, axis=1):
if axis == 0:
raise NotImplementedError

indexer = np.asarray(indexer, dtype='i4')
indexer = com._ensure_platform_int(indexer)

n = len(self.axes[axis])
if ((indexer == -1) | (indexer >= n)).any():
@@ -1440,8 +1433,8 @@
new_axes[axis] = self.axes[axis].take(indexer)
new_blocks = []
for blk in self.blocks:
new_values = com.take_fast(blk.values, indexer,
None, False, axis=axis)
new_values = com.take_fast(blk.values, indexer, None, False,
axis=axis)
newb = make_block(new_values, blk.items, self.items)
new_blocks.append(newb)

@@ -1765,8 +1758,6 @@ def _consolidate(blocks, items):
return new_blocks


# TODO: this could be much optimized

def _merge_blocks(blocks, items):
if len(blocks) == 1:
return blocks[0]
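Block.where above aligns other and cond against the block items and then applies the caller-supplied combine function; DataFrame.where passes the np.where lambda seen in frame.py. In plain NumPy terms the combine step is roughly:

import numpy as np

values = np.array([[1., 2., 3.],
                   [4., 5., 6.]])
other = np.zeros_like(values)
cond = values > 2.5

# the combine DataFrame.where hands to the block manager
func = lambda values, others, conds: np.where(conds, values, others)
func(values, other, cond)
# -> array([[ 0.,  0.,  3.],
#           [ 4.,  5.,  6.]])
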
4 changes: 2 additions & 2 deletions pandas/core/reshape.py
@@ -93,7 +93,7 @@ def _make_sorted_values_labels(self):
indexer = algos.groupsort_indexer(comp_index, ngroups)[0]
indexer = _ensure_platform_int(indexer)

self.sorted_values = com.take_2d(self.values, indexer, axis=0)
self.sorted_values = com.take_nd(self.values, indexer, axis=0)
self.sorted_labels = [l.take(indexer) for l in to_sort]

def _make_selectors(self):
@@ -136,7 +136,7 @@ def get_result(self):
# rare case, level values not observed
if len(obs_ids) < self.full_shape[1]:
inds = (value_mask.sum(0) > 0).nonzero()[0]
values = com.take_2d(values, inds, axis=1)
values = com.take_nd(values, inds, axis=1)
columns = columns[inds]

return DataFrame(values, index=index, columns=columns)
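The indexer used here comes from groupsort_indexer and is a sort order over the rows, so the take_nd call amounts to a row reordering; a minimal NumPy equivalent (ignoring the missing-value/fill_value machinery) would be:

import numpy as np

values = np.array([[10, 11],
                   [20, 21],
                   [30, 31]])
indexer = np.array([2, 0, 1])

# what take_nd(values, indexer, axis=0) reduces to when no entry is -1
sorted_values = values.take(indexer, axis=0)
# -> rows reordered to [[30, 31], [10, 11], [20, 21]]
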
4 changes: 3 additions & 1 deletion pandas/core/series.py
@@ -794,7 +794,9 @@ def convert_objects(self, convert_dates=True, convert_numeric=True):
converted : Series
"""
if self.dtype == np.object_:
return Series(com._possibly_convert_objects(self.values,convert_dates=convert_dates,convert_numeric=convert_numeric), index=self.index, name=self.name)
return Series(com._possibly_convert_objects(self.values,
convert_dates=convert_dates, convert_numeric=convert_numeric),
index=self.index, name=self.name)
return self.copy()

def repeat(self, reps):
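A short usage sketch of the reformatted convert_objects call above; the exact coercion rules at this revision are not verified here, only the general intent (object values are handed to com._possibly_convert_objects):

import pandas as pd

s = pd.Series(['1', '2', '3.5'], dtype=object)

# object dtype, so the values go through com._possibly_convert_objects;
# with convert_numeric=True, numeric strings are expected to come back
# as a numeric (float64) Series
converted = s.convert_objects(convert_dates=True, convert_numeric=True)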