Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/CoW: use lazy copy on replace method #49555

Closed
wants to merge 2 commits into from

Conversation

ntachukwu
Copy link
Contributor

@ntachukwu
Copy link
Contributor Author

@jorisvandenbossche I will need some clarification to help me handle CoW in replace. It feels like replace should return a copy.

@ntachukwu ntachukwu marked this pull request as draft November 7, 2022 06:09
@jorisvandenbossche
Copy link
Member

replace is probably a difficult case, because it has a quite complex implementation (eg

# ---------------------------------------------------------------------
# Replace
@final
def replace(
self,
to_replace,
value,
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
blocks here this is just a call to putmask.
"""
# Note: the checks we do in NDFrame.replace ensure we never get
# here with listlike to_replace or value, as those cases
# go through replace_list
values = self.values
if isinstance(values, Categorical):
# TODO: avoid special-casing
blk = self if inplace else self.copy()
# error: Item "ExtensionArray" of "Union[ndarray[Any, Any],
# ExtensionArray]" has no attribute "_replace"
blk.values._replace( # type: ignore[union-attr]
to_replace=to_replace, value=value, inplace=True
)
return [blk]
if not self._can_hold_element(to_replace):
# We cannot hold `to_replace`, so we know immediately that
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
return [self] if inplace else [self.copy()]
if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]
elif self._can_hold_element(value):
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
# may downcast to NaN
blocks = blk.convert(numeric=False, copy=False)
else:
blocks = [blk]
return blocks
elif self.ndim == 1 or self.shape[0] == 1:
if value is None:
blk = self.astype(np.dtype(object))
else:
blk = self.coerce_to_target_dtype(value)
return blk.replace(
to_replace=to_replace,
value=value,
inplace=True,
mask=mask,
)
else:
# split so that we only upcast where necessary
blocks = []
for i, nb in enumerate(self._split()):
blocks.extend(
type(self).replace(
nb,
to_replace=to_replace,
value=value,
inplace=True,
mask=mask[i : i + 1],
)
)
return blocks
@final
def _replace_regex(
self,
to_replace,
value,
inplace: bool = False,
convert: bool = True,
mask=None,
) -> list[Block]:
"""
Replace elements by the given value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
inplace : bool, default False
Perform inplace modification.
convert : bool, default True
If true, try to coerce any object types to better types.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
Returns
-------
List[Block]
"""
if not self._can_hold_element(to_replace):
# i.e. only ObjectBlock, but could in principle include a
# String ExtensionBlock
return [self] if inplace else [self.copy()]
rx = re.compile(to_replace)
new_values = self.values if inplace else self.values.copy()
replace_regex(new_values, rx, value, mask)
block = self.make_block(new_values)
return block.convert(numeric=False, copy=False)
@final
def replace_list(
self,
src_list: Iterable[Any],
dest_list: Sequence[Any],
inplace: bool = False,
regex: bool = False,
) -> list[Block]:
"""
See BlockManager.replace_list docstring.
"""
values = self.values
# Exclude anything that we know we won't contain
pairs = [
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]
src_len = len(pairs) - 1
if is_string_dtype(values.dtype):
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)
masks = [
compare_or_regex_search(values, s[0], regex=regex, mask=mask)
for s in pairs
]
else:
# GH#38086 faster if we know we dont need to check for regex
masks = [missing.mask_missing(values, s[0]) for s in pairs]
# error: Argument 1 to "extract_bool_array" has incompatible type
# "Union[ExtensionArray, ndarray, bool]"; expected "Union[ExtensionArray,
# ndarray]"
masks = [extract_bool_array(x) for x in masks] # type: ignore[arg-type]
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
convert = i == src_len # only convert once at the end
new_rb: list[Block] = []
# GH-39338: _replace_coerce can split a block into
# single-column blocks, so track the index so we know
# where to index into the mask
for blk_num, blk in enumerate(rb):
if len(rb) == 1:
m = masks[i]
else:
mib = masks[i]
assert not isinstance(mib, bool)
m = mib[blk_num : blk_num + 1]
# error: Argument "mask" to "_replace_coerce" of "Block" has
# incompatible type "Union[ExtensionArray, ndarray[Any, Any], bool]";
# expected "ndarray[Any, dtype[bool_]]"
result = blk._replace_coerce(
to_replace=src,
value=dest,
mask=m, # type: ignore[arg-type]
inplace=inplace,
regex=regex,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks(
[b.convert(numeric=False, copy=True) for b in result]
)
new_rb.extend(result)
rb = new_rb
return rb
@final
def _replace_coerce(
self,
to_replace,
value,
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
value.
Parameters
----------
to_replace : object or pattern
Scalar to replace or regular expression to match.
value : object
Replacement object.
mask : np.ndarray[bool]
True indicate corresponding element is ignored.
inplace : bool, default True
Perform inplace modification.
regex : bool, default False
If true, perform regular expression substitution.
Returns
-------
List[Block]
"""
if should_use_regex(regex, to_replace):
return self._replace_regex(
to_replace,
value,
inplace=inplace,
convert=False,
mask=mask,
)
else:
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
if nb is self and not inplace:
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace, value=value, inplace=inplace, mask=mask
)
, and that's only part of it).
I think for replace there would be room for avoiding an actual column for those columns where no value actually gets replaced. But that will require going through the implementation to identify those cases.

I was going to suggest that it might be easier to start with another method (and I can give some suggestions), but I see you already started with set_index as well! Will take a look at that.

@github-actions
Copy link
Contributor

github-actions bot commented Dec 9, 2022

This pull request is stale because it has been open for thirty days with no activity. Please update and respond to this comment if you're still interested in working on this.

@github-actions github-actions bot added the Stale label Dec 9, 2022
@mroeschke
Copy link
Member

Thanks for the pull request, but it appears to have gone stale. If interested in continuing, please merge in the main branch, address any review comments and/or failing tests, and we can reopen.

@mroeschke mroeschke closed this Dec 28, 2022
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants