Skip to content

Commit

Permalink
[ArrowStringArray] use pyarrow.compute.match_substring_regex if available
Browse files (browse the repository at this point in the history)
  • Loading branch information
simonjayhawkins authored and yeshsurya committed May 6, 2021
1 parent 20d46fd commit c4e3339
Showing 1 changed file with 27 additions and 7 deletions.
34 changes: 27 additions & 7 deletions pandas/core/arrays/string_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
Sequence,
cast,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -766,16 +767,34 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
return lib.map_infer_mask(arr, f, mask.view("uint8"))

def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex=True):
if not regex and case:
result = pc.match_substring(self._data, pat)
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result
else:
if flags:
return super()._str_contains(pat, case, flags, na, regex)

if regex:
# match_substring_regex added in pyarrow 4.0.0
if hasattr(pc, "match_substring_regex") and case:
if re.compile(pat).groups:
warnings.warn(
"This pattern has match groups. To actually get the "
"groups, use str.extract.",
UserWarning,
stacklevel=3,
)
result = pc.match_substring_regex(self._data, pat)
else:
return super()._str_contains(pat, case, flags, na, regex)
else:
if case:
result = pc.match_substring(self._data, pat)
else:
result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
result = BooleanDtype().__from_arrow__(result)
if not isna(na):
result[isna(result)] = bool(na)
return result

def _str_startswith(self, pat, na=None):
# match_substring_regex added in pyarrow 4.0.0
if hasattr(pc, "match_substring_regex"):
result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
result = BooleanDtype().__from_arrow__(result)
Expand All @@ -786,6 +805,7 @@ def _str_startswith(self, pat, na=None):
return super()._str_startswith(pat, na)

def _str_endswith(self, pat, na=None):
# match_substring_regex added in pyarrow 4.0.0
if hasattr(pc, "match_substring_regex"):
result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
result = BooleanDtype().__from_arrow__(result)
Expand Down

0 comments on commit c4e3339

Please sign in to comment.