Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add regex flags parameter to python cudf strings split #10185

Merged
merged 40 commits into from
Feb 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
eaba42e
Add libcudf strings split API that accepts regex pattern
davidwendt Jan 26, 2022
a832436
add error-checking gtests
davidwendt Jan 26, 2022
d4e5746
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Jan 27, 2022
d33f79b
use count_matches utility
davidwendt Jan 27, 2022
9c74fdf
add split_re declaration
davidwendt Jan 27, 2022
1a89db5
split_re implementation and tests
davidwendt Jan 27, 2022
8599d0c
rename split_record_re.cu to split_re.cu
davidwendt Jan 28, 2022
b6d7453
refactored split_re/rsplit_re functions
davidwendt Jan 31, 2022
9556fc1
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Jan 31, 2022
7bc451b
remove unneeded if-check
davidwendt Jan 31, 2022
93887b1
add all empty and all null test cases
davidwendt Jan 31, 2022
0930513
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 1, 2022
c88eeae
add more maxsplit gtests
davidwendt Feb 1, 2022
f17065c
Add regex parameter to cudf strings split()
davidwendt Feb 1, 2022
7d9d30d
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 1, 2022
c775310
Merge branch 'fea-split-with-regex' into python-split-with-regex
davidwendt Feb 1, 2022
c76456d
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 3, 2022
22be900
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 3, 2022
b522d5a
Merge branch 'fea-split-with-regex' into python-split-with-regex
davidwendt Feb 3, 2022
3609f2b
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 3, 2022
773047d
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 4, 2022
623b47c
Merge branch 'fea-split-with-regex' into python-split-with-regex
davidwendt Feb 4, 2022
1e51736
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 7, 2022
eb8c326
fix doxygen typo in @throw line
davidwendt Feb 8, 2022
d6ee883
refactor max-tokens calculation into helper function
davidwendt Feb 8, 2022
ae6d77b
Merge branch 'fea-split-with-regex' into python-split-with-regex
davidwendt Feb 8, 2022
6094ed9
fix doxygen brief and examples
davidwendt Feb 8, 2022
0d1480b
Merge branch 'branch-22.04' into fea-split-with-regex
davidwendt Feb 8, 2022
f647cf0
fix doxygen brief and examples
davidwendt Feb 8, 2022
15c1839
Merge branch 'fea-split-with-regex' into python-split-with-regex
davidwendt Feb 8, 2022
7d5c254
Merge branch 'branch-22.04' into python-split-with-regex
davidwendt Feb 11, 2022
7394e74
add rsplit regex interface
davidwendt Feb 14, 2022
4056043
Merge branch 'branch-22.04' into python-split-with-regex
davidwendt Feb 14, 2022
3ff72ab
update copyright header in init.py
davidwendt Feb 14, 2022
a518415
correct copyright year in init.py
davidwendt Feb 14, 2022
42c4dcc
Merge branch 'branch-22.04' into python-split-with-regex
davidwendt Feb 14, 2022
eb67c7f
Merge branch 'branch-22.04' into python-split-with-regex
davidwendt Feb 15, 2022
75f20d5
add PANDAS_LT_140 check in rsplit test
davidwendt Feb 15, 2022
ba4c8a2
add inspect.signature check for rsplit regex parameter
davidwendt Feb 17, 2022
65d906d
Merge branch 'branch-22.04' into python-split-with-regex
davidwendt Feb 17, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion python/cudf/cudf/_lib/cpp/strings/split/split.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
Expand Down Expand Up @@ -32,3 +32,27 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \
column_view source_strings,
string_scalar delimiter,
size_type maxsplit) except +


# Cython declarations for the regex-based split functions exposed by the
# libcudf header "cudf/strings/split/split_re.hpp" (namespace cudf::strings).
# The whole block is declared `nogil`, and every function is marked
# `except +` so that a C++ exception is translated into a Python exception.
# Each function takes the strings column view, the pattern as a C++ string,
# and a `maxsplit` value.
cdef extern from "cudf/strings/split/split_re.hpp" namespace \
"cudf::strings" nogil:

# Pattern-based split that returns a table.
cdef unique_ptr[table] split_re(
const column_view& source_strings,
const string& pattern,
size_type maxsplit) except +

# Table-returning counterpart of split_re with the "r" (reverse) prefix.
cdef unique_ptr[table] rsplit_re(
const column_view& source_strings,
const string& pattern,
size_type maxsplit) except +

# Pattern-based split that returns a single column.
cdef unique_ptr[column] split_record_re(
const column_view& source_strings,
const string& pattern,
size_type maxsplit) except +

# Column-returning counterpart of split_record_re with the "r" prefix.
cdef unique_ptr[column] rsplit_record_re(
const column_view& source_strings,
const string& pattern,
size_type maxsplit) except +
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/strings/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
from cudf._lib.nvtext.generate_ngrams import (
generate_character_ngrams,
Expand Down Expand Up @@ -78,9 +79,13 @@
from cudf._lib.strings.split.partition import partition, rpartition
from cudf._lib.strings.split.split import (
rsplit,
rsplit_re,
rsplit_record,
rsplit_record_re,
split,
split_re,
split_record,
split_record_re,
)
from cudf._lib.strings.strip import lstrip, rstrip, strip
from cudf._lib.strings.substring import get, slice_from, slice_strings
Expand Down
102 changes: 101 additions & 1 deletion python/cudf/cudf/_lib/strings/split/split.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
Expand All @@ -10,9 +10,13 @@ from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.scalar.scalar cimport string_scalar
from cudf._lib.cpp.strings.split.split cimport (
rsplit as cpp_rsplit,
rsplit_re as cpp_rsplit_re,
rsplit_record as cpp_rsplit_record,
rsplit_record_re as cpp_rsplit_record_re,
split as cpp_split,
split_re as cpp_split_re,
split_record as cpp_split_record,
split_record_re as cpp_split_record_re,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
Expand Down Expand Up @@ -137,3 +141,99 @@ def rsplit_record(Column source_strings,
return Column.from_unique_ptr(
move(c_result),
)


def split_re(Column source_strings,
             object pattern,
             size_type maxsplit):
    """
    Split each string of `source_strings` around the delimiters
    matched by `pattern` and return the resulting table data.

    `pattern` is converted with ``str()`` and encoded to bytes before
    being handed to libcudf. The columns of the result are named
    ``0 .. num_columns - 1``.
    """
    cdef column_view strings_view = source_strings.view()
    cdef string regex_bytes = <string>str(pattern).encode()
    cdef unique_ptr[table] split_table

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        split_table = move(
            cpp_split_re(strings_view, regex_bytes, maxsplit)
        )

    return data_from_unique_ptr(
        move(split_table),
        column_names=range(split_table.get()[0].num_columns())
    )


def rsplit_re(Column source_strings,
              object pattern,
              size_type maxsplit):
    """
    Split each string of `source_strings` around the delimiters
    matched by `pattern` and return the resulting table data.
    Delimiters are searched starting from the end of each string.

    `pattern` is converted with ``str()`` and encoded to bytes before
    being handed to libcudf. The columns of the result are named
    ``0 .. num_columns - 1``.
    """
    cdef column_view strings_view = source_strings.view()
    cdef string regex_bytes = <string>str(pattern).encode()
    cdef unique_ptr[table] split_table

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        split_table = move(
            cpp_rsplit_re(strings_view, regex_bytes, maxsplit)
        )

    return data_from_unique_ptr(
        move(split_table),
        column_names=range(split_table.get()[0].num_columns())
    )


def split_record_re(Column source_strings,
                    object pattern,
                    size_type maxsplit):
    """
    Split each string of `source_strings` around the delimiters
    matched by `pattern` and return the result as a single Column.

    `pattern` is converted with ``str()`` and encoded to bytes before
    being handed to libcudf.
    """
    cdef column_view strings_view = source_strings.view()
    cdef string regex_bytes = <string>str(pattern).encode()
    cdef unique_ptr[column] split_column

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        split_column = move(
            cpp_split_record_re(strings_view, regex_bytes, maxsplit)
        )

    return Column.from_unique_ptr(move(split_column))


def rsplit_record_re(Column source_strings,
                     object pattern,
                     size_type maxsplit):
    """
    Split each string of `source_strings` around the delimiters
    matched by `pattern` and return the result as a single Column.
    Delimiters are searched starting from the end of each string.

    `pattern` is converted with ``str()`` and encoded to bytes before
    being handed to libcudf.
    """
    cdef column_view strings_view = source_strings.view()
    cdef string regex_bytes = <string>str(pattern).encode()
    cdef unique_ptr[column] split_column

    # The libcudf call does not touch Python objects, so release the GIL.
    with nogil:
        split_column = move(
            cpp_rsplit_record_re(strings_view, regex_bytes, maxsplit)
        )

    return Column.from_unique_ptr(move(split_column))
84 changes: 65 additions & 19 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -2290,19 +2290,24 @@ def get_json_object(self, json_path):
return res

def split(
self, pat: str = None, n: int = -1, expand: bool = None
self,
pat: str = None,
n: int = -1,
expand: bool = None,
regex: bool = None,
) -> SeriesOrIndex:
"""
Split strings around given separator/delimiter.

Splits the string in the Series/Index from the beginning, at the
specified delimiter string. Equivalent to `str.split()
specified delimiter string. Similar to `str.split()
<https://docs.python.org/3/library/stdtypes.html#str.split>`_.

Parameters
----------
pat : str, default ' ' (space)
String to split on, does not yet support regular expressions.
pat : str, default None
String or regular expression to split on. If not specified, split
on whitespace.
n : int, default -1 (all)
Limit number of splits in output. `None`, 0, and -1 will all be
interpreted as "all splits".
Expand All @@ -2313,6 +2318,13 @@ def split(
dimensionality.
* If ``False``, return Series/Index, containing lists
of strings.
regex : bool, default None
Determines if the passed-in pattern is a regular expression:

* If ``True``, assumes the passed-in pattern is a regular
expression
* If ``False``, treats the pattern as a literal string.
* If pat length is 1, treats pat as a literal string.

Returns
-------
Expand Down Expand Up @@ -2412,38 +2424,54 @@ def split(
)

# Pandas treats 0 as all
if n == 0:
if n is None or n == 0:
n = -1

if pat is None:
pat = ""

if regex and isinstance(pat, re.Pattern):
pat = pat.pattern

if len(str(pat)) <= 1:
regex = False

if expand:
if self._column.null_count == len(self._column):
result_table = cudf.core.frame.Frame({0: self._column.copy()})
else:
data, index = libstrings.split(
self._column, cudf.Scalar(pat, "str"), n
)
if regex is True:
data, index = libstrings.split_re(self._column, pat, n)
else:
data, index = libstrings.split(
self._column, cudf.Scalar(pat, "str"), n
)
if len(data) == 1 and data[0].null_count == len(self._column):
result_table = cudf.core.frame.Frame({})
else:
result_table = cudf.core.frame.Frame(data, index)
else:
result_table = libstrings.split_record(
self._column, cudf.Scalar(pat, "str"), n
)
if regex is True:
result_table = libstrings.split_record_re(self._column, pat, n)
else:
result_table = libstrings.split_record(
self._column, cudf.Scalar(pat, "str"), n
)

return self._return_or_inplace(result_table, expand=expand)

def rsplit(
self, pat: str = None, n: int = -1, expand: bool = None
self,
pat: str = None,
n: int = -1,
expand: bool = None,
regex: bool = None,
) -> SeriesOrIndex:
"""
Split strings around given separator/delimiter.

Splits the string in the Series/Index from the end, at the
specified delimiter string. Equivalent to `str.rsplit()
specified delimiter string. Similar to `str.rsplit()
<https://docs.python.org/3/library/stdtypes.html#str.rsplit>`_.

Parameters
Expand All @@ -2460,6 +2488,13 @@ def rsplit(
dimensionality.
* If ``False``, return Series/Index, containing lists
of strings.
regex : bool, default None
Determines if the passed-in pattern is a regular expression:

* If ``True``, assumes the passed-in pattern is a regular
expression
* If ``False``, treats the pattern as a literal string.
* If pat length is 1, treats pat as a literal string.

Returns
-------
Expand Down Expand Up @@ -2574,21 +2609,32 @@ def rsplit(
if pat is None:
pat = ""

if regex and isinstance(pat, re.Pattern):
pat = pat.pattern

if expand:
if self._column.null_count == len(self._column):
result_table = cudf.core.frame.Frame({0: self._column.copy()})
else:
data, index = libstrings.rsplit(
self._column, cudf.Scalar(pat, "str"), n
)
if regex is True:
data, index = libstrings.rsplit_re(self._column, pat, n)
else:
data, index = libstrings.rsplit(
self._column, cudf.Scalar(pat, "str"), n
)
if len(data) == 1 and data[0].null_count == len(self._column):
result_table = cudf.core.frame.Frame({})
else:
result_table = cudf.core.frame.Frame(data, index)
else:
result_table = libstrings.rsplit_record(
self._column, cudf.Scalar(pat), n
)
if regex is True:
result_table = libstrings.rsplit_record_re(
self._column, pat, n
)
else:
result_table = libstrings.rsplit_record(
self._column, cudf.Scalar(pat), n
)

return self._return_or_inplace(result_table, expand=expand)

Expand Down
43 changes: 43 additions & 0 deletions python/cudf/cudf/tests/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,29 @@ def test_string_split(data, pat, n, expand):
assert_eq(expect, got)


@pytest.mark.parametrize(
    "data",
    [
        ["a b", " c ", " d", "e ", "f"],
        ["a-b", "-c-", "---d", "e---", "f"],
        ["ab", "c", "d", "e", "f"],
        [None, None, None, None, None],
    ],
)
@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"])
@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10])
@pytest.mark.parametrize("expand", [True, False, None])
def test_string_split_re(data, pat, n, expand):
    """Check cudf ``str.split(..., regex=True)`` against pandas ``str.split``."""
    shared_kwargs = {"pat": pat, "n": n, "expand": expand}
    pandas_series = pd.Series(data, dtype="str")
    cudf_series = cudf.Series(data, dtype="str")

    # Pandas does not support the regex parameter until 1.4.0, so the
    # reference call is made without it.
    expected = pandas_series.str.split(**shared_kwargs)
    actual = cudf_series.str.split(regex=True, **shared_kwargs)

    assert_eq(expected, actual)


@pytest.mark.parametrize(
"str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
)
Expand Down Expand Up @@ -1507,6 +1530,26 @@ def test_strings_rsplit(data, n, expand):
)


@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10])
@pytest.mark.parametrize("expand", [True, False, None])
def test_string_rsplit_re(n, expand):
    """Check cudf ``str.rsplit(..., regex=True)`` against pandas ``str.rsplit``.

    pandas is called with the literal ``" "`` delimiter while cudf is
    called with the regex ``\\s``; the two results are expected to match
    for this data.
    """
    import inspect

    data = ["a b", " c ", " d", "e ", "f"]
    ps = pd.Series(data, dtype="str")
    gs = cudf.Series(data, dtype="str")

    # Pandas does not yet support the regex parameter for rsplit. Assert
    # that explicitly so this test fails loudly, and can be updated to pass
    # regex=True to pandas, once that changes.
    assert "regex" not in inspect.signature(pd.Series.str.rsplit).parameters

    expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
    got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True)
    assert_eq(expect, got)


@pytest.mark.parametrize(
"data",
[
Expand Down