-
Notifications
You must be signed in to change notification settings - Fork 890
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add libcudf strings split API that accepts regex pattern (#10128)
Reference #3584

This PR adds 4 new libcudf strings APIs for split.
- `cudf::strings::split_re` - split using regex to locate delimiters with table output like `cudf::strings::split`.
- `cudf::strings::rsplit_re` - same as `split_re` but delimiter search starts from the end of each string
- `cudf::strings::split_record_re` - same as `split_re` but returns a list column like `split_record` does
- `cudf::strings::rsplit_record_re` - same as `split_record_re` but delimiter search starts from the end of each string

Like `split/rsplit`, the results try to match Pandas behavior for these. The `record` results are similar to specifying `expand=False` in the Pandas `split/rsplit` APIs. Python/Cython updates for cuDF will be in a follow-on PR.

Currently, Pandas does not support regex for its `rsplit` even though it has been documented and there is an issue [here](pandas-dev/pandas#29633).

New gtests have been added for these along with some additional tests that were missing for the non-regex versions of these APIs.

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Robert Maynard (https://github.com/robertmaynard)
- AJ Schmidt (https://github.com/ajschmidt8)
- https://github.com/nvdbaranec
- Andy Grove (https://github.com/andygrove)
- Nghia Truong (https://github.com/ttnghia)

URL: #10128
- Loading branch information
1 parent
dcac052
commit 8cc84c6
Showing
5 changed files
with
834 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,232 @@ | ||
/* | ||
* Copyright (c) 2022, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
#include <cudf/table/table.hpp> | ||
|
||
namespace cudf { | ||
namespace strings { | ||
/** | ||
* @addtogroup strings_split | ||
* @{ | ||
* @file | ||
*/ | ||
|
||
/** | ||
* @brief Splits strings elements into a table of strings columns | ||
* using a regex pattern to delimit each string. | ||
* | ||
* Each element generates a vector of strings that are stored in corresponding | ||
* rows in the output table -- `table[col,row] = token[col] of strings[row]` | ||
* where `token` is a substring between delimiters. | ||
* | ||
* The number of rows in the output table will be the same as the number of | ||
* elements in the input column. The resulting number of columns will be the | ||
* maximum number of tokens found in any input row. | ||
* Rows that produce fewer tokens than that maximum hold null entries in the | ||
* remaining columns (see the last column of `s1` in the example below). | ||
* | ||
* The `pattern` is used to identify the delimiters within a string | ||
* and splitting stops when either `maxsplit` or the end of the string is reached. | ||
* When `maxsplit` is reached, the rest of the string is returned unsplit as the | ||
* final token (see `s2` in the example below). | ||
* Two adjacent delimiter matches produce an empty string token between them, | ||
* and a delimiter match at the start or end of a string produces an empty | ||
* string as the first or last token respectively. | ||
* | ||
* An empty input string will produce a corresponding empty string in the | ||
* corresponding row of the first column. | ||
* A null row will produce corresponding null rows in the output table. | ||
* | ||
* @code{.pseudo} | ||
* s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] | ||
* s1 = split_re(s, "[_ ]") | ||
* s1 is a table of strings columns: | ||
* [ ["a", "a", "", "ab"], | ||
* ["bc", "", "ab", "cd"], | ||
* ["def", "bc", "cd", ""], | ||
* ["g", null, null, null] ] | ||
* s2 = split_re(s, "[ _]", 1) | ||
* s2 is a table of strings columns: | ||
* [ ["a", "a", "", "ab"], | ||
* ["bc def_g", "_bc", "ab cd", "cd "] ] | ||
* @endcode | ||
* | ||
* @throw cudf::logic_error if `pattern` is empty. | ||
* | ||
* @param input A column of string elements to be split. | ||
* @param pattern The regex pattern for delimiting characters within each string. | ||
* @param maxsplit Maximum number of splits to perform. | ||
* Default of -1 indicates all possible splits on each string. | ||
* @param mr Device memory resource used to allocate the returned result's device memory. | ||
* @return A table of columns of strings. | ||
*/ | ||
std::unique_ptr<table> split_re( | ||
strings_column_view const& input, | ||
std::string const& pattern, | ||
size_type maxsplit = -1, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** | ||
* @brief Splits strings elements into a table of strings columns | ||
* using a regex pattern to delimit each string starting from the end of the string. | ||
* | ||
* Each element generates a vector of strings that are stored in corresponding | ||
* rows in the output table -- `table[col,row] = token[col] of strings[row]` | ||
* where `token` is the substring between each delimiter. | ||
* | ||
* The number of rows in the output table will be the same as the number of | ||
* elements in the input column. The resulting number of columns will be the | ||
* maximum number of tokens found in any input row. | ||
* Rows that produce fewer tokens than that maximum hold null entries in the | ||
* remaining columns (see the last column of `s1` in the example below). | ||
* | ||
* Splitting occurs by traversing starting from the end of the input string. | ||
* The `pattern` is used to identify the delimiters within a string | ||
* and splitting stops when either `maxsplit` or the beginning of the string | ||
* is reached. | ||
* When `maxsplit` is reached, the leading portion of the string that has not | ||
* been visited yet is returned unsplit as the first token (see `s2` in the | ||
* example below). Tokens are still stored in left-to-right order. | ||
* | ||
* An empty input string will produce a corresponding empty string in the | ||
* corresponding row of the first column. | ||
* A null row will produce corresponding null rows in the output table. | ||
* | ||
* @code{.pseudo} | ||
* s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] | ||
* s1 = rsplit_re(s, "[_ ]") | ||
* s1 is a table of strings columns: | ||
* [ ["a", "a", "", "ab"], | ||
* ["bc", "", "ab", "cd"], | ||
* ["def", "bc", "cd", ""], | ||
* ["g", null, null, null] ] | ||
* s2 = rsplit_re(s, "[ _]", 1) | ||
* s2 is a table of strings columns: | ||
* [ ["a_bc def", "a_", "_ab", "ab_cd"], | ||
* ["g", "bc", "cd", ""] ] | ||
* @endcode | ||
* | ||
* @throw cudf::logic_error if `pattern` is empty. | ||
* | ||
* @param input A column of string elements to be split. | ||
* @param pattern The regex pattern for delimiting characters within each string. | ||
* @param maxsplit Maximum number of splits to perform. | ||
* Default of -1 indicates all possible splits on each string. | ||
* @param mr Device memory resource used to allocate the returned result's device memory. | ||
* @return A table of columns of strings. | ||
*/ | ||
std::unique_ptr<table> rsplit_re( | ||
strings_column_view const& input, | ||
std::string const& pattern, | ||
size_type maxsplit = -1, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** | ||
* @brief Splits strings elements into a list column of strings | ||
* using the given regex pattern to delimit each string. | ||
* | ||
* Each element generates an array of strings that are stored in an output | ||
* lists column -- `list[row] = [token1, token2, ...] found in input[row]` | ||
* where `token` is a substring between delimiters. | ||
* | ||
* The number of elements in the output column will be the same as the number of | ||
* elements in the input column. Each individual list item will contain the | ||
* new strings for that row. The resulting number of strings in each row can vary | ||
* from 0 to `maxsplit + 1`. | ||
* This upper bound applies when `maxsplit` is non-negative; with the default | ||
* of -1 all possible splits are performed, so the number of strings depends | ||
* only on the delimiters found in that row (see `s1` in the example below). | ||
* | ||
* The `pattern` is used to identify the delimiters within a string | ||
* and splitting stops when either `maxsplit` or the end of the string is reached. | ||
* When `maxsplit` is reached, the rest of the string is returned unsplit as the | ||
* final token (see `s2` in the example below). | ||
* | ||
* An empty input string will produce a corresponding empty list item output row. | ||
* A null row will produce a corresponding null output row. | ||
* | ||
* @code{.pseudo} | ||
* s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] | ||
* s1 = split_record_re(s, "[_ ]") | ||
* s1 is a lists column of strings: | ||
* [ ["a", "bc", "def", "g"], | ||
* ["a", "", "bc"], | ||
* ["", "ab", "cd"], | ||
* ["ab", "cd", ""] ] | ||
* s2 = split_record_re(s, "[ _]", 1) | ||
* s2 is a lists column of strings: | ||
* [ ["a", "bc def_g"], | ||
* ["a", "_bc"], | ||
* ["", "ab cd"], | ||
* ["ab", "cd "] ] | ||
* @endcode | ||
* | ||
* @throw cudf::logic_error if `pattern` is empty. | ||
* | ||
* @param input A column of string elements to be split. | ||
* @param pattern The regex pattern for delimiting characters within each string. | ||
* @param maxsplit Maximum number of splits to perform. | ||
* Default of -1 indicates all possible splits on each string. | ||
* @param mr Device memory resource used to allocate the returned result's device memory. | ||
* @return Lists column of strings. | ||
*/ | ||
std::unique_ptr<column> split_record_re( | ||
strings_column_view const& input, | ||
std::string const& pattern, | ||
size_type maxsplit = -1, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** | ||
* @brief Splits strings elements into a list column of strings | ||
* using the given regex pattern to delimit each string starting from the end of the string. | ||
* | ||
* Each element generates a vector of strings that are stored in an output | ||
* lists column -- `list[row] = [token1, token2, ...] found in input[row]` | ||
* where `token` is a substring between delimiters. | ||
* | ||
* The number of elements in the output column will be the same as the number of | ||
* elements in the input column. Each individual list item will contain the | ||
* new strings for that row. The resulting number of strings in each row can vary | ||
* from 0 to `maxsplit + 1`. | ||
* This upper bound applies when `maxsplit` is non-negative; with the default | ||
* of -1 all possible splits are performed, so the number of strings depends | ||
* only on the delimiters found in that row (see `s1` in the example below). | ||
* | ||
* Splitting occurs by traversing starting from the end of the input string. | ||
* The `pattern` is used to identify the delimiters within a string | ||
* and splitting stops when either `maxsplit` or the beginning of the string | ||
* is reached. | ||
* When `maxsplit` is reached, the leading portion of the string that has not | ||
* been visited yet is returned unsplit as the first token (see `s2` in the | ||
* example below). Tokens are still stored in left-to-right order. | ||
* | ||
* An empty input string will produce a corresponding empty list item output row. | ||
* A null row will produce a corresponding null output row. | ||
* | ||
* @code{.pseudo} | ||
* s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] | ||
* s1 = rsplit_record_re(s, "[_ ]") | ||
* s1 is a lists column of strings: | ||
* [ ["a", "bc", "def", "g"], | ||
* ["a", "", "bc"], | ||
* ["", "ab", "cd"], | ||
* ["ab", "cd", ""] ] | ||
* s2 = rsplit_record_re(s, "[ _]", 1) | ||
* s2 is a lists column of strings: | ||
* [ ["a_bc def", "g"], | ||
* ["a_", "bc"], | ||
* ["_ab", "cd"], | ||
* ["ab_cd", ""] ] | ||
* @endcode | ||
* | ||
* @throw cudf::logic_error if `pattern` is empty. | ||
* | ||
* @param input A column of string elements to be split. | ||
* @param pattern The regex pattern for delimiting characters within each string. | ||
* @param maxsplit Maximum number of splits to perform. | ||
* Default of -1 indicates all possible splits on each string. | ||
* @param mr Device memory resource used to allocate the returned result's device memory. | ||
* @return Lists column of strings. | ||
*/ | ||
std::unique_ptr<column> rsplit_record_re( | ||
strings_column_view const& input, | ||
std::string const& pattern, | ||
size_type maxsplit = -1, | ||
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); | ||
|
||
/** @} */ // end of doxygen group | ||
} // namespace strings | ||
} // namespace cudf |
Oops, something went wrong.