From eaba42e4d7631302c81c4caf2f3d29fb24f3c45d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 25 Jan 2022 19:47:19 -0500 Subject: [PATCH 01/20] Add libcudf strings split API that accepts regex pattern --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/split/split_re.hpp | 82 ++++++++ cpp/src/strings/split/split_record_re.cu | 215 ++++++++++++++++++++ cpp/tests/strings/split_tests.cpp | 37 +++- 5 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 cpp/include/cudf/strings/split/split_re.hpp create mode 100644 cpp/src/strings/split/split_record_re.cu diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 2cbe5173de0..01ad8d4e270 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -214,6 +214,7 @@ test: - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp - test -f $PREFIX/include/cudf/strings/split/split.hpp + - test -f $PREFIX/include/cudf/strings/split/split_re.hpp - test -f $PREFIX/include/cudf/strings/string_view.hpp - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp - test -f $PREFIX/include/cudf/strings/strip.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4637408110..b25d6ff3703 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -444,6 +444,7 @@ add_library( src/strings/split/partition.cu src/strings/split/split.cu src/strings/split/split_record.cu + src/strings/split/split_record_re.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp src/strings/strings_scalar_factories.cpp diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp new file mode 100644 index 00000000000..b69bd1c5991 --- /dev/null +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2019-2020, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_split + * @{ + * @file + */ + +/** + * @brief Splits individual strings elements into a list of strings + * using a regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null list item output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_record(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = split_record(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a", "bc_def_g"], + * ["a", "_bc"], + * ["", "ab_cd"], + * ["ab", "cd_"] ] + * @endcode + * + * @throw cudf:logic_error if `pattern` is empty. 
+ * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings + * Each vector of the lists column holds splits from a single row + * element of the input column. + */ +std::unique_ptr split_record_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu new file mode 100644 index 00000000000..d197ee9c7e3 --- /dev/null +++ b/cpp/src/strings/split/split_record_re.cu @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +using string_index_pair = thrust::pair; + +namespace { + +/** + * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + */ +template +struct token_counter_fn { + column_device_view const d_strings; // strings to split + reprog_device prog; + size_type const max_tokens; + + __device__ size_type operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return 0; } + + auto const d_str = d_strings.element(idx); + size_type token_count = 0; + + int32_t begin = 0; + int32_t end = -1; + while (token_count < max_tokens - 1) { + if (prog.find(idx, d_str, begin, end) <= 0) { break; } + token_count++; + begin = end + (begin == end); + end = -1; + } + return token_count + 1; // always at least one token + } +}; + +/** + * @brief Identify the tokens from the `idx'th` string element of `d_strings`. 
+ */ +template +struct token_reader_fn { + column_device_view const d_strings; + reprog_device prog; + int32_t const* d_token_offsets; + string_index_pair* d_tokens; + + __device__ void operator()(size_type idx) + { + if (d_strings.is_null(idx)) { return; } + + auto const token_offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - token_offset; + auto d_result = d_tokens + token_offset; + auto const d_str = d_strings.element(idx); + if (d_str.empty()) { + // return empty string output for empty string input + *d_result = string_index_pair{"", 0}; + return; + } + + size_type token_idx = 0; + size_type begin = 0; + size_type end = d_str.length(); + size_type last_pos = 0; + while (token_idx < token_count - 1) { + if (prog.find(idx, d_str, begin, end) <= 0) { break; } + + auto const start_pos = d_str.byte_offset(begin); + auto const end_pos = d_str.byte_offset(end); + d_result[token_idx] = string_index_pair{d_str.data() + last_pos, start_pos - last_pos}; + + begin = end + (begin == end); + end = d_str.length(); + token_idx++; + last_pos = end_pos; + } + + // set last token to remainder of the string + if (last_pos <= d_str.size_bytes()) { + d_result[token_idx] = + string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; + } + } +}; + +} // namespace + +// The output is one list item per string +std::unique_ptr split_record_re( + strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? 
maxsplit + 1 : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto d_offsets = offsets->mutable_view().data(); + + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + + // create offsets column by counting the number of tokens per string + auto const regex_insts = d_prog->insts_counts(); + if (regex_insts <= RX_SMALL_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } else { + token_counter_fn counter{*d_strings, *d_prog, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); + } + // convert counts into offsets + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // last entry is the total number of tokens to be generated + auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + + printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n", + regex_insts, + total_tokens, + *begin, + *end); + // split each string into an array of index-pair values + rmm::device_uvector tokens(total_tokens, stream); + if (regex_insts <= RX_SMALL_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, 
d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else if (regex_insts <= RX_LARGE_INSTS) { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } else { + token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); + } + + // convert the index-pairs into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +// external APIs + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index de4e48fd70a..7eddc947d40 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -339,6 +340,40 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRecordRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", " "}, LCW{"t", "st String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + +TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_record_re(sv, "\\s", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, RSplitRecord) { std::vector h_strings{ From a83243646975f0c398910bb0cb80affa40c214bf Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 26 Jan 2022 11:25:18 -0500 Subject: [PATCH 02/20] add error-checking gtests --- cpp/tests/strings/split_tests.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 
7eddc947d40..badb84536ba 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -654,6 +654,11 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), + cudf::logic_error); + EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), From d33f79bb9c89d014a4e6a374067e8a88c366aafa Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 10:49:01 -0500 Subject: [PATCH 03/20] use count_matches utility --- cpp/src/strings/split/split_record_re.cu | 63 ++++++++---------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu index d197ee9c7e3..eff0c511393 100644 --- a/cpp/src/strings/split/split_record_re.cu +++ b/cpp/src/strings/split/split_record_re.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -42,30 +43,23 @@ using string_index_pair = thrust::pair; namespace { /** - * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`. + * @brief Convert match counts to token counts. + * + * The matches are the delimiters and the tokens are what is left: + * `token1, delimiter, token2, delimiter, token3, etc` + * Usually `token_count = match_count + 1` even with empty strings. + * However, we need to account for the max_tokens and null rows. 
*/ -template -struct token_counter_fn { - column_device_view const d_strings; // strings to split - reprog_device prog; +struct match_to_token_count_fn { + column_device_view const d_strings; + size_type const* d_counts; size_type const max_tokens; __device__ size_type operator()(size_type idx) { if (d_strings.is_null(idx)) { return 0; } - - auto const d_str = d_strings.element(idx); - size_type token_count = 0; - - int32_t begin = 0; - int32_t end = -1; - while (token_count < max_tokens - 1) { - if (prog.find(idx, d_str, begin, end) <= 0) { break; } - token_count++; - begin = end + (begin == end); - end = -1; - } - return token_count + 1; // always at least one token + auto const match_count = d_counts[idx]; + return std::min(match_count, max_tokens) + 1; } }; @@ -130,34 +124,23 @@ std::unique_ptr split_record_re( { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits::max(); + auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); - auto offsets = make_numeric_column( - data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto d_offsets = offsets->mutable_view().data(); auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - - // create offsets column by counting the number of tokens per string - auto const regex_insts = d_prog->insts_counts(); - if (regex_insts <= RX_SMALL_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else if (regex_insts <= RX_LARGE_INSTS) { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } else { - token_counter_fn counter{*d_strings, *d_prog, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter); - } + // convert match counts to tokens + thrust::transform(rmm::exec_policy(stream), + begin, + end, + d_offsets, + match_to_token_count_fn{*d_strings, d_offsets, max_tokens}); // convert counts into offsets thrust::exclusive_scan( rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); @@ -165,13 +148,9 @@ std::unique_ptr split_record_re( // last entry is the total number of tokens to be generated auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); - printf("instruction = %d\ntotal_tokens = %d\nbegin,end 
= %d,%d\n", - regex_insts, - total_tokens, - *begin, - *end); // split each string into an array of index-pair values rmm::device_uvector tokens(total_tokens, stream); + auto const regex_insts = d_prog->insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); From 9c74fdffbc2c3ddc7e4a248a4c837996c8c25bf4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 10:49:16 -0500 Subject: [PATCH 04/20] add split_re declaration --- cpp/include/cudf/strings/split/split_re.hpp | 62 +++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index b69bd1c5991..54d590fcf71 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include namespace cudf { namespace strings { @@ -26,6 +27,59 @@ namespace strings { * @file */ +/** + * @brief Splits individual strings elements into a table of strings columns + * using a regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in corresponding + * rows in the output table. + * + * The number of elements in the output table will be the same as the number of + * elements in the input column. The row for each column will contain the + * new strings produced from that input row. + * + * The resulting number of columns will be the maximum number of tokens found + * in any input row. 
+ * + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the end of the string is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row in the first column. + * + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = split_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = split_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc def_g", "_bc", "ab cd", "cd "] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return A table of strings columns. + * Each column of the table holds the tokens at one split position + * for every row of the input column. + */ +std::unique_ptr split_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Splits individual strings elements into a list of strings * using a regex pattern to delimit each string. @@ -54,10 +108,10 @@ namespace strings { * ["ab", "cd", ""] ] * s2 = split_record(s, "[ _]", 1) * s2 is a lists column of strings: - * [ ["a", "bc_def_g"], + * [ ["a", "bc def_g"], * ["a", "_bc"], - * ["", "ab_cd"], - * ["ab", "cd_"] ] + * ["", "ab cd"], + * ["ab", "cd "] ] * @endcode * * @throw cudf:logic_error if `pattern` is empty. 
From 1a89db5f53ed21952183d8ab9f2d4e6e800b1175 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 27 Jan 2022 18:01:11 -0500 Subject: [PATCH 05/20] split_re implementation and tests --- cpp/src/strings/split/split_record_re.cu | 193 ++++++++++++++++++----- cpp/tests/strings/split_tests.cpp | 110 +++++++++---- 2 files changed, 235 insertions(+), 68 deletions(-) diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu index eff0c511393..3f916d0138f 100644 --- a/cpp/src/strings/split/split_record_re.cu +++ b/cpp/src/strings/split/split_record_re.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ template struct token_reader_fn { column_device_view const d_strings; reprog_device prog; - int32_t const* d_token_offsets; + offset_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type idx) @@ -81,11 +82,6 @@ struct token_reader_fn { auto const token_count = d_token_offsets[idx + 1] - token_offset; auto d_result = d_tokens + token_offset; auto const d_str = d_strings.element(idx); - if (d_str.empty()) { - // return empty string output for empty string input - *d_result = string_index_pair{"", 0}; - return; - } size_type token_idx = 0; size_type begin = 0; @@ -112,61 +108,105 @@ struct token_reader_fn { } }; -} // namespace - -// The output is one list item per string -std::unique_ptr split_record_re( - strings_column_view const& input, - std::string const& pattern, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - - auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); - auto const strings_count = input.size(); +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_strings = column_device_view::create(input.parent(), stream); + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; } + if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; } + return d_tokens[offset + column_index]; + } +}; - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); - auto d_offsets = offsets->mutable_view().data(); +/** + * @brief Call regex to split each input string into tokens. + * + * This will also convert the `offsets` values from counts to offsets. + * + * @param d_strings Strings to split + * @param d_prog Regex to evaluate against each string + * @param max_tokens The maximum number of tokens for each split. + * @param offsets The number of matches on input. + * The offsets for each token in each string on output. + * @param stream CUDA stream used for kernel launches. 
+ */ +rmm::device_uvector split_utility(column_device_view const& d_strings, + reprog_device& d_prog, + size_type max_tokens, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) +{ + auto d_offsets = offsets.data(); + auto const strings_count = d_strings.size(); auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); + // convert match counts to tokens - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_offsets, - match_to_token_count_fn{*d_strings, d_offsets, max_tokens}); + match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens}; + thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn); + // convert counts into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + thrust::exclusive_scan(rmm::exec_policy(stream), + offsets.begin(), + offsets.end(), + offsets.begin()); - // last entry is the total number of tokens to be generated - auto total_tokens = cudf::detail::get_value(offsets->view(), strings_count, stream); + // the last entry is the total number of tokens to be generated + auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); - // split each string into an array of index-pair values + // generate tokens for each string rmm::device_uvector tokens(total_tokens, stream); - auto const regex_insts = d_prog->insts_counts(); + auto const regex_insts = d_prog.insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; 
thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else { - token_reader_fn reader{*d_strings, *d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } - // convert the index-pairs into one big strings column + return tokens; +} + +} // namespace + +// The output is one list item per string +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + + // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + // create a lists column using the offsets and the strings columns return make_lists_column(strings_count, std::move(offsets), @@ -177,10 +217,83 @@ std::unique_ptr split_record_re( mr); } +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + std::vector> results; + if (strings_count == 0) { + results.push_back(make_empty_column(type_id::STRING)); + return std::make_unique
(std::move(results)); + } + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + + // the columns_count is the maximum number of tokens for any string in the input column + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto d_offsets = offsets_view.data(); + auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }; + auto const columns_count = thrust::transform_reduce( + rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum{}); + + // boundary case: if no columns, return one all-null column (custrings issue #119) + if (columns_count == 0) { + results.push_back(std::make_unique( + data_type{type_id::STRING}, + strings_count, + rmm::device_buffer{0, stream, mr}, // no data + cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr), + strings_count)); + return std::make_unique
(std::move(results)); + } + + // convert the tokens into multiple strings columns + auto make_strings_lambda = [&](size_type column_index) { + // returns appropriate token for each row/column + auto indices_itr = cudf::detail::make_counting_transform_iterator( + 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); + return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); + }; + // create each column of tokens + results.resize(columns_count); + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(columns_count), + results.begin(), + make_strings_lambda); + + return std::make_unique
(std::move(results)); +} + } // namespace detail // external APIs +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + std::unique_ptr split_record_re(strings_column_view const& input, std::string const& pattern, size_type maxsplit, diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index badb84536ba..f541a6b0e81 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -247,33 +247,13 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); auto results = cudf::strings::split(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); + EXPECT_TRUE(results->num_rows() == 0); results = cudf::strings::rsplit(zero_size_strings_column); EXPECT_TRUE(results->num_columns() == 1); - cudf::test::expect_strings_empty(results->get_column(0)); -} - -// This test specifically for https://github.com/rapidsai/custrings/issues/119 -TEST_F(StringsSplitTest, AllNullsCase) -{ - std::vector h_strings{nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - - auto results = cudf::strings::split(cudf::strings_column_view(strings)); - EXPECT_TRUE(results->num_columns() == 1); - auto column = results->get_column(0).view(); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); - results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-")); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); EXPECT_TRUE(results->num_columns() == 1); - column = 
results->get_column(0); - EXPECT_TRUE(column.size() == 3); - EXPECT_TRUE(column.has_nulls()); - EXPECT_TRUE(column.null_count() == column.size()); + EXPECT_TRUE(results->num_rows() == 0); } TEST_F(StringsSplitTest, SplitRecord) @@ -340,6 +320,54 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } +TEST_F(StringsSplitTest, SplitRegex) +{ + std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::split_re(sv, "\\s+"); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + } + + { + auto result = cudf::strings::split_re(sv, "[eé]"); + + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, + {1, 0, 1, 1, 0}); + cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); + cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); + auto expected = cudf::table_view({col0, col1, col2, col3}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + 
cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + auto result = cudf::strings::split_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); +} + TEST_F(StringsSplitTest, SplitRecordRegex) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -469,10 +497,35 @@ TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto split_record_result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(split_record_result->size() == 0); - auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(rsplit_record_result->size() == 0); + auto result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(result->size() == 0); + result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(result->size() == 0); + result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(result->size() == 0); +} + +// This test specifically for https://github.com/rapidsai/custrings/issues/119 +TEST_F(StringsSplitTest, AllNullsCase) +{ + cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0}); + auto sv = cudf::strings_column_view(input); + + auto results = cudf::strings::split(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), 
input); + results = cudf::strings::rsplit(sv); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit(sv, cudf::string_scalar("-")); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::split_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); } TEST_F(StringsSplitTest, Partition) @@ -658,6 +711,7 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)), cudf::logic_error); + EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); From 8599d0cba24ef963c28361455769969a3764a430 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 28 Jan 2022 08:16:31 -0500 Subject: [PATCH 06/20] rename split_record_re.cu to split_re.cu --- cpp/CMakeLists.txt | 2 +- cpp/src/strings/split/{split_record_re.cu => split_re.cu} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/src/strings/split/{split_record_re.cu => split_re.cu} (100%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f96ef4945b9..407e1f9a858 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -445,8 +445,8 @@ add_library( src/strings/search/find_multiple.cu src/strings/split/partition.cu src/strings/split/split.cu + src/strings/split/split_re.cu src/strings/split/split_record.cu - src/strings/split/split_record_re.cu src/strings/strings_column_factories.cu src/strings/strings_column_view.cpp src/strings/strings_scalar_factories.cpp diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_re.cu 
similarity index 100% rename from cpp/src/strings/split/split_record_re.cu rename to cpp/src/strings/split/split_re.cu From b6d7453b66c3548e4e47499de66e7eae0fa0b2fb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 14:05:56 -0500 Subject: [PATCH 07/20] refactored split_re/rsplit_re functions --- cpp/include/cudf/strings/split/split_re.hpp | 130 +++++++-- cpp/src/strings/split/split_re.cu | 281 ++++++++++++-------- cpp/tests/strings/split_tests.cpp | 162 +++++++---- 3 files changed, 394 insertions(+), 179 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index 54d590fcf71..cf6d23ccd28 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -32,22 +32,19 @@ namespace strings { * using a regex pattern to delimit each string. * * Each element generates an array of strings that are stored in corresponding - * rows in the output table. + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. * * The number of elements in the output table will be the same as the number of - * elements in the input column. The row for each column will contain the - * new strings produced from that input row. - * - * The resulting number of columns will be the maximum number of tokens found - * in any input row. + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. * * The `pattern` is used to identify the separation points within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty string in the - * corresponding row in the first column. - * - * A null row will produce a corresponding null rows in the output table. + * corresponding row of the first column. 
+ * A null row will produce corresponding null rows in the output table. * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] @@ -60,7 +57,7 @@ namespace strings { * s2 = split_re(s, "[ _]", 1) * s2 is a table of strings columns: * [ ["a", "a", "", "ab"], - * ["bc def_g", "_bc", "ab_cd", "cd "] ] + * ["bc def_g", "_bc", "ab cd", "cd "] ] * @endcode * * @throw cudf:logic_error if `pattern` is empty. @@ -70,9 +67,7 @@ namespace strings { * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row - * element of the input column. + * @return A table of columns of strings. */ std::unique_ptr
split_re( strings_column_view const& strings, @@ -81,9 +76,59 @@ std::unique_ptr
split_re( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits individual strings elements into a table of strings columns * using a regex pattern to delimit each string. * + * Each element generates an array of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of string[row]` + * where `token` is the substring between each delimiter. + * + * The number of elements in the output table will be the same as the number of + * elements in the input column. The resulting number of columns will be the + * maximum number of tokens found in any input row. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within the string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty string in the + * corresponding row of the first column. + * A null row will produce corresponding null rows in the output table. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_re(s, "[_ ]") + * s1 is a table of strings columns: + * [ ["a", "a", "", "ab"], + * ["bc", "", "ab", "cd"], + * ["def", "bc", "cd", ""], + * ["g", null, null, null] ] + * s2 = rsplit_re(s, "[ _]", 1) + * s2 is a table of strings columns: + * [ ["a_bc def", "a_", "_ab", "ab_cd"], + * ["g", "bc", "cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. 
+ * @return A table of columns of strings. + */ +std::unique_ptr
rsplit_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Splits individual strings elements into a list of strings + * using the given regex pattern to delimit each string. + * * Each element generates an array of strings that are stored in an output * lists column. * @@ -96,7 +141,7 @@ std::unique_ptr
split_re( * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty list item output row. - * A null row will produce a corresponding null list item output row. + * A null row will produce a corresponding null output row. * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] @@ -121,9 +166,7 @@ std::unique_ptr
split_re( * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. * @param mr Device memory resource used to allocate the returned result's device memory. - * @return Lists column of strings - * Each vector of the lists column holds splits from a single row - * element of the input column. + * @return Lists column of strings. */ std::unique_ptr split_record_re( strings_column_view const& strings, @@ -131,6 +174,57 @@ std::unique_ptr split_record_re( size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Splits individual strings elements into a list of strings + * using the given regex pattern to delimit each string. + * + * Each element generates an array of strings that are stored in an output + * lists column. + * + * The number of elements in the output column will be the same as the number of + * elements in the input column. Each individual list item will contain the + * new strings for that row. The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * s1 = rsplit_record_re(s, "[_ ]") + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * s2 = rsplit_record_re(s, "[ _]", 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * @throw cudf::logic_error if `pattern` is empty. 
+ * + * @param strings A column of string elements to be split. + * @param pattern The regex pattern for delimiting characters within each string. + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory. + * @return Lists column of strings. + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& strings, + std::string const& pattern, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 3f916d0138f..9427a900d8d 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -32,36 +32,18 @@ #include #include -#include -#include +#include namespace cudf { namespace strings { namespace detail { - -using string_index_pair = thrust::pair; - namespace { -/** - * @brief Convert match counts to token counts. - * - * The matches are the delimiters and the tokens are what is left: - * `token1, delimiter, token2, delimiter, token3, etc` - * Usually `token_count = match_count + 1` even with empty strings. - * However, we need to account for the max_tokens and null rows. 
- */ -struct match_to_token_count_fn { - column_device_view const d_strings; - size_type const* d_counts; - size_type const max_tokens; +using string_index_pair = thrust::pair; - __device__ size_type operator()(size_type idx) - { - if (d_strings.is_null(idx)) { return 0; } - auto const match_count = d_counts[idx]; - return std::min(match_count, max_tokens) + 1; - } +enum class split_direction { + FORWARD, ///< for split logic + BACKWARD ///< for rsplit logic }; /** @@ -71,56 +53,58 @@ template struct token_reader_fn { column_device_view const d_strings; reprog_device prog; + split_direction const direction; offset_type const* d_token_offsets; string_index_pair* d_tokens; __device__ void operator()(size_type idx) { if (d_strings.is_null(idx)) { return; } + auto const d_str = d_strings.element(idx); auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; - auto const d_str = d_strings.element(idx); + auto d_result = d_tokens + token_offset; // store tokens here size_type token_idx = 0; - size_type begin = 0; + size_type begin = 0; // characters size_type end = d_str.length(); - size_type last_pos = 0; - while (token_idx < token_count - 1) { - if (prog.find(idx, d_str, begin, end) <= 0) { break; } - - auto const start_pos = d_str.byte_offset(begin); - auto const end_pos = d_str.byte_offset(end); - d_result[token_idx] = string_index_pair{d_str.data() + last_pos, start_pos - last_pos}; - - begin = end + (begin == end); - end = d_str.length(); - token_idx++; - last_pos = end_pos; + size_type last_pos = 0; // bytes + while (prog.find(idx, d_str, begin, end) > 0) { + // get the token (characters just before this match) + auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + // store it if we have space + if (token_idx < token_count - 1) { + d_result[token_idx++] = token; + } else { + if (direction == split_direction::FORWARD) { 
break; } // we are done + for (auto l = 0; l < token_idx - 1; ++l) { + d_result[l] = d_result[l + 1]; // shift left + } + d_result[token_idx - 1] = token; + } + // setup for next match + last_pos = d_str.byte_offset(end); + begin = end + (begin == end); + end = d_str.length(); } - // set last token to remainder of the string + // set the last token to the remainder of the string if (last_pos <= d_str.size_bytes()) { d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; } - } -}; - -struct tokens_transform_fn { - column_device_view const d_strings; - string_index_pair const* d_tokens; - offset_type const* d_token_offsets; - size_type const column_index; - __device__ string_index_pair operator()(size_type idx) const - { - auto const offset = d_token_offsets[idx]; - auto const token_count = d_token_offsets[idx + 1] - offset; - if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; } - if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; } - return d_tokens[offset + column_index]; + if (direction == split_direction::BACKWARD) { + // update first entry -- this happens when max-tokens is hit before the end + auto const first_offset = + d_result[0].first + ? 
static_cast(thrust::distance(d_str.data(), d_result[0].first)) + : 0; + if (first_offset) { + d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second}; + } + } } }; @@ -138,6 +122,7 @@ struct tokens_transform_fn { */ rmm::device_uvector split_utility(column_device_view const& d_strings, reprog_device& d_prog, + split_direction direction, size_type max_tokens, mutable_column_view& offsets, rmm::cuda_stream_view stream) @@ -148,15 +133,12 @@ rmm::device_uvector split_utility(column_device_view const& d auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); - // convert match counts to tokens - match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens}; - thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn); - - // convert counts into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - offsets.begin(), - offsets.end(), - offsets.begin()); + // convert match counts to token offsets + auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { + return d_strings.is_null(idx) ? 
0 : std::min(d_offsets[idx], max_tokens) + 1; + }; + thrust::transform_exclusive_scan( + rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); // the last entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); @@ -165,60 +147,48 @@ rmm::device_uvector split_utility(column_device_view const& d rmm::device_uvector tokens(total_tokens, stream); auto const regex_insts = d_prog.insts_counts(); if (regex_insts <= RX_SMALL_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_MEDIUM_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else if (regex_insts <= RX_LARGE_INSTS) { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } else { - token_reader_fn reader{d_strings, d_prog, d_offsets, tokens.data()}; + token_reader_fn reader{d_strings, d_prog, direction, d_offsets, tokens.data()}; thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader); } return tokens; } -} // namespace - -// The output is one list item per string -std::unique_ptr split_record_re(strings_column_view const& input, - std::string const& pattern, - size_type maxsplit, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - - auto const max_tokens = maxsplit > 0 ? 
maxsplit : std::numeric_limits::max(); - auto const strings_count = input.size(); - - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_strings = column_device_view::create(input.parent(), stream); - - auto offsets = count_matches(*d_strings, *d_prog, stream, mr); - auto offsets_view = offsets->mutable_view(); - - // get split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); - - // convert the tokens into one big strings column - auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); +/** + * @brief Returns string pair for the specified column for each string in `d_strings` + * + * This is used to build the table result of a split. + * Null is returned if the row is null or if the `column_index` is not + * less than the token count for that string. + */ +struct tokens_transform_fn { + column_device_view const d_strings; + string_index_pair const* d_tokens; + offset_type const* d_token_offsets; + size_type const column_index; - // create a lists column using the offsets and the strings columns - return make_lists_column(strings_count, - std::move(offsets), - std::move(strings_output), - input.null_count(), - copy_bitmask(input.parent(), stream, mr), - stream, - mr); -} + __device__ string_index_pair operator()(size_type idx) const + { + auto const offset = d_token_offsets[idx]; + auto const token_count = d_token_offsets[idx + 1] - offset; + return (column_index > token_count - 1) || d_strings.is_null(idx) + ? string_index_pair{nullptr, 0} + : d_tokens[offset + column_index]; + } +}; std::unique_ptr
split_re(strings_column_view const& input, std::string const& pattern, + split_direction direction, size_type maxsplit, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -239,19 +209,21 @@ std::unique_ptr
split_re(strings_column_view const& input, auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); + auto d_offsets = offsets_view.data(); // get split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream); + auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // the columns_count is the maximum number of tokens for any string in the input column - auto const begin = thrust::make_counting_iterator(0); - auto const end = thrust::make_counting_iterator(strings_count); - auto d_offsets = offsets_view.data(); - auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type { - return d_offsets[idx + 1] - d_offsets[idx]; - }; auto const columns_count = thrust::transform_reduce( - rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum{}); + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(strings_count), + [d_offsets] __device__(auto const idx) -> size_type { + return d_offsets[idx + 1] - d_offsets[idx]; + }, + 0, + thrust::maximum{}); // boundary case: if no columns, return one all-null column (custrings issue #119) if (columns_count == 0) { @@ -271,7 +243,7 @@ std::unique_ptr
split_re(strings_column_view const& input, 0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index}); return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr); }; - // create each column of tokens + // build a vector of columns results.resize(columns_count); std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(columns_count), @@ -281,6 +253,78 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + split_direction direction, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); + + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const strings_count = input.size(); + + auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); + auto d_strings = column_device_view::create(input.parent(), stream); + + auto offsets = count_matches(*d_strings, *d_prog, stream, mr); + auto offsets_view = offsets->mutable_view(); + + // get split tokens from the input column + auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + + // convert the tokens into one big strings column + auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); + + // create a lists column using the offsets and the strings columns + return make_lists_column(strings_count, + std::move(offsets), + std::move(strings_output), + input.null_count(), + copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace + +std::unique_ptr
split_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr split_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr); +} + +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr); +} + } // namespace detail // external APIs @@ -303,5 +347,22 @@ std::unique_ptr split_record_re(strings_column_view const& input, return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); } +std::unique_ptr
rsplit_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} + +std::unique_ptr rsplit_record_re(strings_column_view const& input, + std::string const& pattern, + size_type maxsplit, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr); +} } // namespace strings } // namespace cudf diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index f541a6b0e81..d0b695bbc93 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include @@ -241,21 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax) CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected); } -TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::split(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::rsplit(zero_size_strings_column); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); - results = cudf::strings::split_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(results->num_columns() == 1); - EXPECT_TRUE(results->num_rows() == 0); -} - TEST_F(StringsSplitTest, SplitRecord) { std::vector h_strings{" Héllo thesé", nullptr, "are some ", "tést String", ""}; @@ -331,41 +316,30 @@ TEST_F(StringsSplitTest, SplitRegex) { auto result = cudf::strings::split_re(sv, "\\s+"); - cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, 
validity); cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0}); auto expected = cudf::table_view({col0, col1, col2}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } { auto result = cudf::strings::split_re(sv, "[eé]"); - cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity); cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""}, {1, 0, 1, 1, 0}); cudf::test::strings_column_wrapper col2({"s", "", " ", "", ""}, {1, 0, 1, 0, 0}); cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0}); auto expected = cudf::table_view({col0, col1, col2, col3}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); - } -} - -TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) -{ - std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); - auto sv = cudf::strings_column_view(input); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); - auto result = cudf::strings::split_re(sv, "\\s+", 1); - - cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1}); - cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, - {1, 0, 1, 1, 0}); - auto expected = cudf::table_view({col0, col1}); - CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + result = cudf::strings::rsplit_re(sv, "[eé]"); + 
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } } TEST_F(StringsSplitTest, SplitRecordRegex) @@ -376,30 +350,60 @@ TEST_F(StringsSplitTest, SplitRecordRegex) cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); - auto result = cudf::strings::split_record_re(sv, "[eé]"); - using LCW = cudf::test::lists_column_wrapper; - LCW expected( - {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", " "}, LCW{"t", "st String"}, LCW{""}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + { + auto result = cudf::strings::split_record_re(sv, "\\s+"); + + LCW expected( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } + + { + auto result = cudf::strings::split_record_re(sv, "[eé]"); + + LCW expected({LCW{" H", "llo th", "s", ""}, + LCW{}, + LCW{"ar", " som", " "}, + LCW{"t", "st String"}, + LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + result = cudf::strings::rsplit_record_re(sv, "[eé]"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } } -TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit) +TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) { std::vector h_strings{" Héllo\tthesé", nullptr, "are\nsome ", "tést\rString", ""}; auto validity = thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); auto sv = cudf::strings_column_view(input); + { + auto result = cudf::strings::split_re(sv, "\\s+", 1); - auto result = cudf::strings::split_record_re(sv, "\\s", 1); + cudf::test::strings_column_wrapper col0({"", "", "are", "tést", 
""}, {1, 0, 1, 1, 1}); + cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some ", "String", ""}, + {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::split_record_re(sv, "\\s", 1); - using LCW = cudf::test::lists_column_wrapper; - LCW expected( - {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, + validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } } TEST_F(StringsSplitTest, RSplitRecord) @@ -493,16 +497,58 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected); } -TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns) +TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) +{ + std::vector h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""}; + auto validity = + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity); + auto sv = cudf::strings_column_view(input); + + { + auto result = cudf::strings::rsplit_re(sv, "\\s+", 1); + + cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity); + cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0}); + auto expected = cudf::table_view({col0, col1}); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + } + { + auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1); + + using LCW = cudf::test::lists_column_wrapper; + LCW expected( + {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, 
+ validity); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + } +} + +TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto results = cudf::strings::split(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit(zero_size_strings_column); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::split_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(results->num_columns() == 1); + EXPECT_TRUE(results->num_rows() == 0); + auto result = cudf::strings::split_record(zero_size_strings_column); EXPECT_TRUE(result->size() == 0); result = cudf::strings::rsplit_record(zero_size_strings_column); EXPECT_TRUE(result->size() == 0); result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); EXPECT_TRUE(result->size() == 0); + result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(result->size() == 0); } // This test specifically for https://github.com/rapidsai/custrings/issues/119 @@ -526,6 +572,20 @@ TEST_F(StringsSplitTest, AllNullsCase) results = cudf::strings::split_re(sv, "-"); EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + results = cudf::strings::rsplit_re(sv, "-"); + EXPECT_TRUE(results->num_columns() == 1); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); + + auto result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + 
result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } TEST_F(StringsSplitTest, Partition) From 7bc451b142c84c2505416ae5d8f2d9d979a1989f Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 17:26:52 -0500 Subject: [PATCH 08/20] remove unneeded if-check --- cpp/src/strings/split/split_re.cu | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9427a900d8d..9dcf7e6f17b 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -64,7 +64,7 @@ struct token_reader_fn { auto const token_offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - token_offset; - auto d_result = d_tokens + token_offset; // store tokens here + auto const d_result = d_tokens + token_offset; // store tokens here size_type token_idx = 0; size_type begin = 0; // characters @@ -72,7 +72,8 @@ struct token_reader_fn { size_type last_pos = 0; // bytes while (prog.find(idx, d_str, begin, end) > 0) { // get the token (characters just before this match) - auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; + auto const token = + string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos}; // store it if we have space if (token_idx < token_count - 1) { d_result[token_idx++] = token; @@ -90,13 +91,10 @@ struct token_reader_fn { } // set the last token to the remainder of the string - if (last_pos <= d_str.size_bytes()) { - d_result[token_idx] = - string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; - } + d_result[token_idx] = 
string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; if (direction == split_direction::BACKWARD) { - // update first entry -- this happens when max-tokens is hit before the end + // update first entry -- this happens when max-tokens is hit before the end of the string auto const first_offset = d_result[0].first ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) @@ -127,11 +125,11 @@ rmm::device_uvector split_utility(column_device_view const& d mutable_column_view& offsets, rmm::cuda_stream_view stream) { - auto d_offsets = offsets.data(); auto const strings_count = d_strings.size(); - auto const begin = thrust::make_counting_iterator(0); - auto const end = thrust::make_counting_iterator(strings_count); + auto const begin = thrust::make_counting_iterator(0); + auto const end = thrust::make_counting_iterator(strings_count); + auto const d_offsets = offsets.data(); // convert match counts to token offsets auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) { @@ -140,7 +138,7 @@ rmm::device_uvector split_utility(column_device_view const& d thrust::transform_exclusive_scan( rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{}); - // the last entry is the total number of tokens to be generated + // the last offset entry is the total number of tokens to be generated auto const total_tokens = cudf::detail::get_value(offsets, strings_count, stream); // generate tokens for each string @@ -204,14 +202,16 @@ std::unique_ptr
split_re(strings_column_view const& input, return std::make_unique
(std::move(results)); } + // create the regex device prog from the given pattern auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); + // count the number of delimiters matched in each string auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); - // get split tokens from the input column + // get the split tokens from the input column auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // the columns_count is the maximum number of tokens for any string in the input column @@ -265,13 +265,15 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); + // create the regex device prog from the given pattern auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); auto d_strings = column_device_view::create(input.parent(), stream); + // count the number of delimiters matched in each string auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto offsets_view = offsets->mutable_view(); - // get split tokens from the input column + // get the split tokens from the input column auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // convert the tokens into one big strings column From 93887b1877733bfc97c29606f8c9a221d8304efb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 31 Jan 2022 17:27:29 -0500 Subject: [PATCH 09/20] add all empty and all null test cases --- cpp/tests/strings/split_tests.cpp | 36 ++++++++++++++++--------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp 
index d0b695bbc93..4650cbc3c44 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -541,14 +541,14 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns) EXPECT_TRUE(results->num_columns() == 1); EXPECT_TRUE(results->num_rows() == 0); - auto result = cudf::strings::split_record(zero_size_strings_column); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::rsplit_record(zero_size_strings_column); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(result->size() == 0); - result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); - EXPECT_TRUE(result->size() == 0); + auto list_result = cudf::strings::split_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record(zero_size_strings_column); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); + list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s"); + EXPECT_TRUE(list_result->size() == 0); } // This test specifically for https://github.com/rapidsai/custrings/issues/119 @@ -576,16 +576,16 @@ TEST_F(StringsSplitTest, AllNullsCase) EXPECT_TRUE(results->num_columns() == 1); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input); - auto result = cudf::strings::split_record(sv); - using LCW = cudf::test::lists_column_wrapper; + auto list_result = cudf::strings::split_record(sv); + using LCW = cudf::test::lists_column_wrapper; LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - result = cudf::strings::rsplit_record(sv); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); - result = cudf::strings::split_record_re(sv, "-"); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), 
expected); - result = cudf::strings::rsplit_record_re(sv, "-"); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record(sv); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::split_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); + list_result = cudf::strings::rsplit_record_re(sv, "-"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected); } TEST_F(StringsSplitTest, Partition) @@ -773,6 +773,8 @@ TEST_F(StringsSplitTest, InvalidParameter) cudf::logic_error); EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error); + EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error); EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)), From c88eeae8727b9c94f05d15c0e9e3e9714107bf39 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Feb 2022 12:34:45 -0500 Subject: [PATCH 10/20] add more maxsplit gtests --- cpp/include/cudf/strings/split/split_re.hpp | 50 +++++++++++---------- cpp/src/strings/split/split_re.cu | 28 +++++++----- cpp/tests/strings/split_tests.cpp | 34 +++++++++++--- 3 files changed, 71 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index cf6d23ccd28..c6dc1e5c697 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -28,18 +28,18 @@ namespace strings { */ /** - * @brief Splits individual strings elements into a table of strings columns + * @brief Splits 
strings elements into a table of strings columns * using a regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in corresponding - * rows in the output table -- `table[col,row] = token[col] of string[row]` - * where `token` is the substring between each delimiter. + * Each element generates a vector of strings that are stored in corresponding + * rows in the output table -- `table[col,row] = token[col] of strings[row]` + * where `token` is a substring between delimiters. * - * The number of elements in the output table will be the same as the number of + * The number of rows in the output table will be the same as the number of * elements in the input column. The resulting number of columns will be the * maximum number of tokens found in any input row. * - * The `pattern` is used to identify the separation points within a string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty string in the @@ -62,7 +62,7 @@ namespace strings { * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -70,25 +70,25 @@ namespace strings { * @return A table of columns of strings. */ std::unique_ptr
split_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a table of strings columns + * @brief Splits strings elements into a table of strings columns * using a regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in corresponding + * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` * where `token` is the substring between each delimiter. * - * The number of elements in the output table will be the same as the number of + * The number of rows in the output table will be the same as the number of * elements in the input column. The resulting number of columns will be the * maximum number of tokens found in any input row. * * Splitting occurs by traversing starting from the end of the input string. - * The `pattern` is used to identify the separation points within the string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the beginning of the string * is reached. * @@ -112,7 +112,7 @@ std::unique_ptr
split_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -120,24 +120,25 @@ std::unique_ptr
split_re( * @return A table of columns of strings. */ std::unique_ptr
rsplit_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. * * Each element generates an array of strings that are stored in an output - * lists column. + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. * * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the * new strings for that row. The resulting number of strings in each row can vary * from 0 to `maxsplit + 1`. * - * The `pattern` is used to identify the separation points within a string + * The `pattern` is used to identify the delimiters within a string * and splitting stops when either `maxsplit` or the end of the string is reached. * * An empty input string will produce a corresponding empty list item output row. @@ -161,7 +162,7 @@ std::unique_ptr
rsplit_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -169,17 +170,18 @@ std::unique_ptr
rsplit_re( * @return Lists column of strings. */ std::unique_ptr split_record_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Splits individual strings elements into a list of strings + * @brief Splits strings elements into a list column of strings * using the given regex pattern to delimit each string. * - * Each element generates an array of strings that are stored in an output - * lists column. + * Each element generates a vector of strings that are stored in an output + * lists column -- `list[row] = [token1, token2, ...] found in input[row]` + * where `token` is a substring between delimiters. * * The number of elements in the output column will be the same as the number of * elements in the input column. Each individual list item will contain the @@ -212,7 +214,7 @@ std::unique_ptr split_record_re( * * @throw cudf:logic_error if `pattern` is empty. * - * @param strings A column of string elements to be split. + * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. * @param maxsplit Maximum number of splits to perform. * Default of -1 indicates all possible splits on each string. @@ -220,7 +222,7 @@ std::unique_ptr split_record_re( * @return Lists column of strings. 
*/ std::unique_ptr rsplit_record_re( - strings_column_view const& strings, + strings_column_view const& input, std::string const& pattern, size_type maxsplit = -1, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 9dcf7e6f17b..dd71533c773 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -48,6 +48,10 @@ enum class split_direction { /** * @brief Identify the tokens from the `idx'th` string element of `d_strings`. + * + * Each string's tokens are stored in the `d_tokens` vector. + * The `d_token_offsets` specifies the output position within `d_tokens` + * for each string. */ template struct token_reader_fn { @@ -118,12 +122,12 @@ struct token_reader_fn { * The offsets for each token in each string on output. * @param stream CUDA stream used for kernel launches. */ -rmm::device_uvector split_utility(column_device_view const& d_strings, - reprog_device& d_prog, - split_direction direction, - size_type max_tokens, - mutable_column_view& offsets, - rmm::cuda_stream_view stream) +rmm::device_uvector generate_tokens(column_device_view const& d_strings, + reprog_device& d_prog, + split_direction direction, + size_type max_tokens, + mutable_column_view& offsets, + rmm::cuda_stream_view stream) { auto const strings_count = d_strings.size(); @@ -165,7 +169,7 @@ rmm::device_uvector split_utility(column_device_view const& d * @brief Returns string pair for the specified column for each string in `d_strings` * * This is used to build the table result of a split. - * Null is returned if the row is null of if the `column_index` is larger + * Null is returned if the row is null or if the `column_index` is larger * than the token count for that string. */ struct tokens_transform_fn { @@ -211,10 +215,10 @@ std::unique_ptr
split_re(strings_column_view const& input, auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); - // get the split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); - // the columns_count is the maximum number of tokens for any string in the input column + // the output column count is the maximum number of tokens generated for any input string auto const columns_count = thrust::transform_reduce( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -273,8 +277,8 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto offsets = count_matches(*d_strings, *d_prog, stream, mr); auto offsets_view = offsets->mutable_view(); - // get the split tokens from the input column - auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + // get the split tokens from the input column; this also converts the counts into offsets + auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp index 4650cbc3c44..f0d7315929b 100644 --- a/cpp/tests/strings/split_tests.cpp +++ b/cpp/tests/strings/split_tests.cpp @@ -322,6 +322,7 @@ TEST_F(StringsSplitTest, SplitRegex) auto expected = cudf::table_view({col0, col1, col2}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_re(sv, "\\s+"); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } @@ -337,6 +338,7 @@ 
TEST_F(StringsSplitTest, SplitRegex) auto expected = cudf::table_view({col0, col1, col2, col3}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_re(sv, "[eé]"); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); } @@ -359,6 +361,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex) validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_record_re(sv, "\\s+"); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } @@ -374,6 +377,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex) validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + // rsplit == split when using default parameters result = cudf::strings::rsplit_record_re(sv, "[eé]"); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } @@ -394,15 +398,31 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit) {1, 0, 1, 1, 0}); auto expected = cudf::table_view({col0, col1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected); + + // split everything is the same output as maxsplit==2 for the test input column here + result = cudf::strings::split_re(sv, "\\s+", 2); + auto expected2 = cudf::strings::split_re(sv, "\\s+"); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view()); } { auto result = cudf::strings::split_record_re(sv, "\\s", 1); using LCW = cudf::test::lists_column_wrapper; - LCW expected( + LCW expected1( {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some "}, LCW{"tést", "String"}, LCW{""}}, validity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1); + + result = cudf::strings::split_record_re(sv, "\\s", 2); + LCW expected2( + {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}}, + validity); + 
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2); + + // split everything is the same output as maxsplit==3 for the test input column here + result = cudf::strings::split_record_re(sv, "\\s", 3); + auto expected0 = cudf::strings::split_record_re(sv, "\\s"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -521,6 +541,11 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit) {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}}, validity); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); + + // split everything is the same output as any maxsplit > 2 for the test input column here + result = cudf::strings::rsplit_record_re(sv, "\\s+", 3); + auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+"); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view()); } } @@ -760,9 +785,8 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns) TEST_F(StringsSplitTest, InvalidParameter) { - std::vector h_strings{"string left intentionally blank"}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); - auto strings_view = cudf::strings_column_view(strings); + cudf::test::strings_column_wrapper input({"string left intentionally blank"}); + auto strings_view = cudf::strings_column_view(input); EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)), cudf::logic_error); EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)), From f17065c7387fcb535d87884ec5d59daa3f5b0d27 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 1 Feb 2022 15:11:27 -0500 Subject: [PATCH 11/20] Add regex parameter to cudf strings split() --- .../cudf/_lib/cpp/strings/split/split.pxd | 26 ++++- python/cudf/cudf/_lib/strings/__init__.py | 4 + python/cudf/cudf/_lib/strings/split/split.pyx | 102 +++++++++++++++++- python/cudf/cudf/core/column/string.py | 44 ++++++-- python/cudf/cudf/tests/test_string.py | 23 ++++ 5 files 
changed, 187 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd b/python/cudf/cudf/_lib/cpp/strings/split/split.pxd index 4a90aa233f0..9d3aa706ff1 100644 --- a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/split/split.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -32,3 +32,27 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ column_view source_strings, string_scalar delimiter, size_type maxsplit) except + + + +cdef extern from "cudf/strings/split/split_re.hpp" namespace \ + "cudf::strings" nogil: + + cdef unique_ptr[table] split_re( + const column_view& source_strings, + const string& pattern, + size_type maxsplit) except + + + cdef unique_ptr[table] rsplit_re( + const column_view& source_strings, + const string& pattern, + size_type maxsplit) except + + + cdef unique_ptr[column] split_record_re( + const column_view& source_strings, + const string& pattern, + size_type maxsplit) except + + + cdef unique_ptr[column] rsplit_record_re( + const column_view& source_strings, + const string& pattern, + size_type maxsplit) except + diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 7911d0eff2a..178d96de0e5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -78,9 +78,13 @@ from cudf._lib.strings.split.partition import partition, rpartition from cudf._lib.strings.split.split import ( rsplit, + rsplit_re, rsplit_record, + rsplit_record_re, split, + split_re, split_record, + split_record_re, ) from cudf._lib.strings.strip import lstrip, rstrip, strip from cudf._lib.strings.substring import get, slice_from, slice_strings diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index 
184b256125c..ce066aa6aec 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -10,9 +10,13 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport string_scalar from cudf._lib.cpp.strings.split.split cimport ( rsplit as cpp_rsplit, + rsplit_re as cpp_rsplit_re, rsplit_record as cpp_rsplit_record, + rsplit_record_re as cpp_rsplit_record_re, split as cpp_split, + split_re as cpp_split_re, split_record as cpp_split_record, + split_record_re as cpp_split_record_re, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -137,3 +141,99 @@ def rsplit_record(Column source_strings, return Column.from_unique_ptr( move(c_result), ) + + +def split_re(Column source_strings, + object pattern, + size_type maxsplit): + """ + Returns data by splitting the `source_strings` + column around the delimiters identified by `pattern`. + """ + cdef unique_ptr[table] c_result + cdef column_view source_view = source_strings.view() + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_split_re( + source_view, + pattern_string, + maxsplit + )) + + return data_from_unique_ptr( + move(c_result), + column_names=range(0, c_result.get()[0].num_columns()) + ) + + +def rsplit_re(Column source_strings, + object pattern, + size_type maxsplit): + """ + Returns data by splitting the `source_strings` + column around the delimiters identified by `pattern`. + The delimiters are searched starting from the end of each string. 
+ """ + cdef unique_ptr[table] c_result + cdef column_view source_view = source_strings.view() + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_rsplit_re( + source_view, + pattern_string, + maxsplit + )) + + return data_from_unique_ptr( + move(c_result), + column_names=range(0, c_result.get()[0].num_columns()) + ) + + +def split_record_re(Column source_strings, + object pattern, + size_type maxsplit): + """ + Returns a Column by splitting the `source_strings` + column around the delimiters identified by `pattern`. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_split_record_re( + source_view, + pattern_string, + maxsplit + )) + + return Column.from_unique_ptr( + move(c_result), + ) + + +def rsplit_record_re(Column source_strings, + object pattern, + size_type maxsplit): + """ + Returns a Column by splitting the `source_strings` + column around the delimiters identified by `pattern`. + The delimiters are searched starting from the end of each string. + """ + cdef unique_ptr[column] c_result + cdef column_view source_view = source_strings.view() + cdef string pattern_string = str(pattern).encode() + + with nogil: + c_result = move(cpp_rsplit_record_re( + source_view, + pattern_string, + maxsplit + )) + + return Column.from_unique_ptr( + move(c_result), + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6467fd39ddd..a60a2b97c52 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2284,7 +2284,11 @@ def get_json_object(self, json_path): return res def split( - self, pat: str = None, n: int = -1, expand: bool = None + self, + pat: str = None, + n: int = -1, + expand: bool = None, + regex: bool = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. 
@@ -2295,8 +2299,9 @@ def split( Parameters ---------- - pat : str, default ' ' (space) - String to split on, does not yet support regular expressions. + pat : str, default None + String or regular expression to split on. If not specified, split + on whitespace. n : int, default -1 (all) Limit number of splits in output. `None`, 0, and -1 will all be interpreted as "all splits". @@ -2307,6 +2312,13 @@ def split( dimensionality. * If ``False``, return Series/Index, containing lists of strings. + regex : bool, default None + Determines if the passed-in pattern is a regular expression: + + * If ``True``, assumes the passed-in pattern is a regular + expression + * If ``False``, treats the pattern as a literal string. + * If pat length is 1, treats pat as a literal string. Returns ------- @@ -2406,27 +2418,39 @@ def split( ) # Pandas treats 0 as all - if n == 0: + if n is None or n == 0: n = -1 if pat is None: pat = "" + if regex and isinstance(pat, re.Pattern): + pat = pat.pattern + + if len(str(pat)) <= 1: + regex = False + if expand: if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - data, index = libstrings.split( - self._column, cudf.Scalar(pat, "str"), n - ) + if regex is True: + data, index = libstrings.split_re(self._column, pat, n) + else: + data, index = libstrings.split( + self._column, cudf.Scalar(pat, "str"), n + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: result_table = cudf.core.frame.Frame(data, index) else: - result_table = libstrings.split_record( - self._column, cudf.Scalar(pat, "str"), n - ) + if regex is True: + result_table = libstrings.split_record_re(self._column, pat, n) + else: + result_table = libstrings.split_record( + self._column, cudf.Scalar(pat, "str"), n + ) return self._return_or_inplace(result_table, expand=expand) diff --git a/python/cudf/cudf/tests/test_string.py 
b/python/cudf/cudf/tests/test_string.py index 75cf2e6c892..b34d705f139 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -961,6 +961,29 @@ def test_string_split(data, pat, n, expand): assert_eq(expect, got) +@pytest.mark.parametrize( + "data", + [ + ["a b", " c ", " d", "e ", "f"], + ["a-b", "-c-", "---d", "e---", "f"], + ["ab", "c", "d", "e", "f"], + [None, None, None, None, None], + ], +) +@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"]) +@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) +@pytest.mark.parametrize("expand", [True, False, None]) +def test_string_split_re(data, pat, n, expand): + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + # Pandas does not support the regex parameter until 1.4.0 + expect = ps.str.split(pat=pat, n=n, expand=expand) + got = gs.str.split(pat=pat, n=n, expand=expand, regex=True) + + assert_eq(expect, got) + + @pytest.mark.parametrize( "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]] ) From eb8c326cd1b2ea1e9f673b6a16e07533ce637f14 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Feb 2022 19:51:00 -0500 Subject: [PATCH 12/20] fix doxygen typo in @throw line --- cpp/include/cudf/strings/split/split_re.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index c6dc1e5c697..d61b802efe9 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -60,7 +60,7 @@ namespace strings { * ["bc def_g", "_bc", "ab cd", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -110,7 +110,7 @@ std::unique_ptr
split_re( * ["g", "bc", "cd", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -160,7 +160,7 @@ std::unique_ptr
rsplit_re( * ["ab", "cd "] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. @@ -212,7 +212,7 @@ std::unique_ptr split_record_re( * ["ab_cd", ""] ] * @endcode * - * @throw cudf:logic_error if `pattern` is empty. + * @throw cudf::logic_error if `pattern` is empty. * * @param input A column of string elements to be split. * @param pattern The regex pattern for delimiting characters within each string. From d6ee8837ff3f523816d96f444e1b001d14debdf7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 7 Feb 2022 19:51:37 -0500 Subject: [PATCH 13/20] refactor max-tokens calculation into helper function --- cpp/src/strings/split/split_re.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index dd71533c773..d80148f2fe6 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -98,7 +98,7 @@ struct token_reader_fn { d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos}; if (direction == split_direction::BACKWARD) { - // update first entry -- this happens when max-tokens is hit before the end of the string + // update first entry -- this happens when max_tokens is hit before the end of the string auto const first_offset = d_result[0].first ? static_cast(thrust::distance(d_str.data(), d_result[0].first)) @@ -117,6 +117,7 @@ struct token_reader_fn { * * @param d_strings Strings to split * @param d_prog Regex to evaluate against each string + * @param direction Whether tokens are generated forwards or backwards. * @param max_tokens The maximum number of tokens for each split. * @param offsets The number of matches on input. * The offsets for each token in each string on output. 
@@ -125,12 +126,14 @@ struct token_reader_fn { rmm::device_uvector generate_tokens(column_device_view const& d_strings, reprog_device& d_prog, split_direction direction, - size_type max_tokens, + size_type maxsplit, mutable_column_view& offsets, rmm::cuda_stream_view stream) { auto const strings_count = d_strings.size(); + auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); + auto const begin = thrust::make_counting_iterator(0); auto const end = thrust::make_counting_iterator(strings_count); auto const d_offsets = offsets.data(); @@ -182,7 +185,7 @@ struct tokens_transform_fn { { auto const offset = d_token_offsets[idx]; auto const token_count = d_token_offsets[idx + 1] - offset; - return (column_index > token_count - 1) || d_strings.is_null(idx) + return (column_index >= token_count) || d_strings.is_null(idx) ? string_index_pair{nullptr, 0} : d_tokens[offset + column_index]; } @@ -197,7 +200,6 @@ std::unique_ptr
split_re(strings_column_view const& input, { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); std::vector> results; @@ -216,7 +218,7 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_offsets = offsets_view.data(); // get the split tokens from the input column; this also converts the counts into offsets - auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); // the output column count is the maximum number of tokens generated for any input string auto const columns_count = thrust::transform_reduce( @@ -266,7 +268,6 @@ std::unique_ptr split_record_re(strings_column_view const& input, { CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits::max(); auto const strings_count = input.size(); // create the regex device prog from the given pattern @@ -278,7 +279,7 @@ std::unique_ptr split_record_re(strings_column_view const& input, auto offsets_view = offsets->mutable_view(); // get the split tokens from the input column; this also converts the counts into offsets - auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream); + auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream); // convert the tokens into one big strings column auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr); From 6094ed9f218d4c104b4ba015f235583abd50baca Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 8 Feb 2022 12:40:24 -0500 Subject: [PATCH 14/20] fix doxygen brief and examples --- cpp/include/cudf/strings/split/split_re.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index d61b802efe9..320d1bdc9b4 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -77,7 +77,7 @@ std::unique_ptr
split_re( /** * @brief Splits strings elements into a table of strings columns - * using a regex pattern to delimit each string. + * using a regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` @@ -146,13 +146,13 @@ std::unique_ptr
rsplit_re( * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = split_record(s, "[_ ]") + * s1 = split_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = split_record(s, "[ _]", 1) + * s2 = split_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a", "bc def_g"], * ["a", "_bc"], @@ -177,7 +177,7 @@ std::unique_ptr split_record_re( /** * @brief Splits strings elements into a list column of strings - * using the given regex pattern to delimit each string. + * using the given regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -198,13 +198,13 @@ std::unique_ptr split_record_re( * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = rsplit_record(s, "[_ ]") + * s1 = rsplit_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = rsplit_record(s, "[ _]", 1) + * s2 = rsplit_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a_bc def", "g"], * ["a_", "bc"], From f647cf0f681493eaa254a3c39cf9e299a19a0589 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 8 Feb 2022 12:42:47 -0500 Subject: [PATCH 15/20] fix doxygen brief and examples --- cpp/include/cudf/strings/split/split_re.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp index d61b802efe9..320d1bdc9b4 100644 --- a/cpp/include/cudf/strings/split/split_re.hpp +++ b/cpp/include/cudf/strings/split/split_re.hpp @@ -77,7 +77,7 @@ std::unique_ptr
split_re( /** * @brief Splits strings elements into a table of strings columns - * using a regex pattern to delimit each string. + * using a regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in corresponding * rows in the output table -- `table[col,row] = token[col] of string[row]` @@ -146,13 +146,13 @@ std::unique_ptr
rsplit_re( * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = split_record(s, "[_ ]") + * s1 = split_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = split_record(s, "[ _]", 1) + * s2 = split_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a", "bc def_g"], * ["a", "_bc"], @@ -177,7 +177,7 @@ std::unique_ptr split_record_re( /** * @brief Splits strings elements into a list column of strings - * using the given regex pattern to delimit each string. + * using the given regex pattern to delimit each string starting from the end of the string. * * Each element generates a vector of strings that are stored in an output * lists column -- `list[row] = [token1, token2, ...] found in input[row]` @@ -198,13 +198,13 @@ std::unique_ptr split_record_re( * * @code{.pseudo} * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] - * s1 = rsplit_record(s, "[_ ]") + * s1 = rsplit_record_re(s, "[_ ]") * s1 is a lists column of strings: * [ ["a", "bc", "def", "g"], * ["a", "", "bc"], * ["", "ab", "cd"], * ["ab", "cd", ""] ] - * s2 = rsplit_record(s, "[ _]", 1) + * s2 = rsplit_record_re(s, "[ _]", 1) * s2 is a lists column of strings: * [ ["a_bc def", "g"], * ["a_", "bc"], From 7394e74a0726273bd9de3825cb4a1d30fd034e49 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Feb 2022 10:22:53 -0500 Subject: [PATCH 16/20] add rsplit regex interface --- python/cudf/cudf/core/column/string.py | 40 ++++++++++++++++++++------ python/cudf/cudf/tests/test_string.py | 15 ++++++++++ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ebfa1a640de..84fc497ef89 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -2300,7 +2300,7 @@ def split( Split strings around given separator/delimiter. 
Splits the string in the Series/Index from the beginning, at the - specified delimiter string. Equivalent to `str.split() + specified delimiter string. Similar to `str.split() `_. Parameters @@ -2461,13 +2461,17 @@ def split( return self._return_or_inplace(result_table, expand=expand) def rsplit( - self, pat: str = None, n: int = -1, expand: bool = None + self, + pat: str = None, + n: int = -1, + expand: bool = None, + regex: bool = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. Splits the string in the Series/Index from the end, at the - specified delimiter string. Equivalent to `str.rsplit() + specified delimiter string. Similar to `str.rsplit() `_. Parameters @@ -2484,6 +2488,13 @@ def rsplit( dimensionality. * If ``False``, return Series/Index, containing lists of strings. + regex : bool, default None + Determines if the passed-in pattern is a regular expression: + + * If ``True``, assumes the passed-in pattern is a regular + expression + * If ``False``, treats the pattern as a literal string. + * If pat length is 1, treats pat as a literal string. 
Returns ------- @@ -2598,21 +2609,32 @@ def rsplit( if pat is None: pat = "" + if regex and isinstance(pat, re.Pattern): + pat = pat.pattern + if expand: if self._column.null_count == len(self._column): result_table = cudf.core.frame.Frame({0: self._column.copy()}) else: - data, index = libstrings.rsplit( - self._column, cudf.Scalar(pat, "str"), n - ) + if regex is True: + data, index = libstrings.rsplit_re(self._column, pat, n) + else: + data, index = libstrings.rsplit( + self._column, cudf.Scalar(pat, "str"), n + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = cudf.core.frame.Frame({}) else: result_table = cudf.core.frame.Frame(data, index) else: - result_table = libstrings.rsplit_record( - self._column, cudf.Scalar(pat), n - ) + if regex is True: + result_table = libstrings.rsplit_record_re( + self._column, pat, n + ) + else: + result_table = libstrings.rsplit_record( + self._column, cudf.Scalar(pat), n + ) return self._return_or_inplace(result_table, expand=expand) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 1e7f8bfb7b0..753d8af7c73 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1535,6 +1535,21 @@ def test_strings_rsplit(data, n, expand): ) +@pytest.mark.parametrize( + "data", [["a b", " c ", " d", "e ", "f"]], +) +@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) +@pytest.mark.parametrize("expand", [True, False, None]) +def test_string_rsplit_re(data, n, expand): + ps = pd.Series(data, dtype="str") + gs = cudf.Series(data, dtype="str") + + # Pandas does not support the regex parameter until 1.4.0 + expect = ps.str.rsplit(pat=" ", n=n, expand=expand) + got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) + assert_eq(expect, got) + + @pytest.mark.parametrize( "data", [ From 3ff72abd8017614afeaecc7e78825aa7b68211d0 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Feb 2022 16:19:33 -0500 Subject: [PATCH 17/20] 
update copyright header in init.py --- python/cudf/cudf/_lib/strings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 178d96de0e5..fe0710504db 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, From a518415bfd9c4060c61c447addab712688722a19 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 14 Feb 2022 18:25:45 -0500 Subject: [PATCH 18/20] correct copyright year in init.py --- python/cudf/cudf/_lib/strings/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index fe0710504db..9fccd61c82d 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix from cudf._lib.nvtext.generate_ngrams import ( generate_character_ngrams, From 75f20d57c421c2893ef84b942609acf3b0075dc8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 15 Feb 2022 18:14:27 -0500 Subject: [PATCH 19/20] add PANDAS_LT_140 check in rsplit test --- python/cudf/cudf/tests/test_string.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 753d8af7c73..36351bfd1c3 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1545,7 +1545,13 @@ def test_string_rsplit_re(data, n, expand): gs = cudf.Series(data, dtype="str") # Pandas does not support the regex parameter until 1.4.0 - expect = ps.str.rsplit(pat=" ", n=n, expand=expand) + from cudf.core._compat import PANDAS_LT_140 + + if PANDAS_LT_140: + expect = ps.str.rsplit(pat=" ", n=n, expand=expand) + else: + expect = ps.str.rsplit(pat="\\s", n=n, regex=True) + got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) assert_eq(expect, got) From ba4c8a214da531e6b57989861ba814dc9fb1132e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 17 Feb 2022 08:46:54 -0500 Subject: [PATCH 20/20] add inspect.signature check for rsplit regex parameter --- python/cudf/cudf/tests/test_string.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 36351bfd1c3..5ee14d8132b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1535,23 +1535,22 @@ def test_strings_rsplit(data, n, expand): ) -@pytest.mark.parametrize( - "data", [["a b", " c ", " d", "e ", "f"]], -) @pytest.mark.parametrize("n", [-1, 0, 1, 3, 10]) @pytest.mark.parametrize("expand", [True, False, None]) -def test_string_rsplit_re(data, n, expand): +def 
test_string_rsplit_re(n, expand): + data = ["a b", " c ", " d", "e ", "f"] ps = pd.Series(data, dtype="str") gs = cudf.Series(data, dtype="str") - # Pandas does not support the regex parameter until 1.4.0 - from cudf.core._compat import PANDAS_LT_140 + # Pandas does not yet support the regex parameter for rsplit + import inspect - if PANDAS_LT_140: - expect = ps.str.rsplit(pat=" ", n=n, expand=expand) - else: - expect = ps.str.rsplit(pat="\\s", n=n, regex=True) + assert ( + "regex" + not in inspect.signature(pd.Series.str.rsplit).parameters.keys() + ) + expect = ps.str.rsplit(pat=" ", n=n, expand=expand) got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True) assert_eq(expect, got)