From eaba42e4d7631302c81c4caf2f3d29fb24f3c45d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 25 Jan 2022 19:47:19 -0500
Subject: [PATCH 01/20] Add libcudf strings split API that accepts regex
 pattern

---
 conda/recipes/libcudf/meta.yaml             |   1 +
 cpp/CMakeLists.txt                          |   1 +
 cpp/include/cudf/strings/split/split_re.hpp |  82 ++++++++
 cpp/src/strings/split/split_record_re.cu    | 215 ++++++++++++++++++++
 cpp/tests/strings/split_tests.cpp           |  37 +++-
 5 files changed, 335 insertions(+), 1 deletion(-)
 create mode 100644 cpp/include/cudf/strings/split/split_re.hpp
 create mode 100644 cpp/src/strings/split/split_record_re.cu

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 2cbe5173de0..01ad8d4e270 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -214,6 +214,7 @@ test:
     - test -f $PREFIX/include/cudf/strings/replace_re.hpp
     - test -f $PREFIX/include/cudf/strings/split/partition.hpp
     - test -f $PREFIX/include/cudf/strings/split/split.hpp
+    - test -f $PREFIX/include/cudf/strings/split/split_re.hpp
     - test -f $PREFIX/include/cudf/strings/string_view.hpp
     - test -f $PREFIX/include/cudf/strings/strings_column_view.hpp
     - test -f $PREFIX/include/cudf/strings/strip.hpp
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e4637408110..b25d6ff3703 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -444,6 +444,7 @@ add_library(
   src/strings/split/partition.cu
   src/strings/split/split.cu
   src/strings/split/split_record.cu
+  src/strings/split/split_record_re.cu
   src/strings/strings_column_factories.cu
   src/strings/strings_column_view.cpp
   src/strings/strings_scalar_factories.cpp
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
new file mode 100644
index 00000000000..b69bd1c5991
--- /dev/null
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+namespace cudf {
+namespace strings {
+/**
+ * @addtogroup strings_split
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using a regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null list item output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_record(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = split_record(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc_def_g"],
+ *       ["a", "_bc"],
+ *       ["", "ab_cd"],
+ *       ["ab", "cd_"] ]
+ * @endcode
+ *
+ * @throw cudf:logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings
+ *         Each vector of the lists column holds splits from a single row
+ *         element of the input column.
+ */
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of doxygen group
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
new file mode 100644
index 00000000000..d197ee9c7e3
--- /dev/null
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <strings/regex/regex.cuh>
+#include <strings/utilities.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/strings/split/split_re.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/scan.h>
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+
+using string_index_pair = thrust::pair<const char*, size_type>;
+
+namespace {
+
+/**
+ * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
+ */
+template <int stack_size>
+struct token_counter_fn {
+  column_device_view const d_strings;  // strings to split
+  reprog_device prog;
+  size_type const max_tokens;
+
+  __device__ size_type operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return 0; }
+
+    auto const d_str      = d_strings.element<string_view>(idx);
+    size_type token_count = 0;
+
+    int32_t begin = 0;
+    int32_t end   = -1;
+    while (token_count < max_tokens - 1) {
+      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
+      token_count++;
+      begin = end + (begin == end);
+      end   = -1;
+    }
+    return token_count + 1;  // always at least one token
+  }
+};
+
+/**
+ * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
+ */
+template <int stack_size>
+struct token_reader_fn {
+  column_device_view const d_strings;
+  reprog_device prog;
+  int32_t const* d_token_offsets;
+  string_index_pair* d_tokens;
+
+  __device__ void operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) { return; }
+
+    auto const token_offset = d_token_offsets[idx];
+    auto const token_count  = d_token_offsets[idx + 1] - token_offset;
+    auto d_result           = d_tokens + token_offset;
+    auto const d_str        = d_strings.element<string_view>(idx);
+    if (d_str.empty()) {
+      // return empty string output for empty string input
+      *d_result = string_index_pair{"", 0};
+      return;
+    }
+
+    size_type token_idx = 0;
+    size_type begin     = 0;
+    size_type end       = d_str.length();
+    size_type last_pos  = 0;
+    while (token_idx < token_count - 1) {
+      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
+
+      auto const start_pos = d_str.byte_offset(begin);
+      auto const end_pos   = d_str.byte_offset(end);
+      d_result[token_idx]  = string_index_pair{d_str.data() + last_pos, start_pos - last_pos};
+
+      begin = end + (begin == end);
+      end   = d_str.length();
+      token_idx++;
+      last_pos = end_pos;
+    }
+
+    // set last token to remainder of the string
+    if (last_pos <= d_str.size_bytes()) {
+      d_result[token_idx] =
+        string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
+    }
+  }
+};
+
+}  // namespace
+
+// The output is one list item per string
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& input,
+  std::string const& pattern,
+  size_type maxsplit,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets = make_numeric_column(
+    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+  auto d_offsets = offsets->mutable_view().data<int32_t>();
+
+  auto const begin = thrust::make_counting_iterator<size_type>(0);
+  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+
+  // create offsets column by counting the number of tokens per string
+  auto const regex_insts = d_prog->insts_counts();
+  if (regex_insts <= RX_SMALL_INSTS) {
+    token_counter_fn<RX_STACK_SMALL> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    token_counter_fn<RX_STACK_MEDIUM> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    token_counter_fn<RX_STACK_LARGE> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  } else {
+    token_counter_fn<RX_STACK_ANY> counter{*d_strings, *d_prog, max_tokens};
+    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
+  }
+  // convert counts into offsets
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
+
+  // last entry is the total number of tokens to be generated
+  auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
+
+  printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n",
+         regex_insts,
+         total_tokens,
+         *begin,
+         *end);
+  // split each string into an array of index-pair values
+  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
+  if (regex_insts <= RX_SMALL_INSTS) {
+    token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else if (regex_insts <= RX_MEDIUM_INSTS) {
+    token_reader_fn<RX_STACK_MEDIUM> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else if (regex_insts <= RX_LARGE_INSTS) {
+    token_reader_fn<RX_STACK_LARGE> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  } else {
+    token_reader_fn<RX_STACK_ANY> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
+  }
+
+  // convert the index-pairs into one big strings column
+  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+  // create a lists column using the offsets and the strings columns
+  return make_lists_column(strings_count,
+                           std::move(offsets),
+                           std::move(strings_output),
+                           input.null_count(),
+                           copy_bitmask(input.parent(), stream, mr),
+                           stream,
+                           mr);
+}
+
+}  // namespace detail
+
+// external APIs
+
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index de4e48fd70a..7eddc947d40 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/split/partition.hpp>
 #include <cudf/strings/split/split.hpp>
+#include <cudf/strings/split/split_re.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 
@@ -339,6 +340,40 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRecordRegex)
+{
+  std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_record_re(sv, "[eé]");
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected(
+    {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", "  "}, LCW{"t", "st String"}, LCW{""}},
+    validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
+TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_record_re(sv, "\\s", 1);
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected(
+    {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
+    validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, RSplitRecord)
 {
   std::vector<const char*> h_strings{

From a83243646975f0c398910bb0cb80affa40c214bf Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 26 Jan 2022 11:25:18 -0500
Subject: [PATCH 02/20] add error-checking gtests

---
 cpp/tests/strings/split_tests.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 7eddc947d40..badb84536ba 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -654,6 +654,11 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_record(strings_view, cudf::string_scalar("", false)),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)),
+               cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)),

From d33f79bb9c89d014a4e6a374067e8a88c366aafa Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 10:49:01 -0500
Subject: [PATCH 03/20] use count_matches utility

---
 cpp/src/strings/split/split_record_re.cu | 63 ++++++++----------------
 1 file changed, 21 insertions(+), 42 deletions(-)

diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
index d197ee9c7e3..eff0c511393 100644
--- a/cpp/src/strings/split/split_record_re.cu
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <strings/count_matches.hpp>
 #include <strings/regex/regex.cuh>
 #include <strings/utilities.hpp>
 
@@ -42,30 +43,23 @@ using string_index_pair = thrust::pair<const char*, size_type>;
 namespace {
 
 /**
- * @brief Compute the number of tokens for the `idx'th` string element of `d_strings`.
+ * @brief Convert match counts to token counts.
+ *
+ * The matches are the delimiters and the tokens are what is left:
+ * `token1, delimiter, token2, delimiter, token3, etc`
+ * Usually `token_count = match_count + 1` even with empty strings.
+ * However, we need to account for the max_tokens and null rows.
  */
-template <int stack_size>
-struct token_counter_fn {
-  column_device_view const d_strings;  // strings to split
-  reprog_device prog;
+struct match_to_token_count_fn {
+  column_device_view const d_strings;
+  size_type const* d_counts;
   size_type const max_tokens;
 
   __device__ size_type operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) { return 0; }
-
-    auto const d_str      = d_strings.element<string_view>(idx);
-    size_type token_count = 0;
-
-    int32_t begin = 0;
-    int32_t end   = -1;
-    while (token_count < max_tokens - 1) {
-      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
-      token_count++;
-      begin = end + (begin == end);
-      end   = -1;
-    }
-    return token_count + 1;  // always at least one token
+    auto const match_count = d_counts[idx];
+    return std::min(match_count, max_tokens) + 1;
   }
 };
 
@@ -130,34 +124,23 @@ std::unique_ptr<column> split_record_re(
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
-  auto offsets = make_numeric_column(
-    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+  auto offsets   = count_matches(*d_strings, *d_prog, stream, mr);
   auto d_offsets = offsets->mutable_view().data<int32_t>();
 
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
-
-  // create offsets column by counting the number of tokens per string
-  auto const regex_insts = d_prog->insts_counts();
-  if (regex_insts <= RX_SMALL_INSTS) {
-    token_counter_fn<RX_STACK_SMALL> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_counter_fn<RX_STACK_MEDIUM> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_counter_fn<RX_STACK_LARGE> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  } else {
-    token_counter_fn<RX_STACK_ANY> counter{*d_strings, *d_prog, max_tokens};
-    thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, counter);
-  }
+  // convert match counts to tokens
+  thrust::transform(rmm::exec_policy(stream),
+                    begin,
+                    end,
+                    d_offsets,
+                    match_to_token_count_fn{*d_strings, d_offsets, max_tokens});
   // convert counts into offsets
   thrust::exclusive_scan(
     rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
@@ -165,13 +148,9 @@ std::unique_ptr<column> split_record_re(
   // last entry is the total number of tokens to be generated
   auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
 
-  printf("instruction = %d\ntotal_tokens = %d\nbegin,end = %d,%d\n",
-         regex_insts,
-         total_tokens,
-         *begin,
-         *end);
   // split each string into an array of index-pair values
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
+  auto const regex_insts = d_prog->insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
     token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);

From 9c74fdffbc2c3ddc7e4a248a4c837996c8c25bf4 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 10:49:16 -0500
Subject: [PATCH 04/20] add split_re declaration

---
 cpp/include/cudf/strings/split/split_re.hpp | 62 +++++++++++++++++++--
 1 file changed, 58 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index b69bd1c5991..54d590fcf71 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
 
 namespace cudf {
 namespace strings {
@@ -26,6 +27,59 @@ namespace strings {
  * @file
  */
 
+/**
+ * @brief Splits individual strings elements into a table of strings columns
+ * using a regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in corresponding
+ * rows in the output table.
+ *
+ * The number of rows in the output table will be the same as the number of
+ * elements in the input column. Each row of the table will contain the
+ * new strings produced from the corresponding input row.
+ *
+ * The resulting number of columns will be the maximum number of tokens found
+ * in any input row.
+ *
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row in the first column.
+ *
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = split_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = split_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc def_g", "_bc", "ab_cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of strings columns.
+ *         Each column holds the tokens at one position, and each row holds
+ *         the splits from the corresponding element of the input column.
+ */
+std::unique_ptr<table> split_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Splits individual strings elements into a list of strings
  * using a regex pattern to delimit each string.
@@ -54,10 +108,10 @@ namespace strings {
  *       ["ab", "cd", ""] ]
  * s2 = split_record(s, "[ _]", 1)
  * s2 is a lists column of strings:
- *     [ ["a", "bc_def_g"],
+ *     [ ["a", "bc def_g"],
  *       ["a", "_bc"],
- *       ["", "ab_cd"],
- *       ["ab", "cd_"] ]
+ *       ["", "ab cd"],
+ *       ["ab", "cd "] ]
  * @endcode
  *
  * @throw cudf:logic_error if `pattern` is empty.

From 1a89db5f53ed21952183d8ab9f2d4e6e800b1175 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 27 Jan 2022 18:01:11 -0500
Subject: [PATCH 05/20] split_re implementation and tests

---
 cpp/src/strings/split/split_record_re.cu | 193 ++++++++++++++++++-----
 cpp/tests/strings/split_tests.cpp        | 110 +++++++++----
 2 files changed, 235 insertions(+), 68 deletions(-)

diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_record_re.cu
index eff0c511393..3f916d0138f 100644
--- a/cpp/src/strings/split/split_record_re.cu
+++ b/cpp/src/strings/split/split_record_re.cu
@@ -22,6 +22,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/split/split_re.hpp>
@@ -70,7 +71,7 @@ template <int stack_size>
 struct token_reader_fn {
   column_device_view const d_strings;
   reprog_device prog;
-  int32_t const* d_token_offsets;
+  offset_type const* d_token_offsets;
   string_index_pair* d_tokens;
 
   __device__ void operator()(size_type idx)
@@ -81,11 +82,6 @@ struct token_reader_fn {
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
     auto d_result           = d_tokens + token_offset;
     auto const d_str        = d_strings.element<string_view>(idx);
-    if (d_str.empty()) {
-      // return empty string output for empty string input
-      *d_result = string_index_pair{"", 0};
-      return;
-    }
 
     size_type token_idx = 0;
     size_type begin     = 0;
@@ -112,61 +108,105 @@ struct token_reader_fn {
   }
 };
 
-}  // namespace
-
-// The output is one list item per string
-std::unique_ptr<column> split_record_re(
-  strings_column_view const& input,
-  std::string const& pattern,
-  size_type maxsplit,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
-{
-  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
-
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
-  auto const strings_count = input.size();
+struct tokens_transform_fn {
+  column_device_view const d_strings;
+  string_index_pair const* d_tokens;
+  offset_type const* d_token_offsets;
+  size_type const column_index;
 
-  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
-  auto d_strings = column_device_view::create(input.parent(), stream);
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto const offset      = d_token_offsets[idx];
+    auto const token_count = d_token_offsets[idx + 1] - offset;
+    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+    if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; }
+    return d_tokens[offset + column_index];
+  }
+};
 
-  auto offsets   = count_matches(*d_strings, *d_prog, stream, mr);
-  auto d_offsets = offsets->mutable_view().data<int32_t>();
+/**
+ * @brief Call regex to split each input string into tokens.
+ *
+ * This will also convert the `offsets` values from counts to offsets.
+ *
+ * @param d_strings Strings to split
+ * @param d_prog Regex to evaluate against each string
+ * @param max_tokens The maximum number of tokens for each split.
+ * @param offsets The number of matches on input.
+ *                The offsets for each token in each string on output.
+ * @param stream CUDA stream used for kernel launches.
+ */
+rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
+                                                     reprog_device& d_prog,
+                                                     size_type max_tokens,
+                                                     mutable_column_view& offsets,
+                                                     rmm::cuda_stream_view stream)
+{
+  auto d_offsets           = offsets.data<offset_type>();
+  auto const strings_count = d_strings.size();
 
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+
   // convert match counts to tokens
-  thrust::transform(rmm::exec_policy(stream),
-                    begin,
-                    end,
-                    d_offsets,
-                    match_to_token_count_fn{*d_strings, d_offsets, max_tokens});
+  match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens};
+  thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn);
+
   // convert counts into offsets
-  thrust::exclusive_scan(
-    rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets);
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         offsets.begin<offset_type>(),
+                         offsets.end<offset_type>(),
+                         offsets.begin<offset_type>());
 
-  // last entry is the total number of tokens to be generated
-  auto total_tokens = cudf::detail::get_value<int32_t>(offsets->view(), strings_count, stream);
+  // the last entry is the total number of tokens to be generated
+  auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
 
-  // split each string into an array of index-pair values
+  // generate tokens for each string
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
-  auto const regex_insts = d_prog->insts_counts();
+  auto const regex_insts = d_prog.insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
-    token_reader_fn<RX_STACK_SMALL> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_reader_fn<RX_STACK_MEDIUM> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_reader_fn<RX_STACK_LARGE> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else {
-    token_reader_fn<RX_STACK_ANY> reader{*d_strings, *d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   }
 
-  // convert the index-pairs into one big strings column
+  return tokens;
+}
+
+}  // namespace
+
+// The output is one list item per string
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+
+  // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+
   // create a lists column using the offsets and the strings columns
   return make_lists_column(strings_count,
                            std::move(offsets),
@@ -177,10 +217,83 @@ std::unique_ptr<column> split_record_re(
                            mr);
 }
 
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  std::vector<std::unique_ptr<column>> results;
+  if (strings_count == 0) {
+    results.push_back(make_empty_column(type_id::STRING));
+    return std::make_unique<table>(std::move(results));
+  }
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+
+  // the columns_count is the maximum number of tokens for any string in the input column
+  auto const begin = thrust::make_counting_iterator<size_type>(0);
+  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+  auto d_offsets   = offsets_view.data<offset_type>();
+  auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type {
+    return d_offsets[idx + 1] - d_offsets[idx];
+  };
+  auto const columns_count = thrust::transform_reduce(
+    rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum<size_type>{});
+
+  // boundary case: if no columns, return one all-null column (custrings issue #119)
+  if (columns_count == 0) {
+    results.push_back(std::make_unique<column>(
+      data_type{type_id::STRING},
+      strings_count,
+      rmm::device_buffer{0, stream, mr},  // no data
+      cudf::detail::create_null_mask(strings_count, mask_state::ALL_NULL, stream, mr),
+      strings_count));
+    return std::make_unique<table>(std::move(results));
+  }
+
+  // convert the tokens into multiple strings columns
+  auto make_strings_lambda = [&](size_type column_index) {
+    // returns appropriate token for each row/column
+    auto indices_itr = cudf::detail::make_counting_transform_iterator(
+      0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index});
+    return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr);
+  };
+  // create each column of tokens
+  results.resize(columns_count);
+  std::transform(thrust::make_counting_iterator<size_type>(0),
+                 thrust::make_counting_iterator<size_type>(columns_count),
+                 results.begin(),
+                 make_strings_lambda);
+
+  return std::make_unique<table>(std::move(results));
+}
+
 }  // namespace detail
 
 // external APIs
 
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::split_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
 std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         std::string const& pattern,
                                         size_type maxsplit,
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index badb84536ba..f541a6b0e81 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -247,33 +247,13 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
   auto results = cudf::strings::split(zero_size_strings_column);
   EXPECT_TRUE(results->num_columns() == 1);
-  cudf::test::expect_strings_empty(results->get_column(0));
+  EXPECT_TRUE(results->num_rows() == 0);
   results = cudf::strings::rsplit(zero_size_strings_column);
   EXPECT_TRUE(results->num_columns() == 1);
-  cudf::test::expect_strings_empty(results->get_column(0));
-}
-
-// This test specifically for https://github.com/rapidsai/custrings/issues/119
-TEST_F(StringsSplitTest, AllNullsCase)
-{
-  std::vector<const char*> h_strings{nullptr, nullptr, nullptr};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-
-  auto results = cudf::strings::split(cudf::strings_column_view(strings));
-  EXPECT_TRUE(results->num_columns() == 1);
-  auto column = results->get_column(0).view();
-  EXPECT_TRUE(column.size() == 3);
-  EXPECT_TRUE(column.has_nulls());
-  EXPECT_TRUE(column.null_count() == column.size());
-  results = cudf::strings::split(cudf::strings_column_view(strings), cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
   EXPECT_TRUE(results->num_columns() == 1);
-  column = results->get_column(0);
-  EXPECT_TRUE(column.size() == 3);
-  EXPECT_TRUE(column.has_nulls());
-  EXPECT_TRUE(column.null_count() == column.size());
+  EXPECT_TRUE(results->num_rows() == 0);
 }
 
 TEST_F(StringsSplitTest, SplitRecord)
@@ -340,6 +320,54 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRegex)
+{
+  std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  {
+    auto result = cudf::strings::split_re(sv, "\\s+");
+
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0});
+    cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0});
+    auto expected = cudf::table_view({col0, col1, col2});
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+  }
+
+  {
+    auto result = cudf::strings::split_re(sv, "[eé]");
+
+    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""},
+                                            {1, 0, 1, 1, 0});
+    cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""}, {1, 0, 1, 0, 0});
+    cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0});
+    auto expected = cudf::table_view({col0, col1, col2, col3});
+    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+  }
+}
+
+TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  auto result = cudf::strings::split_re(sv, "\\s+", 1);
+
+  cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+  cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
+                                          {1, 0, 1, 1, 0});
+  auto expected = cudf::table_view({col0, col1});
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, SplitRecordRegex)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
@@ -469,10 +497,35 @@ TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
 {
   cudf::column_view zero_size_strings_column(
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
-  auto split_record_result = cudf::strings::split_record(zero_size_strings_column);
-  EXPECT_TRUE(split_record_result->size() == 0);
-  auto rsplit_record_result = cudf::strings::rsplit_record(zero_size_strings_column);
-  EXPECT_TRUE(rsplit_record_result->size() == 0);
+  auto result = cudf::strings::split_record(zero_size_strings_column);
+  EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::rsplit_record(zero_size_strings_column);
+  EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(result->size() == 0);
+}
+
+// This test specifically for https://github.com/rapidsai/custrings/issues/119
+TEST_F(StringsSplitTest, AllNullsCase)
+{
+  cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0});
+  auto sv = cudf::strings_column_view(input);
+
+  auto results = cudf::strings::split(sv);
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::split(sv, cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit(sv);
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit(sv, cudf::string_scalar("-"));
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::split_re(sv, "-");
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
 }
 
 TEST_F(StringsSplitTest, Partition)
@@ -658,6 +711,7 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit_record(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
+  EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);

From 8599d0cba24ef963c28361455769969a3764a430 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 28 Jan 2022 08:16:31 -0500
Subject: [PATCH 06/20] rename split_record_re.cu to split_re.cu

---
 cpp/CMakeLists.txt                                        | 2 +-
 cpp/src/strings/split/{split_record_re.cu => split_re.cu} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename cpp/src/strings/split/{split_record_re.cu => split_re.cu} (100%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f96ef4945b9..407e1f9a858 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -445,8 +445,8 @@ add_library(
   src/strings/search/find_multiple.cu
   src/strings/split/partition.cu
   src/strings/split/split.cu
+  src/strings/split/split_re.cu
   src/strings/split/split_record.cu
-  src/strings/split/split_record_re.cu
   src/strings/strings_column_factories.cu
   src/strings/strings_column_view.cpp
   src/strings/strings_scalar_factories.cpp
diff --git a/cpp/src/strings/split/split_record_re.cu b/cpp/src/strings/split/split_re.cu
similarity index 100%
rename from cpp/src/strings/split/split_record_re.cu
rename to cpp/src/strings/split/split_re.cu

From b6d7453b66c3548e4e47499de66e7eae0fa0b2fb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 14:05:56 -0500
Subject: [PATCH 07/20] refactored split_re/rsplit_re functions

---
 cpp/include/cudf/strings/split/split_re.hpp | 130 +++++++--
 cpp/src/strings/split/split_re.cu           | 281 ++++++++++++--------
 cpp/tests/strings/split_tests.cpp           | 162 +++++++----
 3 files changed, 394 insertions(+), 179 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 54d590fcf71..cf6d23ccd28 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -32,22 +32,19 @@ namespace strings {
  * using a regex pattern to delimit each string.
  *
  * Each element generates an array of strings that are stored in corresponding
- * rows in the output table.
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
  *
  * The number of elements in the output table will be the same as the number of
- * elements in the input column. The row for each column will contain the
- * new strings produced from that input row.
- *
- * The resulting number of columns will be the maximum number of tokens found
- * in any input row.
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
  *
  * The `pattern` is used to identify the separation points within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty string in the
- * corresponding row in the first column.
- *
- * A null row will produce a corresponding null rows in the output table.
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
@@ -60,7 +57,7 @@ namespace strings {
  * s2 = split_re(s, "[ _]", 1)
  * s2 is a table of strings columns:
  *     [ ["a", "a", "", "ab"],
- *       ["bc def_g", "_bc", "ab_cd", "cd "] ]
+ *       ["bc def_g", "_bc", "ab cd", "cd "] ]
  * @endcode
  *
  * @throw cudf:logic_error if `pattern` is empty.
@@ -70,9 +67,7 @@ namespace strings {
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return Lists column of strings
- *         Each vector of the lists column holds splits from a single row
- *         element of the input column.
+ * @return A table of columns of strings.
  */
 std::unique_ptr<table> split_re(
   strings_column_view const& strings,
@@ -81,9 +76,59 @@ std::unique_ptr<table> split_re(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits individual strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
+ * Each element generates an array of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
+ *
+ * The number of elements in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the separation points within the string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_re(s, "[_ ]")
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * s2 = rsplit_re(s, "[ _]", 1)
+ * s2 is a table of strings columns:
+ *     [ ["a_bc def", "a_", "_ab", "ab_cd"],
+ *       ["g", "bc", "cd", ""] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of columns of strings.
+ */
+std::unique_ptr<table> rsplit_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using the given regex pattern to delimit each string.
+ *
  * Each element generates an array of strings that are stored in an output
  * lists column.
  *
@@ -96,7 +141,7 @@ std::unique_ptr<table> split_re(
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty list item output row.
- * A null row will produce a corresponding null list item output row.
+ * A null row will produce a corresponding null output row.
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
@@ -121,9 +166,7 @@ std::unique_ptr<table> split_re(
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
  * @param mr Device memory resource used to allocate the returned result's device memory.
- * @return Lists column of strings
- *         Each vector of the lists column holds splits from a single row
- *         element of the input column.
+ * @return Lists column of strings.
  */
 std::unique_ptr<column> split_record_re(
   strings_column_view const& strings,
@@ -131,6 +174,57 @@ std::unique_ptr<column> split_record_re(
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits individual strings elements into a list of strings
+ * using the given regex pattern to delimit each string.
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the separation points within a string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null output row.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * s1 = rsplit_record_re(s, "[_ ]")
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * s2 = rsplit_record_re(s, "[ _]", 1)
+ * s2 is a lists column of strings:
+ *     [ ["a_bc def", "g"],
+ *       ["a_", "bc"],
+ *       ["_ab", "cd"],
+ *       ["ab_cd", ""] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if `pattern` is empty.
+ *
+ * @param strings A column of string elements to be split.
+ * @param pattern The regex pattern for delimiting characters within each string.
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return Lists column of strings.
+ */
+std::unique_ptr<column> rsplit_record_re(
+  strings_column_view const& strings,
+  std::string const& pattern,
+  size_type maxsplit                  = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 3f916d0138f..9427a900d8d 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -32,36 +32,18 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/for_each.h>
-#include <thrust/scan.h>
-#include <thrust/transform.h>
+#include <thrust/transform_scan.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
-
-using string_index_pair = thrust::pair<const char*, size_type>;
-
 namespace {
 
-/**
- * @brief Convert match counts to token counts.
- *
- * The matches are the delimiters and the tokens are what is left:
- * `token1, delimiter, token2, delimiter, token3, etc`
- * Usually `token_count = match_count + 1` even with empty strings.
- * However, we need to account for the max_tokens and null rows.
- */
-struct match_to_token_count_fn {
-  column_device_view const d_strings;
-  size_type const* d_counts;
-  size_type const max_tokens;
+using string_index_pair = thrust::pair<const char*, size_type>;
 
-  __device__ size_type operator()(size_type idx)
-  {
-    if (d_strings.is_null(idx)) { return 0; }
-    auto const match_count = d_counts[idx];
-    return std::min(match_count, max_tokens) + 1;
-  }
+enum class split_direction {
+  FORWARD,  ///< for split logic
+  BACKWARD  ///< for rsplit logic
 };
 
 /**
@@ -71,56 +53,58 @@ template <int stack_size>
 struct token_reader_fn {
   column_device_view const d_strings;
   reprog_device prog;
+  split_direction const direction;
   offset_type const* d_token_offsets;
   string_index_pair* d_tokens;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) { return; }
+    auto const d_str = d_strings.element<string_view>(idx);
 
     auto const token_offset = d_token_offsets[idx];
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
-    auto d_result           = d_tokens + token_offset;
-    auto const d_str        = d_strings.element<string_view>(idx);
+    auto d_result           = d_tokens + token_offset;  // store tokens here
 
     size_type token_idx = 0;
-    size_type begin     = 0;
+    size_type begin     = 0;  // characters
     size_type end       = d_str.length();
-    size_type last_pos  = 0;
-    while (token_idx < token_count - 1) {
-      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
-
-      auto const start_pos = d_str.byte_offset(begin);
-      auto const end_pos   = d_str.byte_offset(end);
-      d_result[token_idx]  = string_index_pair{d_str.data() + last_pos, start_pos - last_pos};
-
-      begin = end + (begin == end);
-      end   = d_str.length();
-      token_idx++;
-      last_pos = end_pos;
+    size_type last_pos  = 0;  // bytes
+    while (prog.find<stack_size>(idx, d_str, begin, end) > 0) {
+      // get the token (characters just before this match)
+      auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
+      // store it if we have space
+      if (token_idx < token_count - 1) {
+        d_result[token_idx++] = token;
+      } else {
+        if (direction == split_direction::FORWARD) { break; }  // we are done
+        for (auto l = 0; l < token_idx - 1; ++l) {
+          d_result[l] = d_result[l + 1];  // shift left
+        }
+        d_result[token_idx - 1] = token;
+      }
+      // setup for next match
+      last_pos = d_str.byte_offset(end);
+      begin    = end + (begin == end);
+      end      = d_str.length();
     }
 
-    // set last token to remainder of the string
+    // set the last token to the remainder of the string
     if (last_pos <= d_str.size_bytes()) {
       d_result[token_idx] =
         string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
     }
-  }
-};
-
-struct tokens_transform_fn {
-  column_device_view const d_strings;
-  string_index_pair const* d_tokens;
-  offset_type const* d_token_offsets;
-  size_type const column_index;
 
-  __device__ string_index_pair operator()(size_type idx) const
-  {
-    auto const offset      = d_token_offsets[idx];
-    auto const token_count = d_token_offsets[idx + 1] - offset;
-    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
-    if (column_index > token_count - 1) { return string_index_pair{nullptr, 0}; }
-    return d_tokens[offset + column_index];
+    if (direction == split_direction::BACKWARD) {
+      // update first entry -- this happens when max-tokens is hit before the end
+      auto const first_offset =
+        d_result[0].first
+          ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
+          : 0;
+      if (first_offset) {
+        d_result[0] = string_index_pair{d_str.data(), first_offset + d_result[0].second};
+      }
+    }
   }
 };
 
@@ -138,6 +122,7 @@ struct tokens_transform_fn {
  */
 rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
                                                      reprog_device& d_prog,
+                                                     split_direction direction,
                                                      size_type max_tokens,
                                                      mutable_column_view& offsets,
                                                      rmm::cuda_stream_view stream)
@@ -148,15 +133,12 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   auto const begin = thrust::make_counting_iterator<size_type>(0);
   auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
 
-  // convert match counts to tokens
-  match_to_token_count_fn match_fn{d_strings, d_offsets, max_tokens};
-  thrust::transform(rmm::exec_policy(stream), begin, end, d_offsets, match_fn);
-
-  // convert counts into offsets
-  thrust::exclusive_scan(rmm::exec_policy(stream),
-                         offsets.begin<offset_type>(),
-                         offsets.end<offset_type>(),
-                         offsets.begin<offset_type>());
+  // convert match counts to token offsets
+  auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) {
+    return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1;
+  };
+  thrust::transform_exclusive_scan(
+    rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus<offset_type>{});
 
   // the last entry is the total number of tokens to be generated
   auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
@@ -165,60 +147,48 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   auto const regex_insts = d_prog.insts_counts();
   if (regex_insts <= RX_SMALL_INSTS) {
-    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_SMALL> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_MEDIUM_INSTS) {
-    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_MEDIUM> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else if (regex_insts <= RX_LARGE_INSTS) {
-    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_LARGE> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   } else {
-    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, d_offsets, tokens.data()};
+    token_reader_fn<RX_STACK_ANY> reader{d_strings, d_prog, direction, d_offsets, tokens.data()};
     thrust::for_each_n(rmm::exec_policy(stream), begin, strings_count, reader);
   }
 
   return tokens;
 }
 
-}  // namespace
-
-// The output is one list item per string
-std::unique_ptr<column> split_record_re(strings_column_view const& input,
-                                        std::string const& pattern,
-                                        size_type maxsplit,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
-
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
-  auto const strings_count = input.size();
-
-  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
-  auto d_strings = column_device_view::create(input.parent(), stream);
-
-  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
-  auto offsets_view = offsets->mutable_view();
-
-  // get split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
-
-  // convert the tokens into one big strings column
-  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+/**
+ * @brief Returns string pair for the specified column for each string in `d_strings`
+ *
+ * This is used to build the table result of a split.
+ * Null is returned if the row is null or if the `column_index` is larger
+ * than the token count for that string.
+ */
+struct tokens_transform_fn {
+  column_device_view const d_strings;
+  string_index_pair const* d_tokens;
+  offset_type const* d_token_offsets;
+  size_type const column_index;
 
-  // create a lists column using the offsets and the strings columns
-  return make_lists_column(strings_count,
-                           std::move(offsets),
-                           std::move(strings_output),
-                           input.null_count(),
-                           copy_bitmask(input.parent(), stream, mr),
-                           stream,
-                           mr);
-}
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto const offset      = d_token_offsets[idx];
+    auto const token_count = d_token_offsets[idx + 1] - offset;
+    return (column_index > token_count - 1) || d_strings.is_null(idx)
+             ? string_index_pair{nullptr, 0}
+             : d_tokens[offset + column_index];
+  }
+};
 
 std::unique_ptr<table> split_re(strings_column_view const& input,
                                 std::string const& pattern,
+                                split_direction direction,
                                 size_type maxsplit,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
@@ -239,19 +209,21 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
 
   auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
   auto offsets_view = offsets->mutable_view();
+  auto d_offsets    = offsets_view.data<offset_type>();
 
   // get split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, max_tokens, offsets_view, stream);
+  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // the columns_count is the maximum number of tokens for any string in the input column
-  auto const begin = thrust::make_counting_iterator<size_type>(0);
-  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
-  auto d_offsets   = offsets_view.data<offset_type>();
-  auto size_lambda = [d_offsets] __device__(auto const idx) -> size_type {
-    return d_offsets[idx + 1] - d_offsets[idx];
-  };
   auto const columns_count = thrust::transform_reduce(
-    rmm::exec_policy(stream), begin, end, size_lambda, 0, thrust::maximum<size_type>{});
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    thrust::make_counting_iterator<size_type>(strings_count),
+    [d_offsets] __device__(auto const idx) -> size_type {
+      return d_offsets[idx + 1] - d_offsets[idx];
+    },
+    0,
+    thrust::maximum<size_type>{});
 
   // boundary case: if no columns, return one all-null column (custrings issue #119)
   if (columns_count == 0) {
@@ -271,7 +243,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
       0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index});
     return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr);
   };
-  // create each column of tokens
+  // build a vector of columns
   results.resize(columns_count);
   std::transform(thrust::make_counting_iterator<size_type>(0),
                  thrust::make_counting_iterator<size_type>(columns_count),
@@ -281,6 +253,78 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   return std::make_unique<table>(std::move(results));
 }
 
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        split_direction direction,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
+
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const strings_count = input.size();
+
+  auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
+  auto offsets_view = offsets->mutable_view();
+
+  // get split tokens from the input column
+  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+
+  // convert the tokens into one big strings column
+  auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
+
+  // create a lists column using the offsets and the strings columns
+  return make_lists_column(strings_count,
+                           std::move(offsets),
+                           std::move(strings_output),
+                           input.null_count(),
+                           copy_bitmask(input.parent(), stream, mr),
+                           stream,
+                           mr);
+}
+
+}  // namespace
+
+std::unique_ptr<table> split_re(strings_column_view const& input,
+                                std::string const& pattern,
+                                size_type maxsplit,
+                                rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr)
+{
+  return split_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<column> split_record_re(strings_column_view const& input,
+                                        std::string const& pattern,
+                                        size_type maxsplit,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return split_record_re(input, pattern, split_direction::FORWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<table> rsplit_re(strings_column_view const& input,
+                                 std::string const& pattern,
+                                 size_type maxsplit,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  return split_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr);
+}
+
+std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
+                                         std::string const& pattern,
+                                         size_type maxsplit,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  return split_record_re(input, pattern, split_direction::BACKWARD, maxsplit, stream, mr);
+}
+
 }  // namespace detail
 
 // external APIs
@@ -303,5 +347,22 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   return detail::split_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
 }
 
+std::unique_ptr<table> rsplit_re(strings_column_view const& input,
+                                 std::string const& pattern,
+                                 size_type maxsplit,
+                                 rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::rsplit_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
+
+std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
+                                         std::string const& pattern,
+                                         size_type maxsplit,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::rsplit_record_re(input, pattern, maxsplit, rmm::cuda_stream_default, mr);
+}
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index f541a6b0e81..d0b695bbc93 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -25,8 +25,8 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
-#include <tests/strings/utilities.h>
 
 #include <vector>
 
@@ -241,21 +241,6 @@ TEST_F(StringsSplitTest, RSplitWhitespaceWithMax)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, *expected);
 }
 
-TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
-{
-  cudf::column_view zero_size_strings_column(
-    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
-  auto results = cudf::strings::split(zero_size_strings_column);
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-  results = cudf::strings::rsplit(zero_size_strings_column);
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(results->num_columns() == 1);
-  EXPECT_TRUE(results->num_rows() == 0);
-}
-
 TEST_F(StringsSplitTest, SplitRecord)
 {
   std::vector<const char*> h_strings{" Héllo thesé", nullptr, "are some  ", "tést String", ""};
@@ -331,41 +316,30 @@ TEST_F(StringsSplitTest, SplitRegex)
   {
     auto result = cudf::strings::split_re(sv, "\\s+");
 
-    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity);
     cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0});
     cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0});
     auto expected = cudf::table_view({col0, col1, col2});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
 
   {
     auto result = cudf::strings::split_re(sv, "[eé]");
 
-    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity);
     cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""},
                                             {1, 0, 1, 1, 0});
     cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""}, {1, 0, 1, 0, 0});
     cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0});
     auto expected = cudf::table_view({col0, col1, col2, col3});
-    CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
-  }
-}
-
-TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
-{
-  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
-  auto validity =
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
-  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
-  auto sv = cudf::strings_column_view(input);
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
-  auto result = cudf::strings::split_re(sv, "\\s+", 1);
-
-  cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
-  cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
-                                          {1, 0, 1, 1, 0});
-  auto expected = cudf::table_view({col0, col1});
-  CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+    result = cudf::strings::rsplit_re(sv, "[eé]");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
 }
 
 TEST_F(StringsSplitTest, SplitRecordRegex)
@@ -376,30 +350,60 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
   auto sv = cudf::strings_column_view(input);
 
-  auto result = cudf::strings::split_record_re(sv, "[eé]");
-
   using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected(
-    {LCW{" H", "llo th", "s", ""}, LCW{}, LCW{"ar", " som", "  "}, LCW{"t", "st String"}, LCW{""}},
-    validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  {
+    auto result = cudf::strings::split_record_re(sv, "\\s+");
+
+    LCW expected(
+      {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", ""}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_record_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
+
+  {
+    auto result = cudf::strings::split_record_re(sv, "[eé]");
+
+    LCW expected({LCW{" H", "llo th", "s", ""},
+                  LCW{},
+                  LCW{"ar", " som", "  "},
+                  LCW{"t", "st String"},
+                  LCW{""}},
+                 validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    result = cudf::strings::rsplit_record_re(sv, "[eé]");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
 }
 
-TEST_F(StringsSplitTest, SplitRecordRegexWithMaxSplit)
+TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
 {
   std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are\nsome  ", "tést\rString", ""};
   auto validity =
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
   auto sv = cudf::strings_column_view(input);
+  {
+    auto result = cudf::strings::split_re(sv, "\\s+", 1);
 
-  auto result = cudf::strings::split_record_re(sv, "\\s", 1);
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
+                                            {1, 0, 1, 1, 0});
+    auto expected = cudf::table_view({col0, col1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::split_record_re(sv, "\\s", 1);
 
-  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected(
-    {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
-    validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected(
+      {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
 }
 
 TEST_F(StringsSplitTest, RSplitRecord)
@@ -493,16 +497,58 @@ TEST_F(StringsSplitTest, RSplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
-TEST_F(StringsSplitTest, SplitRecordZeroSizeStringsColumns)
+TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit)
+{
+  std::vector<const char*> h_strings{" Héllo\tthesé", nullptr, "are some\n ", "tést\rString", ""};
+  auto validity =
+    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; });
+  cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end(), validity);
+  auto sv = cudf::strings_column_view(input);
+
+  {
+    auto result = cudf::strings::rsplit_re(sv, "\\s+", 1);
+
+    cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity);
+    cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0});
+    auto expected = cudf::table_view({col0, col1});
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+  }
+  {
+    auto result = cudf::strings::rsplit_record_re(sv, "\\s+", 1);
+
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected(
+      {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  }
+}
+
+TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
 {
   cudf::column_view zero_size_strings_column(
     cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0);
+  auto results = cudf::strings::split(zero_size_strings_column);
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::rsplit(zero_size_strings_column);
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::split_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+  results = cudf::strings::rsplit_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(results->num_columns() == 1);
+  EXPECT_TRUE(results->num_rows() == 0);
+
   auto result = cudf::strings::split_record(zero_size_strings_column);
   EXPECT_TRUE(result->size() == 0);
   result = cudf::strings::rsplit_record(zero_size_strings_column);
   EXPECT_TRUE(result->size() == 0);
   result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
   EXPECT_TRUE(result->size() == 0);
+  result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(result->size() == 0);
 }
 
 // This test specifically for https://github.com/rapidsai/custrings/issues/119
@@ -526,6 +572,20 @@ TEST_F(StringsSplitTest, AllNullsCase)
   results = cudf::strings::split_re(sv, "-");
   EXPECT_TRUE(results->num_columns() == 1);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+  results = cudf::strings::rsplit_re(sv, "-");
+  EXPECT_TRUE(results->num_columns() == 1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
+
+  auto result = cudf::strings::split_record(sv);
+  using LCW   = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls());
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::rsplit_record(sv);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::split_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  result = cudf::strings::rsplit_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 }
 
 TEST_F(StringsSplitTest, Partition)

From 7bc451b142c84c2505416ae5d8f2d9d979a1989f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 17:26:52 -0500
Subject: [PATCH 08/20] remove unneeded if-check

---
 cpp/src/strings/split/split_re.cu | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 9427a900d8d..9dcf7e6f17b 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -64,7 +64,7 @@ struct token_reader_fn {
 
     auto const token_offset = d_token_offsets[idx];
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
-    auto d_result           = d_tokens + token_offset;  // store tokens here
+    auto const d_result     = d_tokens + token_offset;  // store tokens here
 
     size_type token_idx = 0;
     size_type begin     = 0;  // characters
@@ -72,7 +72,8 @@ struct token_reader_fn {
     size_type last_pos  = 0;  // bytes
     while (prog.find<stack_size>(idx, d_str, begin, end) > 0) {
       // get the token (characters just before this match)
-      auto token = string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
+      auto const token =
+        string_index_pair{d_str.data() + last_pos, d_str.byte_offset(begin) - last_pos};
       // store it if we have space
       if (token_idx < token_count - 1) {
         d_result[token_idx++] = token;
@@ -90,13 +91,10 @@ struct token_reader_fn {
     }
 
     // set the last token to the remainder of the string
-    if (last_pos <= d_str.size_bytes()) {
-      d_result[token_idx] =
-        string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
-    }
+    d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
 
     if (direction == split_direction::BACKWARD) {
-      // update first entry -- this happens when max-tokens is hit before the end
+      // update first entry -- this happens when max-tokens is hit before the end of the string
       auto const first_offset =
         d_result[0].first
           ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
@@ -127,11 +125,11 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
                                                      mutable_column_view& offsets,
                                                      rmm::cuda_stream_view stream)
 {
-  auto d_offsets           = offsets.data<offset_type>();
   auto const strings_count = d_strings.size();
 
-  auto const begin = thrust::make_counting_iterator<size_type>(0);
-  auto const end   = thrust::make_counting_iterator<size_type>(strings_count);
+  auto const begin     = thrust::make_counting_iterator<size_type>(0);
+  auto const end       = thrust::make_counting_iterator<size_type>(strings_count);
+  auto const d_offsets = offsets.data<offset_type>();
 
   // convert match counts to token offsets
   auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) {
@@ -140,7 +138,7 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
   thrust::transform_exclusive_scan(
     rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus<offset_type>{});
 
-  // the last entry is the total number of tokens to be generated
+  // the last offset entry is the total number of tokens to be generated
   auto const total_tokens = cudf::detail::get_value<offset_type>(offsets, strings_count, stream);
 
   // generate tokens for each string
@@ -204,14 +202,16 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
     return std::make_unique<table>(std::move(results));
   }
 
+  // create the regex device prog from the given pattern
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
+  // count the number of delimiters matched in each string
   auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
   auto offsets_view = offsets->mutable_view();
   auto d_offsets    = offsets_view.data<offset_type>();
 
-  // get split tokens from the input column
+  // get the split tokens from the input column
   auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // the columns_count is the maximum number of tokens for any string in the input column
@@ -265,13 +265,15 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
+  // create the regex device prog from the given pattern
   auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream);
   auto d_strings = column_device_view::create(input.parent(), stream);
 
+  // count the number of delimiters matched in each string
   auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
   auto offsets_view = offsets->mutable_view();
 
-  // get split tokens from the input column
+  // get the split tokens from the input column
   auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // convert the tokens into one big strings column

From 93887b1877733bfc97c29606f8c9a221d8304efb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 31 Jan 2022 17:27:29 -0500
Subject: [PATCH 09/20] add all empty and all null test cases

---
 cpp/tests/strings/split_tests.cpp | 36 ++++++++++++++++---------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index d0b695bbc93..4650cbc3c44 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -541,14 +541,14 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
   EXPECT_TRUE(results->num_columns() == 1);
   EXPECT_TRUE(results->num_rows() == 0);
 
-  auto result = cudf::strings::split_record(zero_size_strings_column);
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::rsplit_record(zero_size_strings_column);
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(result->size() == 0);
-  result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
-  EXPECT_TRUE(result->size() == 0);
+  auto list_result = cudf::strings::split_record(zero_size_strings_column);
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::rsplit_record(zero_size_strings_column);
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::split_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(list_result->size() == 0);
+  list_result = cudf::strings::rsplit_record_re(zero_size_strings_column, "\\s");
+  EXPECT_TRUE(list_result->size() == 0);
 }
 
 // This test specifically for https://github.com/rapidsai/custrings/issues/119
@@ -576,16 +576,16 @@ TEST_F(StringsSplitTest, AllNullsCase)
   EXPECT_TRUE(results->num_columns() == 1);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->get_column(0).view(), input);
 
-  auto result = cudf::strings::split_record(sv);
-  using LCW   = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto list_result = cudf::strings::split_record(sv);
+  using LCW        = cudf::test::lists_column_wrapper<cudf::string_view>;
   LCW expected({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls());
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::rsplit_record(sv);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::split_record_re(sv, "-");
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
-  result = cudf::strings::rsplit_record_re(sv, "-");
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::rsplit_record(sv);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::split_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
+  list_result = cudf::strings::rsplit_record_re(sv, "-");
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(list_result->view(), expected);
 }
 
 TEST_F(StringsSplitTest, Partition)
@@ -773,6 +773,8 @@ TEST_F(StringsSplitTest, InvalidParameter)
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::split_record_re(strings_view, ""), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_re(strings_view, ""), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::rsplit_record_re(strings_view, ""), cudf::logic_error);
   EXPECT_THROW(cudf::strings::partition(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rpartition(strings_view, cudf::string_scalar("", false)),

From c88eeae8727b9c94f05d15c0e9e3e9714107bf39 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Feb 2022 12:34:45 -0500
Subject: [PATCH 10/20] add more maxsplit gtests

---
 cpp/include/cudf/strings/split/split_re.hpp | 50 +++++++++++----------
 cpp/src/strings/split/split_re.cu           | 28 +++++++-----
 cpp/tests/strings/split_tests.cpp           | 34 +++++++++++---
 3 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index cf6d23ccd28..c6dc1e5c697 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -28,18 +28,18 @@ namespace strings {
  */
 
 /**
- * @brief Splits individual strings elements into a table of strings columns
+ * @brief Splits strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in corresponding
- * rows in the output table -- `table[col,row] = token[col] of string[row]`
- * where `token` is the substring between each delimiter.
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of strings[row]`
+ * where `token` is a substring between delimiters.
  *
- * The number of elements in the output table will be the same as the number of
+ * The number of rows in the output table will be the same as the number of
  * elements in the input column. The resulting number of columns will be the
  * maximum number of tokens found in any input row.
  *
- * The `pattern` is used to identify the separation points within a string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty string in the
@@ -62,7 +62,7 @@ namespace strings {
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -70,25 +70,25 @@ namespace strings {
  * @return A table of columns of strings.
  */
 std::unique_ptr<table> split_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a table of strings columns
+ * @brief Splits strings elements into a table of strings columns
  * using a regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in corresponding
+ * Each element generates a vector of strings that are stored in corresponding
  * rows in the output table -- `table[col,row] = token[col] of string[row]`
  * where `token` is the substring between each delimiter.
  *
- * The number of elements in the output table will be the same as the number of
+ * The number of rows in the output table will be the same as the number of
  * elements in the input column. The resulting number of columns will be the
  * maximum number of tokens found in any input row.
  *
  * Splitting occurs by traversing starting from the end of the input string.
- * The `pattern` is used to identify the separation points within the string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the beginning of the string
  * is reached.
  *
@@ -112,7 +112,7 @@ std::unique_ptr<table> split_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -120,24 +120,25 @@ std::unique_ptr<table> split_re(
  * @return A table of columns of strings.
  */
 std::unique_ptr<table> rsplit_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string.
  *
  * Each element generates an array of strings that are stored in an output
- * lists column.
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
  *
  * The number of elements in the output column will be the same as the number of
  * elements in the input column. Each individual list item will contain the
  * new strings for that row. The resulting number of strings in each row can vary
  * from 0 to `maxsplit + 1`.
  *
- * The `pattern` is used to identify the separation points within a string
+ * The `pattern` is used to identify the delimiters within a string
  * and splitting stops when either `maxsplit` or the end of the string is reached.
  *
  * An empty input string will produce a corresponding empty list item output row.
@@ -161,7 +162,7 @@ std::unique_ptr<table> rsplit_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -169,17 +170,18 @@ std::unique_ptr<table> rsplit_re(
  * @return Lists column of strings.
  */
 std::unique_ptr<column> split_record_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Splits individual strings elements into a list of strings
+ * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string.
  *
- * Each element generates an array of strings that are stored in an output
- * lists column.
+ * Each element generates a vector of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
  *
  * The number of elements in the output column will be the same as the number of
  * elements in the input column. Each individual list item will contain the
@@ -212,7 +214,7 @@ std::unique_ptr<column> split_record_re(
  *
  * @throw cudf:logic_error if `pattern` is empty.
  *
- * @param strings A column of string elements to be split.
+ * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
  * @param maxsplit Maximum number of splits to perform.
  *        Default of -1 indicates all possible splits on each string.
@@ -220,7 +222,7 @@ std::unique_ptr<column> split_record_re(
  * @return Lists column of strings.
  */
 std::unique_ptr<column> rsplit_record_re(
-  strings_column_view const& strings,
+  strings_column_view const& input,
   std::string const& pattern,
   size_type maxsplit                  = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 9dcf7e6f17b..dd71533c773 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -48,6 +48,10 @@ enum class split_direction {
 
 /**
  * @brief Identify the tokens from the `idx'th` string element of `d_strings`.
+ *
+ * Each string's tokens are stored in the `d_tokens` vector.
+ * The `d_token_offsets` specifies the output position within `d_tokens`
+ * for each string.
  */
 template <int stack_size>
 struct token_reader_fn {
@@ -118,12 +122,12 @@ struct token_reader_fn {
  *                The offsets for each token in each string on output.
  * @param stream CUDA stream used for kernel launches.
  */
-rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d_strings,
-                                                     reprog_device& d_prog,
-                                                     split_direction direction,
-                                                     size_type max_tokens,
-                                                     mutable_column_view& offsets,
-                                                     rmm::cuda_stream_view stream)
+rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const& d_strings,
+                                                       reprog_device& d_prog,
+                                                       split_direction direction,
+                                                       size_type max_tokens,
+                                                       mutable_column_view& offsets,
+                                                       rmm::cuda_stream_view stream)
 {
   auto const strings_count = d_strings.size();
 
@@ -165,7 +169,7 @@ rmm::device_uvector<string_index_pair> split_utility(column_device_view const& d
  * @brief Returns string pair for the specified column for each string in `d_strings`
  *
  * This is used to build the table result of a split.
- * Null is returned if the row is null of if the `column_index` is larger
+ * Null is returned if the row is null or if the `column_index` is larger
  * than the token count for that string.
  */
 struct tokens_transform_fn {
@@ -211,10 +215,10 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto offsets_view = offsets->mutable_view();
   auto d_offsets    = offsets_view.data<offset_type>();
 
-  // get the split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  // get the split tokens from the input column; this also converts the counts into offsets
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
-  // the columns_count is the maximum number of tokens for any string in the input column
+  // the output column count is the maximum number of tokens generated for any input string
   auto const columns_count = thrust::transform_reduce(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
@@ -273,8 +277,8 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto offsets      = count_matches(*d_strings, *d_prog, stream, mr);
   auto offsets_view = offsets->mutable_view();
 
-  // get the split tokens from the input column
-  auto tokens = split_utility(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  // get the split tokens from the input column; this also converts the counts into offsets
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
 
   // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 4650cbc3c44..f0d7315929b 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -322,6 +322,7 @@ TEST_F(StringsSplitTest, SplitRegex)
     auto expected = cudf::table_view({col0, col1, col2});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_re(sv, "\\s+");
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
@@ -337,6 +338,7 @@ TEST_F(StringsSplitTest, SplitRegex)
     auto expected = cudf::table_view({col0, col1, col2, col3});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_re(sv, "[eé]");
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
   }
@@ -359,6 +361,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
       validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_record_re(sv, "\\s+");
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
   }
@@ -374,6 +377,7 @@ TEST_F(StringsSplitTest, SplitRecordRegex)
                  validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 
+    // rsplit == split when using default parameters
     result = cudf::strings::rsplit_record_re(sv, "[eé]");
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
   }
@@ -394,15 +398,31 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
                                             {1, 0, 1, 1, 0});
     auto expected = cudf::table_view({col0, col1});
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
+
+    // splitting everything gives the same output as maxsplit==2 for this test input column
+    result         = cudf::strings::split_re(sv, "\\s+", 2);
+    auto expected2 = cudf::strings::split_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected2->view());
   }
   {
     auto result = cudf::strings::split_record_re(sv, "\\s", 1);
 
     using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-    LCW expected(
+    LCW expected1(
       {LCW{"", "Héllo\tthesé"}, LCW{}, LCW{"are", "some  "}, LCW{"tést", "String"}, LCW{""}},
       validity);
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected1);
+
+    result = cudf::strings::split_record_re(sv, "\\s", 2);
+    LCW expected2(
+      {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", " "}, LCW{"tést", "String"}, LCW{""}},
+      validity);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected2);
+
+    // splitting everything gives the same output as maxsplit==3 for this test input column
+    result         = cudf::strings::split_record_re(sv, "\\s", 3);
+    auto expected0 = cudf::strings::split_record_re(sv, "\\s");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view());
   }
 }
 
@@ -521,6 +541,11 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit)
       {LCW{" Héllo", "thesé"}, LCW{}, LCW{"are some", ""}, LCW{"tést", "String"}, LCW{""}},
       validity);
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+
+    // splitting everything gives the same output as any maxsplit > 2 for this test input column
+    result         = cudf::strings::rsplit_record_re(sv, "\\s+", 3);
+    auto expected0 = cudf::strings::rsplit_record_re(sv, "\\s+");
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected0->view());
   }
 }
 
@@ -760,9 +785,8 @@ TEST_F(StringsSplitTest, PartitionZeroSizeStringsColumns)
 
 TEST_F(StringsSplitTest, InvalidParameter)
 {
-  std::vector<const char*> h_strings{"string left intentionally blank"};
-  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
-  auto strings_view = cudf::strings_column_view(strings);
+  cudf::test::strings_column_wrapper input({"string left intentionally blank"});
+  auto strings_view = cudf::strings_column_view(input);
   EXPECT_THROW(cudf::strings::split(strings_view, cudf::string_scalar("", false)),
                cudf::logic_error);
   EXPECT_THROW(cudf::strings::rsplit(strings_view, cudf::string_scalar("", false)),

From f17065c7387fcb535d87884ec5d59daa3f5b0d27 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Feb 2022 15:11:27 -0500
Subject: [PATCH 11/20] Add regex parameter to cudf strings split()

---
 .../cudf/_lib/cpp/strings/split/split.pxd     |  26 ++++-
 python/cudf/cudf/_lib/strings/__init__.py     |   4 +
 python/cudf/cudf/_lib/strings/split/split.pyx | 102 +++++++++++++++++-
 python/cudf/cudf/core/column/string.py        |  44 ++++++--
 python/cudf/cudf/tests/test_string.py         |  23 ++++
 5 files changed, 187 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd b/python/cudf/cudf/_lib/cpp/strings/split/split.pxd
index 4a90aa233f0..9d3aa706ff1 100644
--- a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/split/split.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -32,3 +32,27 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \
         column_view source_strings,
         string_scalar delimiter,
         size_type maxsplit) except +
+
+
+cdef extern from "cudf/strings/split/split_re.hpp" namespace \
+        "cudf::strings" nogil:
+
+    cdef unique_ptr[table] split_re(
+        const column_view& source_strings,
+        const string& pattern,
+        size_type maxsplit) except +
+
+    cdef unique_ptr[table] rsplit_re(
+        const column_view& source_strings,
+        const string& pattern,
+        size_type maxsplit) except +
+
+    cdef unique_ptr[column] split_record_re(
+        const column_view& source_strings,
+        const string& pattern,
+        size_type maxsplit) except +
+
+    cdef unique_ptr[column] rsplit_record_re(
+        const column_view& source_strings,
+        const string& pattern,
+        size_type maxsplit) except +
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 7911d0eff2a..178d96de0e5 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -78,9 +78,13 @@
 from cudf._lib.strings.split.partition import partition, rpartition
 from cudf._lib.strings.split.split import (
     rsplit,
+    rsplit_re,
     rsplit_record,
+    rsplit_record_re,
     split,
+    split_re,
     split_record,
+    split_record_re,
 )
 from cudf._lib.strings.strip import lstrip, rstrip, strip
 from cudf._lib.strings.substring import get, slice_from, slice_strings
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
index 184b256125c..ce066aa6aec 100644
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ b/python/cudf/cudf/_lib/strings/split/split.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -10,9 +10,13 @@ from cudf._lib.cpp.column.column_view cimport column_view
 from cudf._lib.cpp.scalar.scalar cimport string_scalar
 from cudf._lib.cpp.strings.split.split cimport (
     rsplit as cpp_rsplit,
+    rsplit_re as cpp_rsplit_re,
     rsplit_record as cpp_rsplit_record,
+    rsplit_record_re as cpp_rsplit_record_re,
     split as cpp_split,
+    split_re as cpp_split_re,
     split_record as cpp_split_record,
+    split_record_re as cpp_split_record_re,
 )
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
@@ -137,3 +141,99 @@ def rsplit_record(Column source_strings,
     return Column.from_unique_ptr(
         move(c_result),
     )
+
+
+def split_re(Column source_strings,
+             object pattern,
+             size_type maxsplit):
+    """
+    Returns data by splitting the `source_strings`
+    column around the delimiters identified by `pattern`.
+    """
+    cdef unique_ptr[table] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef string pattern_string = <string>str(pattern).encode()
+
+    with nogil:
+        c_result = move(cpp_split_re(
+            source_view,
+            pattern_string,
+            maxsplit
+        ))
+
+    return data_from_unique_ptr(
+        move(c_result),
+        column_names=range(0, c_result.get()[0].num_columns())
+    )
+
+
+def rsplit_re(Column source_strings,
+              object pattern,
+              size_type maxsplit):
+    """
+    Returns data by splitting the `source_strings`
+    column around the delimiters identified by `pattern`.
+    The delimiters are searched starting from the end of each string.
+    """
+    cdef unique_ptr[table] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef string pattern_string = <string>str(pattern).encode()
+
+    with nogil:
+        c_result = move(cpp_rsplit_re(
+            source_view,
+            pattern_string,
+            maxsplit
+        ))
+
+    return data_from_unique_ptr(
+        move(c_result),
+        column_names=range(0, c_result.get()[0].num_columns())
+    )
+
+
+def split_record_re(Column source_strings,
+                    object pattern,
+                    size_type maxsplit):
+    """
+    Returns a Column by splitting the `source_strings`
+    column around the delimiters identified by `pattern`.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef string pattern_string = <string>str(pattern).encode()
+
+    with nogil:
+        c_result = move(cpp_split_record_re(
+            source_view,
+            pattern_string,
+            maxsplit
+        ))
+
+    return Column.from_unique_ptr(
+        move(c_result),
+    )
+
+
+def rsplit_record_re(Column source_strings,
+                     object pattern,
+                     size_type maxsplit):
+    """
+    Returns a Column by splitting the `source_strings`
+    column around the delimiters identified by `pattern`.
+    The delimiters are searched starting from the end of each string.
+    """
+    cdef unique_ptr[column] c_result
+    cdef column_view source_view = source_strings.view()
+    cdef string pattern_string = <string>str(pattern).encode()
+
+    with nogil:
+        c_result = move(cpp_rsplit_record_re(
+            source_view,
+            pattern_string,
+            maxsplit
+        ))
+
+    return Column.from_unique_ptr(
+        move(c_result),
+    )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 6467fd39ddd..a60a2b97c52 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2284,7 +2284,11 @@ def get_json_object(self, json_path):
             return res
 
     def split(
-        self, pat: str = None, n: int = -1, expand: bool = None
+        self,
+        pat: str = None,
+        n: int = -1,
+        expand: bool = None,
+        regex: bool = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
@@ -2295,8 +2299,9 @@ def split(
 
         Parameters
         ----------
-        pat : str, default ' ' (space)
-            String to split on, does not yet support regular expressions.
+        pat : str, default None
+            String or regular expression to split on. If not specified, split
+            on whitespace.
         n : int, default -1 (all)
             Limit number of splits in output. `None`, 0, and -1 will all be
             interpreted as "all splits".
@@ -2307,6 +2312,13 @@ def split(
               dimensionality.
             * If ``False``, return Series/Index, containing lists
               of strings.
+        regex : bool, default None
+            Determines if the passed-in pattern is a regular expression:
+
+            * If ``True``, assumes the passed-in pattern is a regular
+              expression
+            * If ``False`` or ``None``, treats ``pat`` as a literal string.
+            * If ``pat`` is at most 1 character, it is treated as literal.
 
         Returns
         -------
@@ -2406,27 +2418,39 @@ def split(
             )
 
         # Pandas treats 0 as all
-        if n == 0:
+        if n is None or n == 0:
             n = -1
 
         if pat is None:
             pat = ""
 
+        if regex and isinstance(pat, re.Pattern):
+            pat = pat.pattern
+
+        if len(str(pat)) <= 1:
+            regex = False
+
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = cudf.core.frame.Frame({0: self._column.copy()})
             else:
-                data, index = libstrings.split(
-                    self._column, cudf.Scalar(pat, "str"), n
-                )
+                if regex is True:
+                    data, index = libstrings.split_re(self._column, pat, n)
+                else:
+                    data, index = libstrings.split(
+                        self._column, cudf.Scalar(pat, "str"), n
+                    )
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = cudf.core.frame.Frame({})
                 else:
                     result_table = cudf.core.frame.Frame(data, index)
         else:
-            result_table = libstrings.split_record(
-                self._column, cudf.Scalar(pat, "str"), n
-            )
+            if regex is True:
+                result_table = libstrings.split_record_re(self._column, pat, n)
+            else:
+                result_table = libstrings.split_record(
+                    self._column, cudf.Scalar(pat, "str"), n
+                )
 
         return self._return_or_inplace(result_table, expand=expand)
 
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 75cf2e6c892..b34d705f139 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -961,6 +961,29 @@ def test_string_split(data, pat, n, expand):
     assert_eq(expect, got)
 
 
+@pytest.mark.parametrize(
+    "data",
+    [
+        ["a b", " c ", "   d", "e   ", "f"],
+        ["a-b", "-c-", "---d", "e---", "f"],
+        ["ab", "c", "d", "e", "f"],
+        [None, None, None, None, None],
+    ],
+)
+@pytest.mark.parametrize("pat", [None, " ", "\\-+", "\\s+"])
+@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10])
+@pytest.mark.parametrize("expand", [True, False, None])
+def test_string_split_re(data, pat, n, expand):
+    ps = pd.Series(data, dtype="str")
+    gs = cudf.Series(data, dtype="str")
+
+    # Pandas does not support the regex parameter until 1.4.0
+    expect = ps.str.split(pat=pat, n=n, expand=expand)
+    got = gs.str.split(pat=pat, n=n, expand=expand, regex=True)
+
+    assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
     "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
 )

From eb8c326cd1b2ea1e9f673b6a16e07533ce637f14 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 7 Feb 2022 19:51:00 -0500
Subject: [PATCH 12/20] fix doxygen typo in @throw line

---
 cpp/include/cudf/strings/split/split_re.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index c6dc1e5c697..d61b802efe9 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -60,7 +60,7 @@ namespace strings {
  *       ["bc def_g", "_bc", "ab cd", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -110,7 +110,7 @@ std::unique_ptr<table> split_re(
  *       ["g", "bc", "cd", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -160,7 +160,7 @@ std::unique_ptr<table> rsplit_re(
  *       ["ab", "cd "] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.
@@ -212,7 +212,7 @@ std::unique_ptr<column> split_record_re(
  *       ["ab_cd", ""] ]
  * @endcode
  *
- * @throw cudf:logic_error if `pattern` is empty.
+ * @throw cudf::logic_error if `pattern` is empty.
  *
  * @param input A column of string elements to be split.
  * @param pattern The regex pattern for delimiting characters within each string.

From d6ee8837ff3f523816d96f444e1b001d14debdf7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 7 Feb 2022 19:51:37 -0500
Subject: [PATCH 13/20] refactor max-tokens calculation into helper function

---
 cpp/src/strings/split/split_re.cu | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index dd71533c773..d80148f2fe6 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -98,7 +98,7 @@ struct token_reader_fn {
     d_result[token_idx] = string_index_pair{d_str.data() + last_pos, d_str.size_bytes() - last_pos};
 
     if (direction == split_direction::BACKWARD) {
-      // update first entry -- this happens when max-tokens is hit before the end of the string
+      // update first entry -- this happens when max_tokens is hit before the end of the string
       auto const first_offset =
         d_result[0].first
           ? static_cast<size_type>(thrust::distance(d_str.data(), d_result[0].first))
@@ -117,6 +117,7 @@ struct token_reader_fn {
  *
  * @param d_strings Strings to split
  * @param d_prog Regex to evaluate against each string
+ * @param direction Whether tokens are generated forwards or backwards.
  * @param max_tokens The maximum number of tokens for each split.
  * @param offsets The number of matches on input.
  *                The offsets for each token in each string on output.
@@ -125,12 +126,14 @@ struct token_reader_fn {
 rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const& d_strings,
                                                        reprog_device& d_prog,
                                                        split_direction direction,
-                                                       size_type max_tokens,
+                                                       size_type maxsplit,
                                                        mutable_column_view& offsets,
                                                        rmm::cuda_stream_view stream)
 {
   auto const strings_count = d_strings.size();
 
+  auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+
   auto const begin     = thrust::make_counting_iterator<size_type>(0);
   auto const end       = thrust::make_counting_iterator<size_type>(strings_count);
   auto const d_offsets = offsets.data<offset_type>();
@@ -182,7 +185,7 @@ struct tokens_transform_fn {
   {
     auto const offset      = d_token_offsets[idx];
     auto const token_count = d_token_offsets[idx + 1] - offset;
-    return (column_index > token_count - 1) || d_strings.is_null(idx)
+    return (column_index >= token_count) || d_strings.is_null(idx)
              ? string_index_pair{nullptr, 0}
              : d_tokens[offset + column_index];
   }
@@ -197,7 +200,6 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   std::vector<std::unique_ptr<column>> results;
@@ -216,7 +218,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto d_offsets    = offsets_view.data<offset_type>();
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
 
   // the output column count is the maximum number of tokens generated for any input string
   auto const columns_count = thrust::transform_reduce(
@@ -266,7 +268,6 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
 {
   CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty");
 
-  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
   auto const strings_count = input.size();
 
   // create the regex device prog from the given pattern
@@ -278,7 +279,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto offsets_view = offsets->mutable_view();
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, max_tokens, offsets_view, stream);
+  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
 
   // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);

From 6094ed9f218d4c104b4ba015f235583abd50baca Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 8 Feb 2022 12:40:24 -0500
Subject: [PATCH 14/20] fix doxygen brief and examples

---
 cpp/include/cudf/strings/split/split_re.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index d61b802efe9..320d1bdc9b4 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -77,7 +77,7 @@ std::unique_ptr<table> split_re(
 
 /**
  * @brief Splits strings elements into a table of strings columns
- * using a regex pattern to delimit each string.
+ * using a regex pattern to delimit each string starting from the end of the string.
  *
  * Each element generates a vector of strings that are stored in corresponding
  * rows in the output table -- `table[col,row] = token[col] of string[row]`
@@ -146,13 +146,13 @@ std::unique_ptr<table> rsplit_re(
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
- * s1 = split_record(s, "[_ ]")
+ * s1 = split_record_re(s, "[_ ]")
  * s1 is a lists column of strings:
  *     [ ["a", "bc", "def", "g"],
  *       ["a", "", "bc"],
  *       ["", "ab", "cd"],
  *       ["ab", "cd", ""] ]
- * s2 = split_record(s, "[ _]", 1)
+ * s2 = split_record_re(s, "[ _]", 1)
  * s2 is a lists column of strings:
  *     [ ["a", "bc def_g"],
  *       ["a", "_bc"],
@@ -177,7 +177,7 @@ std::unique_ptr<column> split_record_re(
 
 /**
  * @brief Splits strings elements into a list column of strings
- * using the given regex pattern to delimit each string.
+ * using the given regex pattern to delimit each string starting from the end of the string.
  *
  * Each element generates a vector of strings that are stored in an output
  * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
@@ -198,13 +198,13 @@ std::unique_ptr<column> split_record_re(
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
- * s1 = rsplit_record(s, "[_ ]")
+ * s1 = rsplit_record_re(s, "[_ ]")
  * s1 is a lists column of strings:
  *     [ ["a", "bc", "def", "g"],
  *       ["a", "", "bc"],
  *       ["", "ab", "cd"],
  *       ["ab", "cd", ""] ]
- * s2 = rsplit_record(s, "[ _]", 1)
+ * s2 = rsplit_record_re(s, "[ _]", 1)
  * s2 is a lists column of strings:
  *     [ ["a_bc def", "g"],
  *       ["a_", "bc"],

From f647cf0f681493eaa254a3c39cf9e299a19a0589 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 8 Feb 2022 12:42:47 -0500
Subject: [PATCH 15/20] fix doxygen brief and examples

---
 cpp/include/cudf/strings/split/split_re.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index d61b802efe9..320d1bdc9b4 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -77,7 +77,7 @@ std::unique_ptr<table> split_re(
 
 /**
  * @brief Splits strings elements into a table of strings columns
- * using a regex pattern to delimit each string.
+ * using a regex pattern to delimit each string starting from the end of the string.
  *
  * Each element generates a vector of strings that are stored in corresponding
  * rows in the output table -- `table[col,row] = token[col] of string[row]`
@@ -146,13 +146,13 @@ std::unique_ptr<table> rsplit_re(
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
- * s1 = split_record(s, "[_ ]")
+ * s1 = split_record_re(s, "[_ ]")
  * s1 is a lists column of strings:
  *     [ ["a", "bc", "def", "g"],
  *       ["a", "", "bc"],
  *       ["", "ab", "cd"],
  *       ["ab", "cd", ""] ]
- * s2 = split_record(s, "[ _]", 1)
+ * s2 = split_record_re(s, "[ _]", 1)
  * s2 is a lists column of strings:
  *     [ ["a", "bc def_g"],
  *       ["a", "_bc"],
@@ -177,7 +177,7 @@ std::unique_ptr<column> split_record_re(
 
 /**
  * @brief Splits strings elements into a list column of strings
- * using the given regex pattern to delimit each string.
+ * using the given regex pattern to delimit each string starting from the end of the string.
  *
  * Each element generates a vector of strings that are stored in an output
  * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
@@ -198,13 +198,13 @@ std::unique_ptr<column> split_record_re(
  *
  * @code{.pseudo}
  * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
- * s1 = rsplit_record(s, "[_ ]")
+ * s1 = rsplit_record_re(s, "[_ ]")
  * s1 is a lists column of strings:
  *     [ ["a", "bc", "def", "g"],
  *       ["a", "", "bc"],
  *       ["", "ab", "cd"],
  *       ["ab", "cd", ""] ]
- * s2 = rsplit_record(s, "[ _]", 1)
+ * s2 = rsplit_record_re(s, "[ _]", 1)
  * s2 is a lists column of strings:
  *     [ ["a_bc def", "g"],
  *       ["a_", "bc"],

From 7394e74a0726273bd9de3825cb4a1d30fd034e49 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 14 Feb 2022 10:22:53 -0500
Subject: [PATCH 16/20] add rsplit regex interface

---
 python/cudf/cudf/core/column/string.py | 40 ++++++++++++++++++++------
 python/cudf/cudf/tests/test_string.py  | 15 ++++++++++
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ebfa1a640de..84fc497ef89 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -2300,7 +2300,7 @@ def split(
         Split strings around given separator/delimiter.
 
         Splits the string in the Series/Index from the beginning, at the
-        specified delimiter string. Equivalent to `str.split()
+        specified delimiter string. Similar to `str.split()
         <https://docs.python.org/3/library/stdtypes.html#str.split>`_.
 
         Parameters
@@ -2461,13 +2461,17 @@ def split(
         return self._return_or_inplace(result_table, expand=expand)
 
     def rsplit(
-        self, pat: str = None, n: int = -1, expand: bool = None
+        self,
+        pat: str = None,
+        n: int = -1,
+        expand: bool = None,
+        regex: bool = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
 
         Splits the string in the Series/Index from the end, at the
-        specified delimiter string. Equivalent to `str.rsplit()
+        specified delimiter string. Similar to `str.rsplit()
         <https://docs.python.org/3/library/stdtypes.html#str.rsplit>`_.
 
         Parameters
@@ -2484,6 +2488,13 @@ def rsplit(
               dimensionality.
             * If ``False``, return Series/Index, containing lists
               of strings.
+        regex : bool, default None
+            Determines if the passed-in pattern is a regular expression:
+
+            * If ``True``, assumes the passed-in pattern is a regular
+              expression.
+            * If ``False``, treats the pattern as a literal string.
+            * If ``pat`` has length 1, treats ``pat`` as a literal string.
 
         Returns
         -------
@@ -2598,21 +2609,32 @@ def rsplit(
         if pat is None:
             pat = ""
 
+        if regex and isinstance(pat, re.Pattern):
+            pat = pat.pattern
+
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = cudf.core.frame.Frame({0: self._column.copy()})
             else:
-                data, index = libstrings.rsplit(
-                    self._column, cudf.Scalar(pat, "str"), n
-                )
+                if regex is True:
+                    data, index = libstrings.rsplit_re(self._column, pat, n)
+                else:
+                    data, index = libstrings.rsplit(
+                        self._column, cudf.Scalar(pat, "str"), n
+                    )
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = cudf.core.frame.Frame({})
                 else:
                     result_table = cudf.core.frame.Frame(data, index)
         else:
-            result_table = libstrings.rsplit_record(
-                self._column, cudf.Scalar(pat), n
-            )
+            if regex is True:
+                result_table = libstrings.rsplit_record_re(
+                    self._column, pat, n
+                )
+            else:
+                result_table = libstrings.rsplit_record(
+                    self._column, cudf.Scalar(pat), n
+                )
 
         return self._return_or_inplace(result_table, expand=expand)
 
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 1e7f8bfb7b0..753d8af7c73 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1535,6 +1535,21 @@ def test_strings_rsplit(data, n, expand):
     )
 
 
+@pytest.mark.parametrize(
+    "data", [["a b", " c ", "   d", "e   ", "f"]],
+)
+@pytest.mark.parametrize("n", [-1, 0, 1, 3, 10])
+@pytest.mark.parametrize("expand", [True, False, None])
+def test_string_rsplit_re(data, n, expand):
+    ps = pd.Series(data, dtype="str")
+    gs = cudf.Series(data, dtype="str")
+
+    # Pandas does not support the regex parameter until 1.4.0
+    expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
+    got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True)
+    assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From 3ff72abd8017614afeaecc7e78825aa7b68211d0 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 14 Feb 2022 16:19:33 -0500
Subject: [PATCH 17/20] update copyright header in init.py

---
 python/cudf/cudf/_lib/strings/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 178d96de0e5..fe0710504db 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
 from cudf._lib.nvtext.generate_ngrams import (
     generate_character_ngrams,

From a518415bfd9c4060c61c447addab712688722a19 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 14 Feb 2022 18:25:45 -0500
Subject: [PATCH 18/20] correct copyright year in init.py

---
 python/cudf/cudf/_lib/strings/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index fe0710504db..9fccd61c82d 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
 from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
 from cudf._lib.nvtext.generate_ngrams import (
     generate_character_ngrams,

From 75f20d57c421c2893ef84b942609acf3b0075dc8 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Feb 2022 18:14:27 -0500
Subject: [PATCH 19/20] add PANDAS_LT_140 check in rsplit test

---
 python/cudf/cudf/tests/test_string.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 753d8af7c73..36351bfd1c3 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1545,7 +1545,13 @@ def test_string_rsplit_re(data, n, expand):
     gs = cudf.Series(data, dtype="str")
 
     # Pandas does not support the regex parameter until 1.4.0
-    expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
+    from cudf.core._compat import PANDAS_LT_140
+
+    if PANDAS_LT_140:
+        expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
+    else:
+        expect = ps.str.rsplit(pat="\\s", n=n, regex=True)
+
     got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True)
     assert_eq(expect, got)
 

From ba4c8a214da531e6b57989861ba814dc9fb1132e Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 17 Feb 2022 08:46:54 -0500
Subject: [PATCH 20/20] add inspect.signature check for rsplit regex parameter

---
 python/cudf/cudf/tests/test_string.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 36351bfd1c3..5ee14d8132b 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1535,23 +1535,22 @@ def test_strings_rsplit(data, n, expand):
     )
 
 
-@pytest.mark.parametrize(
-    "data", [["a b", " c ", "   d", "e   ", "f"]],
-)
 @pytest.mark.parametrize("n", [-1, 0, 1, 3, 10])
 @pytest.mark.parametrize("expand", [True, False, None])
-def test_string_rsplit_re(data, n, expand):
+def test_string_rsplit_re(n, expand):
+    data = ["a b", " c ", "   d", "e   ", "f"]
     ps = pd.Series(data, dtype="str")
     gs = cudf.Series(data, dtype="str")
 
-    # Pandas does not support the regex parameter until 1.4.0
-    from cudf.core._compat import PANDAS_LT_140
+    # Pandas does not yet support the regex parameter for rsplit
+    import inspect
 
-    if PANDAS_LT_140:
-        expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
-    else:
-        expect = ps.str.rsplit(pat="\\s", n=n, regex=True)
+    assert (
+        "regex"
+        not in inspect.signature(pd.Series.str.rsplit).parameters.keys()
+    )
 
+    expect = ps.str.rsplit(pat=" ", n=n, expand=expand)
     got = gs.str.rsplit(pat="\\s", n=n, expand=expand, regex=True)
     assert_eq(expect, got)