Skip to content

Commit

Permalink
Cleanup libcudf strings regex classes (#10573)
Browse files Browse the repository at this point in the history
Refactors some of the internal libcudf regex classes used for executing regex on strings. This is the first part of a set of changes to reduce kernel memory launch size for the regex code. A follow-on PR will change the stack-based state management to a device-memory approach. The changes here are kept isolated to ease the review process in the next PR. Mostly, code has been moved or refactored, along with general cleanup such as adding consts and removing some unnecessary pass-by-reference/pointer parameters.

None of the calling routines currently require changes and no behavior has changed.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #10573
  • Loading branch information
davidwendt authored Apr 14, 2022
1 parent 22a6679 commit ac27757
Show file tree
Hide file tree
Showing 5 changed files with 313 additions and 317 deletions.
37 changes: 35 additions & 2 deletions cpp/src/strings/regex/regcomp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <strings/regex/regcomp.h>

#include <cudf/strings/detail/utf8.hpp>
#include <cudf/utilities/error.hpp>

#include <algorithm>
Expand Down Expand Up @@ -58,6 +59,37 @@ const std::array<char, 33> escapable_chars{
{'.', '-', '+', '*', '\\', '?', '^', '$', '|', '{', '}', '(', ')', '[', ']', '<', '>',
'"', '~', '\'', '`', '_', '@', '=', ';', ':', '!', '#', '%', '&', ',', '/', ' '}};

/**
* @brief Converts UTF-8 string into fixed-width 32-bit character vector.
*
* No character conversion occurs.
* Each UTF-8 character is promoted into a 32-bit value.
* The last entry in the returned vector will be a 0 value.
* The fixed-width vector makes it easier to compile and faster to execute.
*
* @param pattern Regular expression encoded with UTF-8.
* @return Fixed-width 32-bit character vector.
*/
std::vector<char32_t> string_to_char32_vector(std::string_view pattern)
{
size_type size = static_cast<size_type>(pattern.size());
size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) {
return is_begin_utf8_char(static_cast<uint8_t>(ch));
});
std::vector<char32_t> result(count + 1);
char32_t* output_ptr = result.data();
const char* input_ptr = pattern.data();
for (size_type idx = 0; idx < size; ++idx) {
char_utf8 output_character = 0;
size_type ch_width = to_char_utf8(input_ptr, output_character);
input_ptr += ch_width;
idx += ch_width - 1;
*output_ptr++ = output_character;
}
result[count] = 0; // last entry set to 0
return result;
}

} // namespace

int32_t reprog::add_inst(int32_t t)
Expand Down Expand Up @@ -838,10 +870,11 @@ class regex_compiler {
};

// Convert pattern into program
reprog reprog::create_from(const char32_t* pattern, regex_flags const flags)
reprog reprog::create_from(std::string_view pattern, regex_flags const flags)
{
reprog rtn;
regex_compiler compiler(pattern, flags, rtn);
auto pattern32 = string_to_char32_vector(pattern);
regex_compiler compiler(pattern32.data(), flags, rtn);
// for debugging, it can be helpful to call rtn.print(flags) here to dump
// out the instructions that have been created from the given pattern
return rtn;
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/regex/regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class reprog {
* @brief Parses the given regex pattern and compiles
* into a list of chained instructions.
*/
static reprog create_from(const char32_t* pattern, regex_flags const flags);
static reprog create_from(std::string_view pattern, regex_flags const flags);

int32_t add_inst(int32_t type);
int32_t add_inst(reinst inst);
Expand Down
109 changes: 59 additions & 50 deletions cpp/src/strings/regex/regex.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
#include <thrust/optional.h>
#include <thrust/pair.h>

#include <functional>
#include <memory>

namespace cudf {
Expand All @@ -35,9 +34,7 @@ class string_view;
namespace strings {
namespace detail {

struct reljunk;
struct reinst;
class reprog;
struct relist;

using match_pair = thrust::pair<cudf::size_type, cudf::size_type>;
using match_result = thrust::optional<match_pair>;
Expand Down Expand Up @@ -65,19 +62,18 @@ constexpr int32_t RX_LARGE_INSTS = (RX_STACK_LARGE / 11);
*
* This class holds the unique data for any regex CCLASS instruction.
*/
class reclass_device {
public:
struct alignas(16) reclass_device {
int32_t builtins{};
int32_t count{};
char32_t* literals{};
char32_t const* literals{};

__device__ bool is_match(char32_t ch, const uint8_t* flags);
__device__ inline bool is_match(char32_t const ch, uint8_t const* flags) const;
};

/**
* @brief Regex program of instructions/data for a specific regex pattern.
*
* Once create, this find/extract methods are used to evaluating the regex instructions
* Once created, the find/extract methods are used to evaluate the regex instructions
* against a single string.
*/
class reprog_device {
Expand Down Expand Up @@ -132,15 +128,7 @@ class reprog_device {
/**
* @brief Returns the number of regex instructions.
*/
[[nodiscard]] __host__ __device__ int32_t insts_counts() const { return _insts_count; }

/**
* @brief Returns true if this is an empty program.
*/
[[nodiscard]] __device__ bool is_empty() const
{
return insts_counts() == 0 || get_inst(0)->type == END;
}
[[nodiscard]] CUDF_HOST_DEVICE int32_t insts_counts() const { return _insts_count; }

/**
* @brief Returns the number of regex groups found in the expression.
Expand All @@ -151,19 +139,9 @@ class reprog_device {
}

/**
* @brief Returns the regex instruction object for a given index.
*/
[[nodiscard]] __device__ inline reinst* get_inst(int32_t idx) const;

/**
* @brief Returns the regex class object for a given index.
*/
[[nodiscard]] __device__ inline reclass_device get_class(int32_t idx) const;

/**
* @brief Returns the start-instruction-ids vector.
* @brief Returns true if this is an empty program.
*/
[[nodiscard]] __device__ inline int32_t* startinst_ids() const;
[[nodiscard]] __device__ inline bool is_empty() const;

/**
* @brief Does a find evaluation using the compiled expression on the given string.
Expand All @@ -180,9 +158,9 @@ class reprog_device {
*/
template <int stack_size>
__device__ inline int32_t find(int32_t idx,
string_view const& d_str,
int32_t& begin,
int32_t& end);
string_view const d_str,
cudf::size_type& begin,
cudf::size_type& end) const;

/**
* @brief Does an extract evaluation using the compiled expression on the given string.
Expand All @@ -192,8 +170,8 @@ class reprog_device {
* the matched section.
*
* @tparam stack_size One of the `RX_STACK_` values based on the `insts_count`.
* @param idx The string index used for mapping the state memory for this string in global memory
* (if necessary).
* @param idx The string index used for mapping the state memory for this string in global
* memory (if necessary).
* @param d_str The string to search.
* @param begin Position index to begin the search. If found, returns the position found
* in the string.
Expand All @@ -204,34 +182,65 @@ class reprog_device {
*/
template <int stack_size>
__device__ inline match_result extract(cudf::size_type idx,
string_view const& d_str,
string_view const d_str,
cudf::size_type begin,
cudf::size_type end,
cudf::size_type group_id);
cudf::size_type const group_id) const;

private:
int32_t _startinst_id, _num_capturing_groups;
int32_t _insts_count, _starts_count, _classes_count;
const uint8_t* _codepoint_flags{}; // table of character types
reinst* _insts{}; // array of regex instructions
int32_t* _startinst_ids{}; // array of start instruction ids
reclass_device* _classes{}; // array of regex classes
void* _relists_mem{}; // runtime relist memory for regexec
/**
 * @brief State bundle handed to regexec(): two relist pointers plus a start
 * type and a start character.
 *
 * Only declarations are visible here; the constructor and swaplist() are
 * defined elsewhere.
 */
struct reljunk {
// Two __restrict__-qualified relist pointers.
// NOTE(review): presumably the two state lists that swaplist() exchanges --
// relist is only forward-declared here, confirm against its definition.
relist* __restrict__ list1;
relist* __restrict__ list2;
// Both default to zero through their `{}` member initializers.
// NOTE(review): presumably set by the constructor from its `inst` argument -- confirm.
int32_t starttype{};
char32_t startchar{};

// Builds the bundle from two relist pointers and a reinst passed by value.
__device__ inline reljunk(relist* list1, relist* list2, reinst const inst);
// Takes no arguments and returns nothing.
// NOTE(review): the name suggests it swaps list1 and list2 -- verify in the definition.
__device__ inline void swaplist();
};

/**
* @brief Returns the regex instruction object for a given id.
*/
__device__ inline reinst get_inst(int32_t id) const;

/**
* @brief Returns the regex class object for a given id.
*/
__device__ inline reclass_device get_class(int32_t id) const;

/**
* @brief Executes the regex pattern on the given string.
*/
__device__ inline int32_t regexec(
string_view const& d_str, reljunk& jnk, int32_t& begin, int32_t& end, int32_t group_id = 0);
__device__ inline int32_t regexec(string_view const d_str,
reljunk jnk,
cudf::size_type& begin,
cudf::size_type& end,
cudf::size_type const group_id = 0) const;

/**
* @brief Utility wrapper to set up state memory structures for calling regexec
*/
template <int stack_size>
__device__ inline int32_t call_regexec(
int32_t idx, string_view const& d_str, int32_t& begin, int32_t& end, int32_t group_id = 0);

reprog_device(reprog&); // must use create()
__device__ inline int32_t call_regexec(int32_t idx,
string_view const d_str,
cudf::size_type& begin,
cudf::size_type& end,
cudf::size_type const group_id = 0) const;

reprog_device(reprog&);

int32_t _startinst_id; // first instruction id
int32_t _num_capturing_groups; // instruction groups
int32_t _insts_count; // number of instructions
int32_t _starts_count; // number of start-insts ids
int32_t _classes_count; // number of classes

uint8_t const* _codepoint_flags{}; // table of character types
reinst const* _insts{}; // array of regex instructions
int32_t const* _startinst_ids{}; // array of start instruction ids
reclass_device const* _classes{}; // array of regex classes

void* _relists_mem{}; // runtime relist memory for regexec()
};

} // namespace detail
Expand Down
Loading

0 comments on commit ac27757

Please sign in to comment.