Skip to content

Commit

Permalink
fix(tokenizers): Use pass-by-value for string views
Browse files Browse the repository at this point in the history
Best Practices: https://abseil.io/docs/cpp/guides/strings#string_view

pytorch#1251
Branch: TokenizersCpp-1251

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
  • Loading branch information
gabe-l-hart committed Nov 15, 2024
1 parent 59a2bdb commit 34bea83
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 8 deletions.
6 changes: 3 additions & 3 deletions tokenizer/pre_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ RegexPreTokenizer::create_regex_(const std::string& pattern) {
return std::make_unique<re2::RE2>("(" + pattern + ")");
}

std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece& input) const {
std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece input) const {
std::vector<std::string> result;
std::string piece;
while (RE2::FindAndConsume(&input, *regex_, &piece)) {
Expand All @@ -30,7 +30,7 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece& input
return result;
}

// ByteLevelPreTokenizer /////////////////////////////////////////////
// ByteLevelPreTokenizer ///////////////////////////////////////////////////////

//////////////////
// Impl Details //
Expand All @@ -55,7 +55,7 @@ ByteLevelPreTokenizer::ByteLevelPreTokenizer(
{}

std::vector<std::string>
ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece& input) const {
ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece input) const {

// Add the prefix space if configured to do so
std::string input_str(input);
Expand Down
11 changes: 6 additions & 5 deletions tokenizer/pre_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,11 @@ class PreTokenizer {
* This pre-tokenization may result in sub-pieces that are not contained
* within the original input, therefore the resulting pieces will be owned by
* the caller.
*
* NOTE: Pass by value per best practice
* https://abseil.io/docs/cpp/guides/strings#string_view
*/
virtual std::vector<std::string> pre_tokenize(re2::StringPiece& input) const = 0;
virtual std::vector<std::string> pre_tokenize(re2::StringPiece input) const = 0;
}; // end class PreTokenizer

// -- Regex --------------------------------------------------------------------
Expand All @@ -48,7 +51,7 @@ class RegexPreTokenizer : public PreTokenizer {
{}

/** Pre-tokenize with the stored regex */
std::vector<std::string> pre_tokenize(re2::StringPiece& input) const;
std::vector<std::string> pre_tokenize(re2::StringPiece input) const;

protected:
static Re2UPtr create_regex_(const std::string& pattern);
Expand Down Expand Up @@ -76,16 +79,14 @@ class ByteLevelPreTokenizer : public PreTokenizer {
public:

/**
* Construct with matching rust implementation
*
* @param add_prefix_space: Whether to add a leading space to the first word
* @param pattern: A user-supplied regex to use for token splitting. If not
* provided, it uses the standard GPT2 pattern.
*/
ByteLevelPreTokenizer(bool add_prefix_space = true, const std::string& pattern = "");

/** Perform pre-tokenization */
std::vector<std::string> pre_tokenize(re2::StringPiece& input) const override;
std::vector<std::string> pre_tokenize(re2::StringPiece input) const override;

private:

Expand Down

0 comments on commit 34bea83

Please sign in to comment.