fix(tokenizers): Use pass-by-value for string views

Best Practices: https://abseil.io/docs/cpp/guides/strings#string_view

pytorch#1251
Branch: TokenizersCpp-1251

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
gabe-l-hart · Nov 15, 2024 · 34bea83 · 34bea83
1 parent 59a2bdb
commit 34bea83
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 8 deletions.
diff --git a/tokenizer/pre_tokenizer.cpp b/tokenizer/pre_tokenizer.cpp
@@ -21,7 +21,7 @@ RegexPreTokenizer::create_regex_(const std::string& pattern) {
   return std::make_unique<re2::RE2>("(" + pattern + ")");
 }
 
-std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece& input) const {
+std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece input) const {
   std::vector<std::string> result;
   std::string piece;
   while (RE2::FindAndConsume(&input, *regex_, &piece)) {
@@ -30,7 +30,7 @@ std::vector<std::string> RegexPreTokenizer::pre_tokenize(re2::StringPiece& input
   return result;
 }
 
-// ByteLevelPreTokenizer /////////////////////////////////////////////
+// ByteLevelPreTokenizer ///////////////////////////////////////////////////////
 
 //////////////////
 // Impl Details //
@@ -55,7 +55,7 @@ ByteLevelPreTokenizer::ByteLevelPreTokenizer(
 {}
 
 std::vector<std::string>
-ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece& input) const {
+ByteLevelPreTokenizer::pre_tokenize(re2::StringPiece input) const {
 
   // Add the prefix space if configured to do so
   std::string input_str(input);

diff --git a/tokenizer/pre_tokenizer.h b/tokenizer/pre_tokenizer.h
@@ -30,8 +30,11 @@ class PreTokenizer {
    * This pre-tokenization may result in sub-pieces that are not contained
    * within the original input, therefore the resulting pieces will be owned by
    * the caller.
+   *
+   * NOTE: The input view is passed by value, following the string_view best
+   *  practice described at
+   *  https://abseil.io/docs/cpp/guides/strings#string_view
    */
-  virtual std::vector<std::string> pre_tokenize(re2::StringPiece& input) const = 0;
+  virtual std::vector<std::string> pre_tokenize(re2::StringPiece input) const = 0;
 };  // end class PreTokenizer
 
 // -- Regex --------------------------------------------------------------------
@@ -48,7 +51,7 @@ class RegexPreTokenizer : public PreTokenizer {
   {}
 
   /** Pre-tokenize with the stored regex */
-  std::vector<std::string> pre_tokenize(re2::StringPiece& input) const;
+  std::vector<std::string> pre_tokenize(re2::StringPiece input) const;
 
  protected:
   static Re2UPtr create_regex_(const std::string& pattern);
@@ -76,16 +79,14 @@ class ByteLevelPreTokenizer : public PreTokenizer {
  public:
 
   /**
-   * Construct with matching rust implementation
-   *
    * @param add_prefix_space: Whether to add a leading space to the first word
    * @param pattern: A user-supplied regex to use for token splitting. If not
    *    provided, the standard GPT2 pattern is used.
    */
   ByteLevelPreTokenizer(bool add_prefix_space = true, const std::string& pattern = "");
 
   /** Perform pre-tokenization */
-  std::vector<std::string> pre_tokenize(re2::StringPiece& input) const override;
+  std::vector<std::string> pre_tokenize(re2::StringPiece input) const override;
 
  private: