Skip to content

Commit

Permalink
fix(tokenizer): Remove unnecessary templates on Tiktoken impl methods
Browse files Browse the repository at this point in the history
These were likely needed in the original implementation because the methods
were called with different types for allowed_special, but here they're only
ever used with an Encoder type, so the templates are unnecessary.

pytorch#1251
Branch: TokenizersCpp-1251

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
  • Loading branch information
gabe-l-hart committed Nov 12, 2024
1 parent 2e6f8b6 commit 69a5dd0
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 8 deletions.
6 changes: 2 additions & 4 deletions tokenizer/tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -242,11 +242,10 @@ static std::vector<uint64_t> _byte_pair_encode(
// ------------------------------Util end------------------------------------
// -------------------------private method start-------------------------------

template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
Tiktoken::_split_with_allowed_special_token(
re2::StringPiece& input,
const T& allowed_special) const {
const Encoder& allowed_special) const {
if (!_special_token_regex) {
return std::make_pair(std::nullopt, input);
}
Expand Down Expand Up @@ -293,10 +292,9 @@ void Tiktoken::_encode(
}
}

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t> Tiktoken::_encode_with_special_token(
const std::string& text,
const T& allowed_special) const {
const Encoder& allowed_special) const {
std::vector<uint64_t> tokens;
uint64_t last_piece_token_len = 0;
re2::StringPiece input(text);
Expand Down
6 changes: 2 additions & 4 deletions tokenizer/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,19 @@ class Tiktoken : public Tokenizer {
return special_tokens;
}

template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
_split_with_allowed_special_token(
re2::StringPiece& input,
const T& allowed_special) const;
const Encoder& allowed_special) const;

void _encode(
re2::StringPiece& input,
std::vector<uint64_t>& ret,
uint64_t& last_piece_token_len) const;

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t> _encode_with_special_token(
const std::string& text,
const T& allowed_special) const;
const Encoder& allowed_special) const;

// Removed negative lookahead \s+(?!\S) since it's not supported by RE2.
const std::string _pattern =
Expand Down

0 comments on commit 69a5dd0

Please sign in to comment.