Skip to content

Commit

Permalink
refactor(tokenizer): Add const qualifiers to const methods in tokenizer classes
Browse files Browse the repository at this point in the history

pytorch#1251
Branch: TokenizersCpp-1251

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
  • Loading branch information
gabe-l-hart committed Oct 4, 2024
1 parent cd2d131 commit 4e5a279
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 17 deletions.
4 changes: 2 additions & 2 deletions tokenizer/sentencepiece.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ SPTokenizer::~SPTokenizer() {}
* @return std::string A pointer to the string representation of the
* token.
*/
std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) {
std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) const {
if (!initialized_) {
fprintf(stderr, "Tokenizer not initialized\n");
exit(EXIT_FAILURE);
Expand Down Expand Up @@ -94,7 +94,7 @@ std::string SPTokenizer::decode(uint64_t prev_token, uint64_t token) {
* @return std::vector<uint64_t>
*/
std::vector<uint64_t>
SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
SPTokenizer::encode(const std::string& text, int8_t bos, int8_t eos) const {
if (!initialized_) {
fprintf(stderr, "Tokenizer not initialized\n");
exit(EXIT_FAILURE);
Expand Down
10 changes: 5 additions & 5 deletions tokenizer/tiktoken.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ template <typename T>
std::pair<std::optional<std::string>, re2::StringPiece>
Tiktoken::_split_with_allowed_special_token(
re2::StringPiece& input,
const T& allowed_special) {
const T& allowed_special) const {
if (!_special_token_regex) {
return std::make_pair(std::nullopt, input);
}
Expand All @@ -273,7 +273,7 @@ Tiktoken::_split_with_allowed_special_token(
void Tiktoken::_encode(
re2::StringPiece& input,
std::vector<uint64_t>& ret,
uint64_t& last_piece_token_len) {
uint64_t& last_piece_token_len) const {
std::string piece;
assert(_regex);
while (re2::RE2::FindAndConsume(&input, *_regex, &piece)) {
Expand All @@ -292,7 +292,7 @@ void Tiktoken::_encode(
template <typename T>
std::pair<std::vector<uint64_t>, uint64_t> Tiktoken::_encode_with_special_token(
const std::string& text,
const T& allowed_special) {
const T& allowed_special) const {
std::vector<uint64_t> tokens;
uint64_t last_piece_token_len = 0;
re2::StringPiece input(text);
Expand Down Expand Up @@ -349,7 +349,7 @@ void Tiktoken::load(const std::string& path) {
}

std::vector<uint64_t>
Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) {
Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) const {
if (!initialized_) {
exit(EXIT_FAILURE);
}
Expand All @@ -363,7 +363,7 @@ Tiktoken::encode(const std::string& text, int8_t bos, int8_t eos) {
return res;
}

std::string Tiktoken::decode(uint64_t prev, uint64_t cur) {
std::string Tiktoken::decode(uint64_t prev, uint64_t cur) const {
(void)prev;
if (!initialized_) {
exit(EXIT_FAILURE);
Expand Down
20 changes: 10 additions & 10 deletions tokenizer/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ class Tokenizer {
virtual void load(const std::string& tokenizer_path) = 0;

virtual std::vector<uint64_t>
encode(const std::string& input, int8_t bos, int8_t eos) = 0;
encode(const std::string& input, int8_t bos, int8_t eos) const = 0;

virtual std::string decode(uint64_t prev_token, uint64_t token) = 0;
virtual std::string decode(uint64_t prev_token, uint64_t token) const = 0;

// getters
int32_t vocab_size() const {
Expand Down Expand Up @@ -70,9 +70,9 @@ class SPTokenizer : public Tokenizer {
void load(const std::string& tokenizer_path) override;

std::vector<uint64_t> encode(const std::string& input, int8_t bos, int8_t eos)
override;
const override;

std::string decode(uint64_t prev_token, uint64_t token) override;
std::string decode(uint64_t prev_token, uint64_t token) const override;

private:
std::unique_ptr<sentencepiece::SentencePieceProcessor> _processor;
Expand All @@ -90,12 +90,12 @@ class Tiktoken : public Tokenizer {
explicit Tiktoken();
~Tiktoken(){};

void load(const std::string& tokenizer_path);
void load(const std::string& tokenizer_path) override;

std::vector<uint64_t>
encode(const std::string& input, int8_t bos, int8_t eos);
encode(const std::string& input, int8_t bos, int8_t eos) const override;

std::string decode(uint64_t prev_token, uint64_t token);
std::string decode(uint64_t prev_token, uint64_t token) const override;

private:
static inline const Encoder _get_special_tokens(ssize_t num_base_tokens) {
Expand All @@ -122,17 +122,17 @@ class Tiktoken : public Tokenizer {
std::pair<std::optional<std::string>, re2::StringPiece>
_split_with_allowed_special_token(
re2::StringPiece& input,
const T& allowed_special);
const T& allowed_special) const;

void _encode(
re2::StringPiece& input,
std::vector<uint64_t>& ret,
uint64_t& last_piece_token_len);
uint64_t& last_piece_token_len) const;

template <typename T>
std::pair<std::vector<uint64_t>, uint64_t> _encode_with_special_token(
const std::string& text,
const T& allowed_special);
const T& allowed_special) const;

// Removed negative lookahead \s+(?!\S) since it's not supported by RE2.
const std::string _pattern =
Expand Down

0 comments on commit 4e5a279

Please sign in to comment.