Skip to content

Commit

Permalink
feat(tokenizer): Add stub of TokenizersTokenizer
Browse files Browse the repository at this point in the history
pytorch#1251
Branch: TokenizersCpp-1251

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
  • Loading branch information
gabe-l-hart committed Oct 8, 2024
1 parent 6ddb7e3 commit 4b407a3
Show file tree
Hide file tree
Showing 2 changed files with 443 additions and 0 deletions.
48 changes: 48 additions & 0 deletions tokenizer/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,51 @@ class Tiktoken : public Tokenizer {
Re2UPtr _regex;
Re2UPtr _special_token_regex;
};


// ----------------------- Tokenizers -----------------------
// Used by many Hugging Face models. Adapted from a combination of the original
// Rust implementation (https://github.com/huggingface/tokenizers/tree/main)
// and the corresponding support in llama.cpp
// (https://github.com/ggerganov/llama.cpp).

class TokenizersTokenizer : public Tokenizer {
 public:
  /*-- Public Interface --*/

  /**
   * Default initialize with no loaded data. Use load() to read the tokenizer
   * data from disk.
   */
  explicit TokenizersTokenizer();

  /**
   * Nothing to release by hand: every member declared in this class is
   * cleaned up by its own destructor.
   */
  ~TokenizersTokenizer() = default;

  /**
   * Load the tokenizer model data into this instance.
   *
   * @param tokenizer_path Path to the tokenizer data to load.
   *        NOTE(review): presumably a Hugging Face tokenizer file -- confirm
   *        the expected format against the implementation.
   */
  void load(const std::string& tokenizer_path) override;

  /**
   * Encode the input string as a list of token IDs.
   *
   * @param input Text to tokenize.
   * @param bos   Beginning-of-sequence control value.
   * @param eos   End-of-sequence control value.
   *        NOTE(review): the exact meaning of bos/eos (flag vs. count of
   *        tokens to add) is defined by the Tokenizer base class -- confirm
   *        there.
   * @return The token IDs produced for the input.
   */
  std::vector<uint64_t>
  encode(const std::string& input, int8_t bos, int8_t eos) const override;

  /**
   * Decode a single token ID into a string.
   *
   * @param prev_token The token ID that preceded `token`.
   *        NOTE(review): presumably used for context-dependent decoding --
   *        confirm against the implementation.
   * @param token      The token ID to decode.
   * @return The decoded string for `token`.
   */
  std::string decode(uint64_t prev_token, uint64_t token) const override;

 private:
  /*-- Private Methods --*/

  /*-- Private Members --*/

  // Sequential regex patterns to evaluate
  std::vector<std::string> _patterns;

  // Forward mappings (token -> ID) for regular and special tokens
  Encoder _encoder;
  Encoder _special_token_encoder;

  // Backward mappings (ID -> token) for regular and special tokens
  Decoder _decoder;
  Decoder _special_token_decoder;
};
Loading

0 comments on commit 4b407a3

Please sign in to comment.