[TEST] use gguf tokenizer #12757

Draft · wants to merge 4 commits into base: main

CMakeLists.txt
@@ -18,17 +18,28 @@ endif()
add_library(npu_llm STATIC IMPORTED)
set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)

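# Import the prebuilt llama.cpp static libraries (llama, common, ggml) from LIBRARY_DIR, matching the npu_llm import above.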
add_library(llama STATIC IMPORTED)
set_target_properties(llama PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/llama.lib)

add_library(common STATIC IMPORTED)
set_target_properties(common PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/common.lib)

add_library(ggml STATIC IMPORTED)
set_target_properties(ggml PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/ggml.lib)

set(TARGET llama-cli-npu)
add_executable(${TARGET} llama-cli-npu.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE npu_llm)
target_link_libraries(${TARGET} PRIVATE npu_llm common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

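# Copy the runtime DLLs next to the built binary so it can be run from build/Release without extra PATH setup.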
add_custom_command(TARGET llama-cli-npu POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${LIBRARY_DIR}/npu_llm.dll
${LIBRARY_DIR}/llama.dll
${LIBRARY_DIR}/ggml.dll
${CMAKE_BINARY_DIR}/Release/
COMMENT "Copying npu_llm.dll to build/Release\n"
COMMENT "Copying npu_llm.dll llama.dll ggml.dll to build/Release\n"
)

add_custom_command(TARGET llama-cli-npu POST_BUILD
…

llama-cli-npu.cpp
@@ -22,6 +22,23 @@
#include "common.h"
#include "npu_llm.h"

#include "llamacpp/arg.h"
#include "llamacpp/common.h"
#include "llamacpp/log.h"
#include "llamacpp/llama.h"
#include <filesystem>
#include <vector>
#include <iostream>


struct gguf_tokenizer_params {
    llama_context * ctx;     // vocab-only llama.cpp context used for (de)tokenization
    int32_t bos_token_id;    // BOS token id read from the gguf vocab
    int32_t eos_token_id;    // EOS token id read from the gguf vocab
    bool add_bos;            // prepend BOS when tokenizing
    bool parse_special;      // parse special/control tokens in the text
};


static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
@@ -30,6 +47,20 @@ static void print_usage(int, char ** argv) {
}


std::vector<int32_t> gguf_tokenize(const std::string & prompt,
                                   gguf_tokenizer_params tok_params) {
    // the llama_tokenize common helper sizes the output buffer internally
    std::vector<int32_t> ids = llama_tokenize(tok_params.ctx, prompt,
                                              tok_params.add_bos, tok_params.parse_special);
    return ids;
}

std::string gguf_decode(const std::vector<int32_t> & tokens, gguf_tokenizer_params tok_params) {
    std::string output = llama_detokenize(tok_params.ctx, tokens, tok_params.parse_special);
    return output;
}
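
// Illustrative round trip (a sketch, not part of this patch): given a populated
// gguf_tokenizer_params, the two helpers above are meant to compose as
//
//   std::vector<int32_t> ids = gguf_tokenize("hello world", tok_params);
//   std::string text = gguf_decode(ids, tok_params);
//
// where text reproduces the prompt (modulo BOS/special-token handling).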


const std::string llama2_template = "<s>[INST] <<SYS>>\n\n<</SYS>>\n\n%s [/INST]";
const std::string llama3_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n";
const std::string minicpm_template = "<用户>%s<AI>";
@@ -99,7 +130,7 @@ std::string add_chat_history(npu_model_params model_params,
}

std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_inp_size,
npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params){
npu_model_params model_params, gguf_tokenizer_params tok_params, npu_generation_params generation_params){
float* logits = run_prefill(void_model, embd_inp_ptr, embd_inp_size,
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
@@ -112,15 +143,15 @@ std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_i
auto logits = run_decode(void_model, embd[i-1],
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
if (std::find(tok_params.eos_token_id.begin(), tok_params.eos_token_id.end(), token) == tok_params.eos_token_id.end()){
if (tok_params.eos_token_id != token){
embd.push_back(token);
token_nums ++;
} else {
break;
}
}

std::string output = llm_decode(embd);
std::string output = gguf_decode(embd, tok_params);

return output;
}
@@ -131,6 +162,10 @@ int main(int argc, char ** argv) {

// path to the npu model directory
char* model_dir;
// path to the gguf model (used only for tokenization)
// TODO: remove the hard-coded path
std::string path = "D:\\binbin\\Llama-3.2-3B-Instruct-Q4_1.gguf";
const char * gguf_path = path.c_str();
// prompt to generate text from
std::string prompt = "AI是什么?";
// number of tokens to predict
@@ -187,13 +222,32 @@
params.model = model_dir;
params.prompt = prompt;

// gguf tokenizer: load the gguf model with vocab only, just for (de)tokenization
llama_backend_init();
llama_model_params gguf_model_params = llama_model_default_params();
gguf_model_params.vocab_only = true;
llama_model * gguf_model = llama_load_model_from_file(gguf_path, gguf_model_params);
if (!gguf_model) {
    fprintf(stderr, "Error: could not load model from file '%s'.\n", gguf_path);
    return 1;
}
auto cparams = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(gguf_model, cparams);

gguf_tokenizer_params tok_params;
tok_params.ctx = ctx;
tok_params.bos_token_id = llama_token_bos(gguf_model);
tok_params.eos_token_id = llama_token_eos(gguf_model);
tok_params.add_bos = llama_add_bos_token(gguf_model);
tok_params.parse_special = true;
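
// Cleanup sketch (not in this patch): the vocab-only model and context are never
// released. Assuming the llama.cpp API used above, the matching teardown before
// exit would be:
//   llama_free(ctx);
//   llama_free_model(gguf_model);
//   llama_backend_free();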

// npu_model_params model_params;
void* model = load_model_from_file(params.model);
npu_model_params model_params;
load_config_from_file(model_params, params.model);

tokenizer_params tok_params;
load_tokenizer(tok_params, params.model);
// tokenizer_params tok_params;
// load_tokenizer(tok_params, params.model);

npu_generation_params generation_params;
load_generation_config_from_file(generation_params, params.model);
@@ -214,11 +268,11 @@
std::string full_prompt = add_chat_history(model_params, prompt, history, true);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);
if (embd_inp.size() > model_params.max_prompt_len){
// empty chat history
full_prompt = add_chat_history(model_params, prompt, "", true);
embd_inp = llm_tokenize(full_prompt, false);
embd_inp = gguf_tokenize(full_prompt, tok_params);
}

generation_params.max_new_token = model_params.kv_len - embd_inp.size();
@@ -239,7 +293,7 @@
std::string full_prompt = add_chat_template(model_params, params.prompt);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);

// single text generation
std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
…