[TEST] use gguf tokenizer #12757

Draft · wants to merge 4 commits into base: main

CMakeLists.txt
@@ -18,17 +18,28 @@ endif()
add_library(npu_llm STATIC IMPORTED)
set_target_properties(npu_llm PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/npu_llm.lib)

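# Import the prebuilt llama.cpp static libraries (llama, common, ggml) from LIBRARY_DIR, matching the npu_llm import above.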
add_library(llama STATIC IMPORTED)
set_target_properties(llama PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/llama.lib)

add_library(common STATIC IMPORTED)
set_target_properties(common PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/common.lib)

add_library(ggml STATIC IMPORTED)
set_target_properties(ggml PROPERTIES IMPORTED_LOCATION ${LIBRARY_DIR}/ggml.lib)

set(TARGET llama-cli-npu)
add_executable(${TARGET} llama-cli-npu.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE npu_llm)
target_link_libraries(${TARGET} PRIVATE npu_llm common llama ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

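# Copy the runtime DLLs next to the built binary so it can be run from build/Release without extra PATH setup.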
add_custom_command(TARGET llama-cli-npu POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${LIBRARY_DIR}/npu_llm.dll
${LIBRARY_DIR}/llama.dll
${LIBRARY_DIR}/ggml.dll
${CMAKE_BINARY_DIR}/Release/
COMMENT "Copying npu_llm.dll to build/Release\n"
COMMENT "Copying npu_llm.dll llama.dll ggml.dll to build/Release\n"
)

add_custom_command(TARGET llama-cli-npu POST_BUILD
…

llama-cli-npu.cpp
@@ -22,6 +22,23 @@
#include "common.h"
#include "npu_llm.h"

#include "llamacpp/arg.h"
#include "llamacpp/common.h"
#include "llamacpp/log.h"
#include "llamacpp/llama.h"
#include <filesystem>
#include <vector>
#include <iostream>


struct gguf_tokenizer_params {
    llama_context * ctx;     // vocab-only llama.cpp context used for (de)tokenization
    int32_t bos_token_id;    // BOS token id read from the gguf vocab
    int32_t eos_token_id;    // EOS token id read from the gguf vocab
    bool add_bos;            // prepend BOS when tokenizing
    bool parse_special;      // parse special/control tokens in the text
};


static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
@@ -30,6 +47,20 @@ static void print_usage(int, char ** argv) {
}


std::vector<int32_t> gguf_tokenize(const std::string & prompt,
                                   gguf_tokenizer_params tok_params) {
    // the llama_tokenize common helper sizes the output buffer internally
    std::vector<int32_t> ids = llama_tokenize(tok_params.ctx, prompt,
                                              tok_params.add_bos, tok_params.parse_special);
    return ids;
}

std::string gguf_decode(const std::vector<int32_t> & tokens, gguf_tokenizer_params tok_params) {
    std::string output = llama_detokenize(tok_params.ctx, tokens, tok_params.parse_special);
    return output;
}
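
// Illustrative round trip (a sketch, not part of this patch): given a populated
// gguf_tokenizer_params, the two helpers above are meant to compose as
//
//   std::vector<int32_t> ids = gguf_tokenize("hello world", tok_params);
//   std::string text = gguf_decode(ids, tok_params);
//
// where text reproduces the prompt (modulo BOS/special-token handling).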


const std::string llama2_template = "<s>[INST] <<SYS>>\n\n<</SYS>>\n\n%s [/INST]";
const std::string llama3_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n";
const std::string minicpm_template = "<用户>%s<AI>";
@@ -99,7 +130,7 @@ std::string add_chat_history(npu_model_params model_params,
}

std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_inp_size,
npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params){
npu_model_params model_params, gguf_tokenizer_params tok_params, npu_generation_params generation_params){
float* logits = run_prefill(void_model, embd_inp_ptr, embd_inp_size,
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
@@ -112,15 +143,15 @@ std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_i
auto logits = run_decode(void_model, embd[i-1],
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
if (std::find(tok_params.eos_token_id.begin(), tok_params.eos_token_id.end(), token) == tok_params.eos_token_id.end()){
if (tok_params.eos_token_id != token){
embd.push_back(token);
token_nums ++;
} else {
break;
}
}

std::string output = llm_decode(embd);
std::string output = gguf_decode(embd, tok_params);

return output;
}
@@ -131,6 +162,10 @@ int main(int argc, char ** argv) {

// path to the npu model directory
char* model_dir;
// path to the gguf model (used only for tokenization)
// TODO: remove the hard-coded path
std::string path = "D:\\binbin\\Llama-3.2-3B-Instruct-Q4_1.gguf";
const char * gguf_path = path.c_str();
// prompt to generate text from
std::string prompt = "AI是什么?";
// number of tokens to predict
@@ -187,13 +222,32 @@
params.model = model_dir;
params.prompt = prompt;

// gguf tokenizer: load the gguf model with vocab only, just for (de)tokenization
llama_backend_init();
llama_model_params gguf_model_params = llama_model_default_params();
gguf_model_params.vocab_only = true;
llama_model * gguf_model = llama_load_model_from_file(gguf_path, gguf_model_params);
if (!gguf_model) {
    fprintf(stderr, "Error: could not load model from file '%s'.\n", gguf_path);
    return 1;
}
auto cparams = llama_context_default_params();
llama_context * ctx = llama_new_context_with_model(gguf_model, cparams);

gguf_tokenizer_params tok_params;
tok_params.ctx = ctx;
tok_params.bos_token_id = llama_token_bos(gguf_model);
tok_params.eos_token_id = llama_token_eos(gguf_model);
tok_params.add_bos = llama_add_bos_token(gguf_model);
tok_params.parse_special = true;
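
// Cleanup sketch (not in this patch): the vocab-only model and context are never
// released. Assuming the llama.cpp API used above, the matching teardown before
// exit would be:
//   llama_free(ctx);
//   llama_free_model(gguf_model);
//   llama_backend_free();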

// npu_model_params model_params;
void* model = load_model_from_file(params.model);
npu_model_params model_params;
load_config_from_file(model_params, params.model);

tokenizer_params tok_params;
load_tokenizer(tok_params, params.model);
// tokenizer_params tok_params;
// load_tokenizer(tok_params, params.model);

npu_generation_params generation_params;
load_generation_config_from_file(generation_params, params.model);
@@ -214,11 +268,11 @@
std::string full_prompt = add_chat_history(model_params, prompt, history, true);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);
if (embd_inp.size() > model_params.max_prompt_len){
// empty chat history
full_prompt = add_chat_history(model_params, prompt, "", true);
embd_inp = llm_tokenize(full_prompt, false);
embd_inp = gguf_tokenize(full_prompt, tok_params);
}

generation_params.max_new_token = model_params.kv_len - embd_inp.size();
@@ -239,7 +293,7 @@
std::string full_prompt = add_chat_template(model_params, params.prompt);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);

// single text generation
std::string output = run_generate(model, embd_inp.data(), embd_inp.size(),
…