xbot: use thread pool
ylsdamxssjxxdd committed Sep 2, 2024
1 parent 89da367 commit fdcb01a
Showing 3 changed files with 33 additions and 56 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -11,7 +11,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}/Release) # set

###################################### Build options ######################################
option(BODY_PACK "pack eva" OFF) # whether to pack
-option(GGML_CUDA "ggml: use CUDA" ON) # 900% speed
+option(GGML_CUDA "ggml: use CUDA" OFF) # 900% speed
option(GGML_VULKAN "ggml: use Vulkan" OFF) # 250% speed; sd not supported yet

##################################### Handle build options ####################################
41 changes: 30 additions & 11 deletions ui/xbot.cpp
@@ -4,15 +4,15 @@
#include "xbot.h"

xBot::xBot() {
-    log_disable();  // keep llama.cpp from writing log files
+    log_disable();  // keep llama.cpp from writing log files
    llama_log_set(xBot::bot_log_callback, this);  // set the callback that captures llama's log
    QObject::connect(this, &xBot::bot_llama_log, this, &xBot::recv_llama_log);
    showSpecial = true;  // whether to show special tokens <bos> <eos> <eot>

    // initial model parameters
    gpt_params_.n_gpu_layers = DEFAULT_NGL;  // number of layers offloaded to the GPU
    gpt_params_.model = "";  // model path
-    gpt_params_.cpuparams.n_threads = DEFAULT_NTHREAD;  // defaults to half the available threads
+    gpt_params_.cpuparams.n_threads = DEFAULT_NTHREAD;  // text-generation thread count; defaults to half the available threads
+    gpt_params_.cpuparams_batch.n_threads = DEFAULT_NTHREAD;  // prompt-processing thread count; kept equal to the generation thread count for simplicity
    gpt_params_.n_ctx = DEFAULT_NCTX;  // maximum context length
    gpt_params_.n_batch = DEFAULT_BATCH;  // maximum batch size per pass, mainly for decoding user input in batches; newly added, seems related to a memory leak during inference

@@ -622,6 +622,8 @@ void xBot::load(QString modelpath_) {
        model = nullptr;
        emit bot2ui_kv(0, n_past);  // newly added: there is no cache at this point
        Brain_vector.clear();
+        ggml_threadpool_free(threadpool);
+        ggml_threadpool_free(threadpool_batch);
        emit bot2expend_brainvector(Brain_vector, gpt_params_.n_ctx, 1);  // 1 forces a refresh of the memory matrix
        emit bot2ui_state("bot:" + jtr("free model and ctx"));
    } else {
@@ -653,6 +655,28 @@
    llama_init_result llama_init = llama_init_from_gpt_params(gpt_params_);
    model = llama_init.model;
    ctx = llama_init.context;
+
+    // create the thread pools
+    struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(gpt_params_.cpuparams_batch);
+    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(gpt_params_.cpuparams);
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch))
+    {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch)  // thread pool creation failed
+        {
+            is_first_load = true;
+            emit bot2ui_loadover(false, 0);
+            emit bot2ui_state(jtr("eva broken"), EVA_SIGNAL);
+            emit bot2ui_state("bot:batch threadpool create failed", WRONG_SIGNAL);
+            return;
+        }
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+    set_process_priority(gpt_params_.cpuparams.priority);
+    ggml_threadpool_params_match(&tpp, &tpp_batch);
+    threadpool = ggml_threadpool_new(&tpp);
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);

    // attach the vision projector
    if (mmprojpath != "") {
@@ -855,10 +879,7 @@ void xBot::preDecodeSystemPrompt() {

std::string str;
str = llama_token_to_piece(ctx, token);
-        if (!showSpecial && (token == eos_token || token == eot_token || token == bos_token)) {
-        } else {
-            token_str += str;
-        }
+        token_str += str;
Brain_vector.push_back({i + 1, token, QString::fromStdString(str)});
// qDebug()<<token<<QString::fromStdString(llama_token_to_piece(ctx, token));
}
@@ -954,10 +975,7 @@ void xBot::push_out(INPUTS input, std::vector<llama_token> embd_output, int cont
for (int i = 0; i < embd_output.size(); ++i) {
const llama_token token = embd_output[i];
std::string str = llama_token_to_piece(ctx, token);
-        if (!showSpecial && (token == eos_token || token == eot_token || token == bos_token)) {
-        } else {
-            token_str += str;
-        }
+        token_str += str;
}
    // if the result came from a tool, use sky blue; when both the prefix and suffix are \n it is treated as tool output
if (input.role == ROLE_TOOL) {
@@ -1013,6 +1031,7 @@ void xBot::recv_set(SETTINGS settings, bool can_reload) {
    // reload the model if the thread count changed
    if (gpt_params_.cpuparams.n_threads != settings.nthread) {
        gpt_params_.cpuparams.n_threads = settings.nthread;
+        gpt_params_.cpuparams_batch.n_threads = settings.nthread;
        reload_flag = true;
    }
    // reload the model if the ctx length changed
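
Note: the block added to load() mirrors the threadpool setup in llama.cpp's examples/main. Below is a minimal sketch of that lifecycle for reference, assuming a llama.cpp tree recent enough to ship the ggml threadpool API (ggml.h / llama.h plus the gpt_params helpers in common.h); attach_pools and detach_pools are illustrative names, not functions in this repo.

#include "common.h"
#include "ggml.h"
#include "llama.h"

static ggml_threadpool *g_tp = nullptr;        // text-generation pool
static ggml_threadpool *g_tp_batch = nullptr;  // prompt-processing pool

bool attach_pools(llama_context *ctx, const gpt_params &params) {
    ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);
    ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);

    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
        // a second pool is only needed when prompt processing uses different settings
        g_tp_batch = ggml_threadpool_new(&tpp_batch);
        if (!g_tp_batch) { return false; }
        // start the generation pool paused so both pools' threads never spin at once
        tpp.paused = true;
    }

    g_tp = ggml_threadpool_new(&tpp);
    if (!g_tp) { return false; }

    // a NULL batch pool makes llama.cpp reuse the generation pool for prompts
    llama_attach_threadpool(ctx, g_tp, g_tp_batch);
    return true;
}

void detach_pools() {
    // on model unload, as in load() above; ggml_threadpool_free accepts NULL
    ggml_threadpool_free(g_tp);       g_tp = nullptr;
    ggml_threadpool_free(g_tp_batch); g_tp_batch = nullptr;
}

Since recv_set now keeps cpuparams_batch.n_threads in lockstep with cpuparams.n_threads, the two parameter sets should normally match (assuming the remaining cpu params stay at their defaults), so threadpool_batch remains NULL and the single generation pool serves prompt processing as well.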
46 changes: 2 additions & 44 deletions ui/xbot.h
@@ -47,6 +47,8 @@ class xBot : public QObject {
    llama_model_params hparams;  // the model's internal parameters
    gpt_params gpt_params_;  // parameters controlling the model; contains the sampling parameters sparams
    llama_sampling_context *sparams;  // sampling parameters
+    struct ggml_threadpool * threadpool = NULL;  // thread pool for text generation
+    struct ggml_threadpool * threadpool_batch = NULL;  // thread pool for prompt processing

    llama_model *model;  // the model
    llama_context *ctx;  // the context
@@ -128,7 +130,6 @@
    bool is_debuging = false;  // currently in debug state
    int debuging_one = 0;  // limits the loop to a single pass while debugging
    std::vector<Brain_Cell> Brain_vector;  // memory vector (current memory)
-    bool showSpecial = true;  // whether to show special tokens

public slots:
    void recv_stop();  // receive the stop signal
@@ -166,46 +167,3 @@
};

#endif // XBOT_H


-//-------------------------------------------------------------------------
-//---------------------------- Multi-backend support -----------------------
-//-------------------------------------------------------------------------
-
-// // define the function-pointer types
-// typedef struct llama_model* (*llama_load_model_from_file_t)(const char *path_model, struct llama_model_params params);
-// typedef struct llama_context* (*llama_new_context_with_model_t)(struct llama_model * model, struct llama_context_params params);
-// typedef bool (*llava_eval_image_embed_t)(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
-// typedef struct clip_ctx * (*clip_model_load_t)(const char * fname, int verbosity);
-
-// define the function pointers
-// llama_load_model_from_file_t llama_load_model_from_file_;
-// llama_new_context_with_model_t llama_new_context_with_model_;
-// llava_eval_image_embed_t llava_eval_image_embed_;
-// clip_model_load_t clip_model_load_;
-
-// // dynamic loading
-// QString accelerate = "cuda";
-// QString libraryPath_ggml = applicationDirPath_ + "/llama-dll/ggml-" + accelerate;
-// QString libraryPath_llama = applicationDirPath_ + "/llama-dll/llama-" + accelerate;
-// QString libraryPath_llava_shared = applicationDirPath_ + "/llama-dll/llava_shared-" + accelerate;
-
-// QLibrary ggml_Lib(libraryPath_ggml);  // load the DLL
-// if (!ggml_Lib.load()) {qDebug()<<"Failed to load the DLL." << ggml_Lib.errorString();}
-
-// QLibrary llama_Lib(libraryPath_llama);  // load the DLL
-// if (!llama_Lib.load()) {qDebug()<<"Failed to load the DLL." << llama_Lib.errorString();}
-
-// QLibrary llava_shared_Lib(libraryPath_llava_shared);  // load the DLL
-// if (!llava_shared_Lib.load()) {qDebug()<<"Failed to load the DLL." << llava_shared_Lib.errorString();}
-
-// // resolve the function pointers
-// llama_load_model_from_file_ = (llama_load_model_from_file_t)llama_Lib.resolve("llama_load_model_from_file");
-// llama_new_context_with_model_ = (llama_new_context_with_model_t)llama_Lib.resolve("llama_new_context_with_model");
-// if (!llama_load_model_from_file_) {qDebug()<<"Failed to resolve the function llama_load_model_from_file_.";}
-// if (!llama_new_context_with_model_) {qDebug()<<"Failed to resolve the function llama_new_context_with_model_.";}
-
-// llava_eval_image_embed_ = (llava_eval_image_embed_t)llava_shared_Lib.resolve("llava_eval_image_embed");
-// if (!llava_eval_image_embed_) {qDebug()<<"Failed to resolve the function llava_eval_image_embed.";}
-// clip_model_load_ = (clip_model_load_t)llava_shared_Lib.resolve("clip_model_load");
-// if (!clip_model_load_) {qDebug()<<"Failed to resolve the function clip_model_load.";}
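
For context, the block deleted above was a commented-out sketch of loading an alternative llama.cpp backend DLL at runtime. A compilable version of that QLibrary pattern would look roughly like the following; the "llama-dll/llama-<accelerate>" layout comes from the deleted comments, resolve_loader is an illustrative name, and the llama_load_model_from_file entry point matches the llama.cpp API of this era.

#include <QCoreApplication>
#include <QDebug>
#include <QLibrary>

struct llama_model;
struct llama_model_params;

// function-pointer type for the entry point to resolve from the DLL
typedef llama_model *(*llama_load_model_from_file_t)(const char *path_model, llama_model_params params);

llama_load_model_from_file_t resolve_loader(const QString &accelerate) {
    const QString libraryPath_llama = QCoreApplication::applicationDirPath() + "/llama-dll/llama-" + accelerate;

    QLibrary llama_Lib(libraryPath_llama);  // load the backend-specific DLL
    if (!llama_Lib.load()) {
        qDebug() << "Failed to load the DLL." << llama_Lib.errorString();
        return nullptr;
    }

    // the library stays loaded after llama_Lib goes out of scope unless unload() is called
    auto fn = reinterpret_cast<llama_load_model_from_file_t>(llama_Lib.resolve("llama_load_model_from_file"));
    if (!fn) {
        qDebug() << "Failed to resolve the function llama_load_model_from_file.";
    }
    return fn;
}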
