feat: tune llama metal backend performance #393

Merged · 4 commits · Sep 5, 2023
crates/llama-cpp-bindings/include/engine.h (3 additions, 0 deletions)
@@ -11,6 +11,9 @@ class TextInferenceEngine {

virtual uint32_t start(const rust::Str prompt) const = 0;
virtual uint32_t step(uint32_t next_token_id) const = 0;
virtual void end() const = 0;

virtual uint32_t eos_token() const = 0;
};

std::shared_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
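For context, the sketch below (not part of the diff) shows how a caller would be expected to drive the extended interface: `start()` evaluates the prompt and samples the first token, `step()` feeds the previous token back and samples the next one, `eos_token()` supplies the stop id, and `end()` closes out the request. The function name `generate_ids` and the `max_tokens` cap are illustrative, not from the PR.

```cpp
// Illustrative driver loop for TextInferenceEngine (sketch, not part of the PR).
#include <cstdint>
#include <vector>

std::vector<uint32_t> generate_ids(const TextInferenceEngine& engine,
                                   rust::Str prompt,
                                   size_t max_tokens) {
  std::vector<uint32_t> output_ids;
  const uint32_t eos = engine.eos_token();

  // start() evaluates the prompt and returns the first sampled token.
  uint32_t token = engine.start(prompt);
  while (token != eos && output_ids.size() < max_tokens) {
    output_ids.push_back(token);
    // step() feeds the previous token back and samples the next one.
    token = engine.step(token);
  }

  // end() reports llama.cpp timings for this request.
  engine.end();
  return output_ids;
}
```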
crates/llama-cpp-bindings/src/engine.cc (12 additions, 2 deletions)
@@ -37,6 +37,7 @@ class TextInferenceEngineImpl : public TextInferenceEngine {

uint32_t start(const rust::Str prompt) const override {
auto* ctx = ctx_.get();
llama_reset_timings(ctx);
std::vector<llama_token> tokens_list = tokenize(ctx, std::string(prompt), /* add_bos = */ true);
eval(tokens_list, /* reset = */ true);
return sample();
@@ -47,6 +48,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
return sample();
}

void end() const override {
llama_print_timings(ctx_.get());
}

uint32_t eos_token() const override {
return llama_token_eos(ctx_.get());
}

private:
uint32_t sample() const {
auto* ctx = ctx_.get();
@@ -65,7 +74,7 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
tokens_list.data(),
tokens_list.size(),
reset ? 0 : llama_get_kv_cache_token_count(ctx),
- /* n_threads = */ 1)) {
+ /* n_threads = */ 4)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
@@ -92,7 +101,8 @@ std::shared_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
static BackendInitializer initializer;

llama_context_params ctx_params = llama_context_default_params();
- ctx_params.n_gpu_layers = 4;
+ ctx_params.n_ctx = 2048;
+ ctx_params.n_gpu_layers = 1;

llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), ctx_params);

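For readers less familiar with llama.cpp, here is an annotated restatement of the tuned settings. The values are copied from the diff above; the comments are my reading of the llama.cpp API at the time, not text from the PR.

```cpp
// Annotated sketch of the tuning applied in engine.cc (values from this PR).
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 2048;      // context window in tokens (prompt + generated output)
ctx_params.n_gpu_layers = 1;  // was 4; with the Metal build, a non-zero value enables GPU offload

// llama_eval(...) is now called with n_threads = 4 (was 1), so evaluation work
// that runs on the CPU can use four threads.
//
// Per-request timing added by this PR:
//   llama_reset_timings(ctx);  // in start(): clear counters for the new request
//   llama_print_timings(ctx);  // in end(): print the timing report to stderr
```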
crates/llama-cpp-bindings/src/lib.rs (14 additions, 1 deletion)
@@ -19,6 +19,9 @@ mod ffi {

fn start(&self, prompt: &str) -> u32;
fn step(&self, next_token_id: u32) -> u32;
fn end(&self);

fn eos_token(&self) -> u32;
}
}

@@ -62,7 +65,13 @@ impl TextGeneration for LlamaEngine {

let output_ids = tokio::task::spawn_blocking(move || {
let engine = engine.lock().unwrap();
let eos_token = engine.eos_token();

let mut next_token_id = engine.start(&prompt);
if next_token_id == eos_token {
return Vec::new();
}

let mut n_remains = options.max_decoding_length - 1;
let mut output_ids = vec![next_token_id];

@@ -73,18 +82,22 @@ impl TextGeneration for LlamaEngine {
}

next_token_id = engine.step(next_token_id);
if next_token_id == eos_token {
break;
}

if stop_condition.next_token(next_token_id) {
break;
}
output_ids.push(next_token_id);
n_remains -= 1;
}

engine.end();
output_ids
})
.await
.expect("Inference failed");

self.tokenizer.decode(&output_ids, true).unwrap()
}
}