llama: clear kv cache for each run_model call
This commit adds a call to `llama_kv_cache_clear` for each call to
`run_model`. This is done because the same sequence id is currently
used for every call to `run_model`, which can leave tokens from a
previous call in the cache. The model may then attend to those stale
tokens during decoding, causing it to generate incorrect information.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
danbev committed Dec 15, 2023
1 parent cbc1fdc commit 5333d50
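To make the failure mode concrete, here is a small toy illustration (a sketch, not the crate's code or llama.cpp's real cache): a map keyed only by position and shared across calls, which is effectively what reusing a single sequence id does. Without clearing, a second, shorter prompt still "sees" leftover tokens from the first; clearing between calls, the analogue of `llama_kv_cache_clear`, removes them.

```rust
use std::collections::BTreeMap;

// Toy stand-in for the KV cache: entries are keyed by position only,
// just as they are when every decode reuses sequence id 0.
fn decode(cache: &mut BTreeMap<usize, &'static str>, prompt: &[&'static str]) -> Vec<&'static str> {
    for (pos, tok) in prompt.iter().copied().enumerate() {
        cache.insert(pos, tok); // overwrites the same positions as the previous call
    }
    cache.values().copied().collect() // what attention would "see"
}

fn main() {
    let mut cache = BTreeMap::new();
    decode(&mut cache, &["The", "capital", "of", "France", "is", "Paris"]);

    // Second prompt without clearing: stale tokens from the first call survive.
    let stale = decode(&mut cache, &["Name", "a", "color"]);
    println!("{stale:?}"); // ["Name", "a", "color", "France", "is", "Paris"]

    // Clearing first (the analogue of llama_kv_cache_clear) removes them.
    cache.clear();
    let clean = decode(&mut cache, &["Name", "a", "color"]);
    println!("{clean:?}"); // ["Name", "a", "color"]
}
```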
Showing 2 changed files with 16 additions and 1 deletion.
6 changes: 5 additions & 1 deletion crates/llm-chain-llama/src/context.rs
@@ -6,7 +6,7 @@ use crate::options::LlamaInvocation;
use anyhow::Result;
use llm_chain_llama_sys::{
llama_context, llama_context_default_params, llama_context_params, llama_decode, llama_eval,
llama_free, llama_get_embeddings, llama_get_logits, llama_get_logits_ith,
llama_free, llama_get_embeddings, llama_get_logits, llama_get_logits_ith, llama_kv_cache_clear,
llama_load_model_from_file, llama_model, llama_n_embd, llama_n_vocab,
llama_new_context_with_model, llama_sample_repetition_penalties, llama_sample_tail_free,
llama_sample_temperature, llama_sample_token, llama_sample_token_greedy,
@@ -310,6 +310,10 @@ impl LLamaContext {
unsafe { llama_token_nl(self.model) }
}

pub fn llama_kv_cache_clear(&self) {
unsafe { llama_kv_cache_clear(self.ctx) }
}

pub fn llama_token_to_piece(
&self,
token_id: i32,
11 changes: 11 additions & 0 deletions crates/llm-chain-llama/src/executor.rs
@@ -62,6 +62,17 @@ impl Executor {
let context_size = context_size;
let context = context.blocking_lock();

// The following clears the Key-Value cache to allow conversational
// (chat) applications to be able to call run_model multiple times
// using the same context. Without this, and because the same
// sequence id is used below, the cache can contain tokens from
// a previous interaction which may cause the model to generate
// a response that is not appropriate for the current prompt.
//
// TODO(danbev) Is there a better way to do this, perhaps by using
// sequence ids in some way?
context.llama_kv_cache_clear();

let tokenized_stop_prompt = tokenize(
&context,
input
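Regarding the TODO about sequence ids: one alternative (an assumption, not part of this commit) would be to give each conversation its own sequence id and evict only that sequence before it is reused, via llama.cpp's `llama_kv_cache_seq_rm`. The sketch below mirrors the wrapper style in `context.rs`; it assumes the sys crate exposes such a binding, and the method name is hypothetical.

```rust
// Hypothetical addition to context.rs; not part of this commit. Assumes
// llm_chain_llama_sys exposes llama.cpp's llama_kv_cache_seq_rm, whose
// (p0, p1) arguments select the position range to drop (-1 meaning
// "to the end").
use llm_chain_llama_sys::llama_kv_cache_seq_rm;

impl LLamaContext {
    /// Remove only the cached entries for `seq_id`, leaving any other
    /// conversations' sequences in place.
    pub fn llama_kv_cache_seq_clear(&self, seq_id: i32) {
        unsafe { llama_kv_cache_seq_rm(self.ctx, seq_id, 0, -1) }
    }
}
```

With something like this, `run_model` could clear just the sequence it is about to reuse (or keep it, for true multi-turn context) instead of wiping the whole cache.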

