diff --git a/Cargo.lock b/Cargo.lock
index dfe6f9eb..786b8c34 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1829,6 +1829,7 @@ version = "0.13.0"
 dependencies = [
  "anyhow",
  "async-trait",
+ "futures",
  "lazy_static",
  "llm-chain",
  "llm-chain-llama-sys",
@@ -1919,6 +1920,7 @@ dependencies = [
  "anyhow",
  "async-trait",
  "llm-chain",
+ "llm-chain-llama",
  "llm-chain-openai",
  "qdrant-client",
  "serde",
diff --git a/crates/llm-chain-llama/Cargo.toml b/crates/llm-chain-llama/Cargo.toml
index edf23d22..7f95da73 100644
--- a/crates/llm-chain-llama/Cargo.toml
+++ b/crates/llm-chain-llama/Cargo.toml
@@ -22,6 +22,7 @@ serde = { version = "1.0.163", features = ["derive"] }
 thiserror.workspace = true
 lazy_static = "1.4.0"
 tokio.workspace = true
+futures = "0.3.29"
 
 [dev-dependencies]
 tokio = { version = "1.28.2", features = ["macros", "rt"] }
diff --git a/crates/llm-chain-llama/examples/simple_embeddings.rs b/crates/llm-chain-llama/examples/simple_embeddings.rs
new file mode 100644
index 00000000..aef05cc5
--- /dev/null
+++ b/crates/llm-chain-llama/examples/simple_embeddings.rs
@@ -0,0 +1,26 @@
+use llm_chain::options;
+use llm_chain::traits::Embeddings;
+
+/// This example demonstrates using llm-chain-llama for generating
+/// embeddings.
+///
+/// Usage:
+/// env LLM_CHAIN_MODEL=<path_to_model> cargo run --example simple_embeddings
+///
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let opts = options!(
+        NThreads: 4_usize,
+        MaxTokens: 2048_usize
+    );
+    let embeddings = llm_chain_llama::embeddings::Embeddings::new_with_options(opts)?;
+    let embedded_vecs = embeddings
+        .embed_texts(vec![
+            "This is an amazing way of writing LLM-powered applications".to_string(),
+        ])
+        .await
+        .unwrap();
+    println!("Embedded text: {:?}", embedded_vecs[0]);
+
+    Ok(())
+}
diff --git a/crates/llm-chain-llama/src/batch.rs b/crates/llm-chain-llama/src/batch.rs
index 17af1c73..3b87d9cd 100644
--- a/crates/llm-chain-llama/src/batch.rs
+++ b/crates/llm-chain-llama/src/batch.rs
@@ -5,7 +5,7 @@ use std::ptr::null_mut;
 #[allow(dead_code)]
 pub struct LlamaBatch {
     n_tokens: i32,
-    token: Vec<i32>,
+    tokens: Vec<i32>,
     embd: Vec<f32>,
     pos: Vec<i32>,
     n_seq_id: Vec<i32>,
@@ -29,7 +29,7 @@ impl LlamaBatch {
 
         Self {
             n_tokens: tokens.len() as i32,
-            token: tokens,
+            tokens,
             embd,
             pos,
             n_seq_id,
@@ -44,7 +44,7 @@
     pub fn new_with_token(token: i32, pos: i32) -> Self {
         Self {
             n_tokens: 1,
-            token: vec![token],
+            tokens: vec![token],
             embd: vec![],
             pos: vec![pos],
             n_seq_id: vec![1],
@@ -75,22 +75,39 @@ impl Drop for LlamaBatch {
 
 fn convert_llama_batch(batch: &LlamaBatch) -> llama_batch {
     let n_tokens = batch.n_tokens;
-    let token_ptr = Box::leak(batch.token.clone().into_boxed_slice()).as_mut_ptr();
+    let token_ptr = Box::leak(batch.tokens.clone().into_boxed_slice()).as_mut_ptr();
     let embd_ptr = if batch.embd.is_empty() {
         null_mut()
     } else {
         Box::leak(batch.embd.clone().into_boxed_slice()).as_mut_ptr()
     };
-    let pos_ptr = Box::leak(batch.pos.clone().into_boxed_slice()).as_mut_ptr();
-    let n_seq_id_ptr = Box::leak(batch.n_seq_id.clone().into_boxed_slice()).as_mut_ptr();
-    let raw_pointers = batch
-        .seq_id
-        .clone()
-        .into_iter()
-        .map(|inner_vec| Box::leak(inner_vec.into_boxed_slice()).as_mut_ptr())
-        .collect::<Vec<_>>();
-    let seq_id_ptr = Box::leak(raw_pointers.into_boxed_slice()).as_mut_ptr();
-    let logits_ptr = Box::leak(batch.logits.clone().into_boxed_slice()).as_mut_ptr();
+    let pos_ptr = if batch.pos.is_empty() {
+        null_mut()
+    } else {
+        Box::leak(batch.pos.clone().into_boxed_slice()).as_mut_ptr()
+    };
+    let n_seq_id_ptr = if batch.n_seq_id.is_empty() {
+        null_mut()
+    } else {
+        Box::leak(batch.n_seq_id.clone().into_boxed_slice()).as_mut_ptr()
+    };
+
+    let seq_id_ptr = if batch.seq_id.is_empty() {
+        null_mut()
+    } else {
+        let raw_pointers = batch
+            .seq_id
+            .clone()
+            .into_iter()
+            .map(|inner_vec| Box::leak(inner_vec.into_boxed_slice()).as_mut_ptr())
+            .collect::<Vec<_>>();
+        Box::leak(raw_pointers.into_boxed_slice()).as_mut_ptr()
+    };
+    let logits_ptr = if batch.logits.is_empty() {
+        null_mut()
+    } else {
+        Box::leak(batch.logits.clone().into_boxed_slice()).as_mut_ptr()
+    };
     llama_batch {
         n_tokens,
         token: token_ptr,
diff --git a/crates/llm-chain-llama/src/context.rs b/crates/llm-chain-llama/src/context.rs
index a84894ce..904b8b37 100644
--- a/crates/llm-chain-llama/src/context.rs
+++ b/crates/llm-chain-llama/src/context.rs
@@ -6,13 +6,13 @@ use crate::options::LlamaInvocation;
 use anyhow::Result;
 use llm_chain_llama_sys::{
     llama_context, llama_context_default_params, llama_context_params, llama_decode, llama_eval,
-    llama_free, llama_get_logits, llama_get_logits_ith, llama_load_model_from_file, llama_model,
-    llama_n_vocab, llama_new_context_with_model, llama_sample_repetition_penalties,
-    llama_sample_tail_free, llama_sample_temperature, llama_sample_token,
-    llama_sample_token_greedy, llama_sample_token_mirostat, llama_sample_token_mirostat_v2,
-    llama_sample_top_k, llama_sample_top_p, llama_sample_typical, llama_token_data,
-    llama_token_data_array, llama_token_eos, llama_token_get_text, llama_token_nl,
-    llama_token_to_piece,
+    llama_free, llama_get_embeddings, llama_get_logits, llama_get_logits_ith, llama_kv_cache_clear,
+    llama_load_model_from_file, llama_model, llama_n_embd, llama_n_vocab,
+    llama_new_context_with_model, llama_sample_repetition_penalties, llama_sample_tail_free,
+    llama_sample_temperature, llama_sample_token, llama_sample_token_greedy,
+    llama_sample_token_mirostat, llama_sample_token_mirostat_v2, llama_sample_top_k,
+    llama_sample_top_p, llama_sample_typical, llama_token_data, llama_token_data_array,
+    llama_token_eos, llama_token_get_text, llama_token_nl, llama_token_to_piece,
 };
 
 pub use batch::LlamaBatch;
@@ -161,6 +161,15 @@ impl LLamaContext {
         Vec::from(unsafe { std::slice::from_raw_parts(float_ptr, self.llama_n_vocab() as usize) })
     }
 
+    pub fn llama_get_embeddings(&self) -> Vec<f32> {
+        unsafe {
+            let len = llama_n_embd(self.model);
+            let ptr = llama_get_embeddings(self.ctx);
+            let slice = std::slice::from_raw_parts_mut(ptr, len as usize);
+            slice.to_vec()
+        }
+    }
+
     // Executes the LLama sampling process with the specified configuration.
     pub fn llama_sample(
         &self,
@@ -301,6 +310,10 @@ impl LLamaContext {
         unsafe { llama_token_nl(self.model) }
     }
 
+    pub fn llama_kv_cache_clear(&self) {
+        unsafe { llama_kv_cache_clear(self.ctx) }
+    }
+
     pub fn llama_token_to_piece(
         &self,
         token_id: i32,
diff --git a/crates/llm-chain-llama/src/embeddings.rs b/crates/llm-chain-llama/src/embeddings.rs
new file mode 100644
index 00000000..d8283dfd
--- /dev/null
+++ b/crates/llm-chain-llama/src/embeddings.rs
@@ -0,0 +1,159 @@
+use crate::batch::LlamaBatch;
+use crate::context::ContextParams;
+use crate::context::LLamaContext;
+use crate::model::ModelParams;
+use crate::options::{LlamaInvocation, DEFAULT_OPTIONS};
+use crate::tokenizer;
+use async_trait::async_trait;
+use futures::future::try_join_all;
+use llm_chain::options::{options_from_env, Opt, OptDiscriminants, Options, OptionsCascade};
+use llm_chain::prompt::Data;
+use llm_chain::traits::EmbeddingsCreationError;
+use llm_chain::traits::{self, EmbeddingsError};
+use std::sync::Arc;
+use std::{error::Error, fmt::Debug};
+use tokio::sync::Mutex;
+
+/// Generate embeddings using the llama.cpp backend.
+///
+/// This is intended to be similar to running the embedding example in llama.cpp:
+/// ./embedding -m <path_to_model> --log-disable -p "Hello world" 2>/dev/null
+///
+pub struct Embeddings {
+    context: Arc<Mutex<LLamaContext>>,
+    options: Options,
+}
+
+#[async_trait]
+impl traits::Embeddings for Embeddings {
+    type Error = LlamaEmbeddingsError;
+
+    async fn embed_texts(&self, texts: Vec<String>) -> Result<Vec<Vec<f32>>, Self::Error> {
+        let futures = texts.into_iter().map(|text| self.embed_query(text));
+        let embeddings = try_join_all(futures).await?;
+        Ok(embeddings)
+    }
+
+    async fn embed_query(&self, query: String) -> Result<Vec<f32>, Self::Error> {
+        let options = vec![&DEFAULT_OPTIONS, &self.options];
+        let invocation =
+            LlamaInvocation::new(OptionsCascade::from_vec(options), &Data::Text(query)).unwrap();
+        let embeddings = self.generate_embeddings(invocation).await?;
+        Ok(embeddings)
+    }
+}
+
+#[allow(dead_code)]
+impl Embeddings {
+    pub fn new_with_options(opt: Options) -> Result<Self, EmbeddingsCreationError> {
+        //TODO(danbev) This is pretty much a duplication of the code in
+        // llm_chain::executor::Executor::new_with_options. Find a good place
+        // to share this code.
+        let opts_from_env =
+            options_from_env().map_err(|err| EmbeddingsCreationError::InnerError(err.into()))?;
+        let cas = OptionsCascade::new()
+            .with_options(&DEFAULT_OPTIONS)
+            .with_options(&opts_from_env)
+            .with_options(&opt);
+
+        let Some(Opt::Model(model)) = cas.get(OptDiscriminants::Model) else {
+            return Err(EmbeddingsCreationError::FieldRequiredError(
+                "model_path".to_string(),
+            ));
+        };
+
+        let mut mp = ModelParams::new();
+        if let Some(Opt::NGpuLayers(value)) = cas.get(OptDiscriminants::NGpuLayers) {
+            mp.n_gpu_layers = *value;
+        }
+        if let Some(Opt::MainGpu(value)) = cas.get(OptDiscriminants::MainGpu) {
+            mp.main_gpu = *value;
+        }
+        if let Some(Opt::TensorSplit(values)) = cas.get(OptDiscriminants::TensorSplit) {
+            mp.tensor_split = values.clone();
+        }
+        // Currently, setting vocab_only is not allowed as it will cause a
+        // crash when using the llama executor, which needs to have weights
+        // loaded in order to work.
+        mp.vocab_only = false;
+
+        if let Some(Opt::UseMmap(value)) = cas.get(OptDiscriminants::UseMmap) {
+            mp.use_mmap = *value;
+        }
+        if let Some(Opt::UseMlock(value)) = cas.get(OptDiscriminants::UseMlock) {
+            mp.use_mlock = *value;
+        }
+
+        let mut cp = ContextParams::new();
+        if let Some(Opt::NThreads(value)) = cas.get(OptDiscriminants::NThreads) {
+            cp.n_threads = *value as u32;
+        }
+
+        if let Some(Opt::MaxContextSize(value)) = cas.get(OptDiscriminants::MaxContextSize) {
+            cp.n_ctx = *value as u32;
+        }
+
+        if let Some(Opt::MaxBatchSize(value)) = cas.get(OptDiscriminants::MaxBatchSize) {
+            cp.n_batch = *value as u32;
+        }
+        cp.embedding = true;
+
+        Ok(Self {
+            context: Arc::new(Mutex::new(LLamaContext::from_file_and_params(
+                &model.to_path(),
+                Some(&mp),
+                Some(&cp),
+            )?)),
+            options: opt,
+        })
+    }
+
+    fn get_model_path(options: &Options) -> Result<String, EmbeddingsCreationError> {
+        let opts_from_env =
+            options_from_env().map_err(|err| EmbeddingsCreationError::InnerError(err.into()))?;
+        let cas = OptionsCascade::new()
+            .with_options(&DEFAULT_OPTIONS)
+            .with_options(&opts_from_env)
+            .with_options(&options);
+        let model_path = cas
+            .get(OptDiscriminants::Model)
+            .and_then(|x| match x {
+                Opt::Model(m) => Some(m),
+                _ => None,
+            })
+            .ok_or(EmbeddingsCreationError::FieldRequiredError(
+                "model_path".to_string(),
+            ))?;
+        Ok(model_path.to_path())
+    }
+
+    async fn generate_embeddings(
+        &self,
+        input: LlamaInvocation,
+    ) -> Result<Vec<f32>, LlamaEmbeddingsError> {
+        let context = self.context.clone();
+        let embeddings = tokio::task::spawn_blocking(move || {
+            let context = context.blocking_lock();
+            let prompt_text = input.prompt.to_text();
+            let tokens = tokenizer::tokenize(&context, prompt_text.as_str(), true, false);
+            //TODO(danbev) Handle the case where the number of tokens is
+            // larger than the n_batch size.
+            let batch = LlamaBatch::new_with_tokens(tokens.clone(), 1);
+            let _ = context
+                .llama_decode(&batch)
+                .map_err(|e| LlamaEmbeddingsError::InnerError(e.into()));
+            context.llama_get_embeddings()
+        });
+        embeddings
+            .await
+            .map_err(|e| LlamaEmbeddingsError::InnerError(e.into()))
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum LlamaEmbeddingsError {
+    #[error("error when trying to generate embeddings: {0}")]
+    InnerError(#[from] Box<dyn Error + Send + Sync>),
+}
+
+impl EmbeddingsError for LlamaEmbeddingsError {}
diff --git a/crates/llm-chain-llama/src/executor.rs b/crates/llm-chain-llama/src/executor.rs
index 174580e6..a3a3b4cd 100644
--- a/crates/llm-chain-llama/src/executor.rs
+++ b/crates/llm-chain-llama/src/executor.rs
@@ -38,8 +38,8 @@ macro_rules! must_send {
 
 /// Executor is responsible for running the LLAMA model and managing its context.
 pub struct Executor {
-    context: Arc<Mutex<LLamaContext>>,
-    options: Options,
+    pub(crate) context: Arc<Mutex<LLamaContext>>,
+    pub(crate) options: Options,
     context_params: ContextParams,
 }
 
@@ -62,6 +62,17 @@
             let context_size = context_size;
             let context = context.blocking_lock();
 
+            // Clear the key-value cache so that conversational (chat)
+            // applications can call run_model multiple times using the same
+            // context. Without this, and because the same sequence id is used
+            // below, the cache can contain tokens from a previous interaction,
+            // which may cause the model to generate a response that is not
+            // appropriate for the current prompt.
+            //
+            // TODO(danbev) Is there a better way to do this, perhaps by using
+            // sequence ids in some way?
+            context.llama_kv_cache_clear();
+
             let tokenized_stop_prompt = tokenize(
                 &context,
                 input
diff --git a/crates/llm-chain-llama/src/lib.rs b/crates/llm-chain-llama/src/lib.rs
index 16222a9d..0a1bafab 100644
--- a/crates/llm-chain-llama/src/lib.rs
+++ b/crates/llm-chain-llama/src/lib.rs
@@ -23,6 +23,7 @@
 mod batch;
 mod context;
+pub mod embeddings;
 mod executor;
 mod model;
 mod options;
diff --git a/crates/llm-chain-qdrant/Cargo.toml b/crates/llm-chain-qdrant/Cargo.toml
index 63da1076..a480ad7e 100644
--- a/crates/llm-chain-qdrant/Cargo.toml
+++ b/crates/llm-chain-qdrant/Cargo.toml
@@ -24,5 +24,6 @@ uuid = "1.3.3"
 
 [dev-dependencies]
 llm-chain-openai = { path = "../llm-chain-openai" }
+llm-chain-llama = { path = "../llm-chain-llama" }
 tokio.workspace = true
 serde_yaml.workspace = true
diff --git a/crates/llm-chain-qdrant/examples/similarity_search_llama.rs b/crates/llm-chain-qdrant/examples/similarity_search_llama.rs
new file mode 100644
index 00000000..7d0dd3c3
--- /dev/null
+++ b/crates/llm-chain-qdrant/examples/similarity_search_llama.rs
@@ -0,0 +1,102 @@
+use llm_chain::options;
+use std::sync::Arc;
+
+use llm_chain::{
+    schema::{Document, EmptyMetadata},
+    traits::VectorStore,
+};
+use llm_chain_llama::embeddings::Embeddings;
+use llm_chain_qdrant::Qdrant;
+use qdrant_client::{
+    prelude::{QdrantClient, QdrantClientConfig},
+    qdrant::{CreateCollection, Distance, VectorParams, VectorsConfig},
+};
+
+/// This example demonstrates using llm-chain-llama for generating
+/// embeddings in combination with the Qdrant vector store.
+///
+/// Usage:
+/// $ env LLM_CHAIN_MODEL=<path_to_model> cargo run --example similarity_search_llama
+///
+#[tokio::main(flavor = "current_thread")]
+async fn main() {
+    // Qdrant prep
+    let config = QdrantClientConfig::from_url("http://localhost:6334");
+    let client = Arc::new(QdrantClient::new(Some(config)).unwrap());
+    let collection_name = "my-collection".to_string();
+    let embedding_size = 4096;
+
+    if !client
+        .has_collection(collection_name.clone())
+        .await
+        .unwrap()
+    {
+        client
+            .create_collection(&CreateCollection {
+                collection_name: collection_name.clone(),
+                vectors_config: Some(VectorsConfig {
+                    config: Some(qdrant_client::qdrant::vectors_config::Config::Params(
+                        VectorParams {
+                            size: embedding_size,
+                            distance: Distance::Cosine.into(),
+                            hnsw_config: None,
+                            quantization_config: None,
+                            on_disk: None,
+                        },
+                    )),
+                }),
+                ..Default::default()
+            })
+            .await
+            .unwrap();
+    }
+    let opts = options!(
+        NThreads: 4_usize,
+        MaxTokens: 3000_usize
+    );
+    let embeddings = Embeddings::new_with_options(opts).unwrap();
+
+    // Storing documents
+    let qdrant: Qdrant<Embeddings, EmptyMetadata> = Qdrant::new(
+        client.clone(),
+        collection_name.clone(),
+        embeddings,
+        None,
+        None,
+        None,
+    );
+
+    let doc_dog_definition = r#"The dog (Canis familiaris[4][5] or Canis lupus familiaris[5]) is a domesticated descendant of the wolf. Also called the domestic dog, it is derived from the extinct Pleistocene wolf,[6][7] and the modern wolf is the dog's nearest living relative.[8] Dogs were the first species to be domesticated[9][8] by hunter-gatherers over 15,000 years ago[7] before the development of agriculture.[1] Due to their long association with humans, dogs have expanded to a large number of domestic individuals[10] and gained the ability to thrive on a starch-rich diet that would be inadequate for other canids.[11]
+
+    The dog has been selectively bred over millennia for various behaviors, sensory capabilities, and physical attributes.[12] Dog breeds vary widely in shape, size, and color. They perform many roles for humans, such as hunting, herding, pulling loads, protection, assisting police and the military, companionship, therapy, and aiding disabled people. Over the millennia, dogs became uniquely adapted to human behavior, and the human–canine bond has been a topic of frequent study.[13] This influence on human society has given them the sobriquet of "man's best friend"."#.to_string();
+
+    let doc_woodstock_sound = r#"Sound for the concert was engineered by sound engineer Bill Hanley. "It worked very well", he says of the event. "I built special speaker columns on the hills and had 16 loudspeaker arrays in a square platform going up to the hill on 70-foot [21 m] towers. We set it up for 150,000 to 200,000 people. Of course, 500,000 showed up."[48] ALTEC designed marine plywood cabinets that weighed half a ton apiece and stood 6 feet (1.8 m) tall, almost 4 feet (1.2 m) deep, and 3 feet (0.91 m) wide. Each of these enclosures carried four 15-inch (380 mm) JBL D140 loudspeakers. The tweeters consisted of 4×2-Cell & 2×10-Cell Altec Horns. Behind the stage were three transformers providing 2,000 amperes of current to power the amplification setup.[49][page needed] For many years this system was collectively referred to as the Woodstock Bins.[50] The live performances were captured on two 8-track Scully recorders in a tractor trailer back stage by Edwin Kramer and Lee Osbourne on 1-inch Scotch recording tape at 15 ips, then mixed at the Record Plant studio in New York.[51]"#.to_string();
+
+    let doc_reddit_creep_shots = r#"A year after the closure of r/jailbait, another subreddit called r/CreepShots drew controversy in the press for hosting sexualized images of women without their knowledge.[34] In the wake of this media attention, u/violentacrez was added to r/CreepShots as a moderator;[35] reports emerged that Gawker reporter Adrian Chen was planning an exposé that would reveal the real-life identity of this user, who moderated dozens of controversial subreddits, as well as a few hundred general-interest communities. Several major subreddits banned links to Gawker in response to the impending exposé, and the account u/violentacrez was deleted.[36][37][38] Moderators defended their decisions to block the site from these sections of Reddit on the basis that the impending report was "doxing" (a term for exposing the identity of a pseudonymous person), and that such exposure threatened the site's structural integrity.[38]"#.to_string();
+
+    let doc_ids = qdrant
+        .add_documents(
+            vec![
+                doc_dog_definition,
+                doc_woodstock_sound,
+                doc_reddit_creep_shots,
+            ]
+            .into_iter()
+            .map(Document::new)
+            .collect(),
+        )
+        .await
+        .unwrap();
+
+    println!("Documents stored under IDs: {:?}", doc_ids);
+
+    let response = qdrant
+        .similarity_search(
+            "Sound engineering is involved with concerts and music events".to_string(),
+            1,
+        )
+        .await
+        .unwrap();
+
+    println!("Retrieved stored documents: {:?}", response);
+}
diff --git a/crates/llm-chain/src/traits.rs b/crates/llm-chain/src/traits.rs
index 65bc4284..3a302401 100644
--- a/crates/llm-chain/src/traits.rs
+++ b/crates/llm-chain/src/traits.rs
@@ -128,6 +128,15 @@ pub trait Embeddings {
     async fn embed_query(&self, query: String) -> Result<Vec<f32>, Self::Error>;
 }
 
+#[derive(thiserror::Error, Debug)]
+#[error("unable to create embeddings")]
+pub enum EmbeddingsCreationError {
+    #[error("unable to create embeddings: {0}")]
+    InnerError(#[from] Box<dyn Error + Send + Sync>),
+    #[error("Field must be set: {0}")]
+    FieldRequiredError(String),
+}
+
 /// This marker trait is needed so users of VectorStore can derive From
 pub trait VectorStoreError {}
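
For reference, the pieces added above fit together as in the following minimal downstream sketch (illustrative only, not part of the patch itself). It assumes a model path is supplied through the LLM_CHAIN_MODEL environment variable, as in the two examples in this diff, and it uses only API the diff introduces: Embeddings::new_with_options, embed_query, and the new EmbeddingsCreationError / LlamaEmbeddingsError types, both of which convert into Box<dyn Error> via `?`.

use llm_chain::options;
use llm_chain::traits::Embeddings as _; // brings embed_query/embed_texts into scope
use llm_chain_llama::embeddings::Embeddings;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Same option style as the examples above; the model path itself is
    // picked up from the LLM_CHAIN_MODEL environment variable via
    // options_from_env inside new_with_options.
    let opts = options!(
        NThreads: 4_usize,
        MaxTokens: 2048_usize
    );

    // Fails with EmbeddingsCreationError::FieldRequiredError("model_path")
    // when no model has been configured.
    let embeddings = Embeddings::new_with_options(opts)?;

    // embed_query embeds a single string; embed_texts maps over a batch.
    let vector = embeddings.embed_query("Hello world".to_string()).await?;
    println!("embedding dimensions: {}", vector.len());
    Ok(())
}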