diff --git a/Cargo.lock b/Cargo.lock
index c273c883..c2c12869 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -646,6 +646,7 @@ checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
 name = "llama-cpp-2"
 version = "0.1.69"
 dependencies = [
+ "encoding_rs",
  "enumflags2",
  "llama-cpp-sys-2",
  "thiserror",
@@ -1230,14 +1231,6 @@ dependencies = [
  "percent-encoding",
 ]
 
-[[package]]
-name = "usage"
-version = "0.1.69"
-dependencies = [
- "encoding_rs",
- "llama-cpp-2",
-]
-
 [[package]]
 name = "utf16_iter"
 version = "1.0.5"
diff --git a/Cargo.toml b/Cargo.toml
index 40d5cdf5..8661274f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [workspace]
 resolver = "2"
-members = ["llama-cpp-sys-2", "llama-cpp-2", "embeddings", "examples/usage", "examples/simple"]
+members = ["llama-cpp-sys-2", "llama-cpp-2", "embeddings", "examples/simple"]
 
 [workspace.dependencies]
 # core library deps
diff --git a/examples/usage/src/main.rs b/examples/usage.rs
similarity index 90%
rename from examples/usage/src/main.rs
rename to examples/usage.rs
index 437ff928..1b7d1f5d 100644
--- a/examples/usage/src/main.rs
+++ b/examples/usage.rs
@@ -1,14 +1,13 @@
 //! # Usage
-//! 
+//!
 //! This is just about the smallest possible way to do inference. To fetch a model from hugging face:
-//! 
-//! ```bash
+//!
+//! ```console
 //! git clone --recursive https://github.com/utilityai/llama-cpp-rs
 //! cd llama-cpp-rs/examples/usage
 //! wget https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf
-//! cargo run --bin usage -- qwen2-1_5b-instruct-q4_0.gguf
+//! cargo run --example usage -- qwen2-1_5b-instruct-q4_0.gguf
 //! ```
-use std::io::Write;
 use llama_cpp_2::context::params::LlamaContextParams;
 use llama_cpp_2::llama_backend::LlamaBackend;
 use llama_cpp_2::llama_batch::LlamaBatch;
@@ -16,6 +15,7 @@ use llama_cpp_2::model::params::LlamaModelParams;
 use llama_cpp_2::model::LlamaModel;
 use llama_cpp_2::model::{AddBos, Special};
 use llama_cpp_2::token::data_array::LlamaTokenDataArray;
+use std::io::Write;
 
 #[allow(clippy::cast_possible_wrap, clippy::cast_possible_truncation)]
 fn main() {
@@ -23,7 +23,8 @@ fn main() {
     let backend = LlamaBackend::init().unwrap();
     let params = LlamaModelParams::default();
 
-    let prompt = "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
+    let prompt =
+        "<|im_start|>user\nHello! how are you?<|im_end|>\n<|im_start|>assistant\n".to_string();
     LlamaContextParams::default();
     let model =
         LlamaModel::load_from_file(&backend, model_path, &params).expect("unable to load model");
@@ -48,14 +49,11 @@ fn main() {
     }
 
     ctx.decode(&mut batch).expect("llama_decode() failed");
-
     let mut n_cur = batch.n_tokens();
 
-
     // The `Decoder`
     let mut decoder = encoding_rs::UTF_8.new_decoder();
 
-
     while n_cur <= n_len {
         // sample the next token
         {
@@ -72,7 +70,9 @@
                 break;
             }
 
-            let output_bytes = model.token_to_bytes(new_token_id, Special::Tokenize).unwrap();
+            let output_bytes = model
+                .token_to_bytes(new_token_id, Special::Tokenize)
+                .unwrap();
             // use `Decoder.decode_to_string()` to avoid the intermediate buffer
             let mut output_string = String::with_capacity(32);
             let _decode_result = decoder.decode_to_string(&output_bytes, &mut output_string, false);
diff --git a/examples/usage/Cargo.toml b/examples/usage/Cargo.toml
deleted file mode 100644
index ac5de8d1..00000000
--- a/examples/usage/Cargo.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-[package]
-name = "usage"
-version = "0.1.69"
-edition = "2021"
-
-# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
-
-[dependencies]
-llama-cpp-2 = { path = "../../llama-cpp-2", version = "0.1.69" }
-encoding_rs = { workspace = true }
-
-[features]
-cuda = ["llama-cpp-2/cuda"]
-metal = ["llama-cpp-2/metal"]
-native = ["llama-cpp-2/native"]
-vulkan = ["llama-cpp-2/vulkan"]
-
-[lints]
-workspace = true
diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index 907b6191..4c184639 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -14,6 +14,9 @@ llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", version = "0.1.69" }
 thiserror = { workspace = true }
 tracing = { workspace = true }
 
+[dev-dependencies]
+encoding_rs = { workspace = true }
+
 [features]
 cuda = ["llama-cpp-sys-2/cuda"]
 metal = ["llama-cpp-sys-2/metal"]
@@ -32,3 +35,7 @@ workspace = true
 
 [package.metadata.docs.rs]
 features = ["sampler"]
+
+[[example]]
+name = "usage"
+path = "../examples/usage.rs"
diff --git a/llama-cpp-2/src/context/sample/sampler.rs b/llama-cpp-2/src/context/sample/sampler.rs
index cfe90499..948a1aa5 100644
--- a/llama-cpp-2/src/context/sample/sampler.rs
+++ b/llama-cpp-2/src/context/sample/sampler.rs
@@ -3,7 +3,7 @@
 //! like [`crate::context::LlamaContext`] or token history to the sampler.
 //!
 //! # Example
-//! 
+//!
 //! **Llama.cpp default sampler**
 //!
 //! ```rust
diff --git a/llama-cpp-2/src/model.rs b/llama-cpp-2/src/model.rs
index 38965200..39e3ea3b 100644
--- a/llama-cpp-2/src/model.rs
+++ b/llama-cpp-2/src/model.rs
@@ -1,6 +1,6 @@
 //! A safe wrapper around `llama_model`.
-use std::ffi::CString;
 use std::ffi::CStr;
+use std::ffi::CString;
 use std::num::NonZeroU16;
 use std::os::raw::c_int;
 use std::path::Path;
@@ -550,7 +550,11 @@ impl LlamaModel {
             if res > buff.len() as i32 {
                 return Err(ApplyChatTemplateError::BuffSizeError);
             }
-            Ok::<String, ApplyChatTemplateError>(CStr::from_ptr(buff.as_mut_ptr()).to_string_lossy().to_string())
+            Ok::<String, ApplyChatTemplateError>(
+                CStr::from_ptr(buff.as_mut_ptr())
+                    .to_string_lossy()
+                    .to_string(),
+            )
         }?;
         Ok(formatted_chat)
     }
diff --git a/llama-cpp-2/src/model/params/kv_overrides.rs b/llama-cpp-2/src/model/params/kv_overrides.rs
index 7d10256d..8bbcbdd4 100644
--- a/llama-cpp-2/src/model/params/kv_overrides.rs
+++ b/llama-cpp-2/src/model/params/kv_overrides.rs
@@ -33,17 +33,13 @@ impl ParamOverrideValue {
                 llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 { val_bool: *value }
             }
             ParamOverrideValue::Float(value) => {
-                llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 {
-                    val_f64: *value,
-                }
+                llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 { val_f64: *value }
             }
             ParamOverrideValue::Int(value) => {
                 llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 { val_i64: *value }
             }
             ParamOverrideValue::Str(c_string) => {
-                llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 {
-                    val_str: *c_string,
-                }
+                llama_cpp_sys_2::llama_model_kv_override__bindgen_ty_1 { val_str: *c_string }
             }
         }
     }
diff --git a/llama-cpp-2/src/token_type.rs b/llama-cpp-2/src/token_type.rs
index 47eaf287..ca51be36 100644
--- a/llama-cpp-2/src/token_type.rs
+++ b/llama-cpp-2/src/token_type.rs
@@ -42,7 +42,7 @@ impl TryFrom<llama_cpp_sys_2::llama_vocab_type> for LlamaTokenAttrs {
     type Error = LlamaTokenTypeFromIntError;
 
     fn try_from(value: llama_cpp_sys_2::llama_vocab_type) -> Result<Self, Self::Error> {
-        Ok(Self(BitFlags::from_bits(value).map_err(|e| {
+        Ok(Self(BitFlags::from_bits(value as _).map_err(|e| {
             LlamaTokenTypeFromIntError::UnknownValue(e.invalid_bits())
         })?))
     }
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 35b393ec..24bd7fc5 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -91,11 +91,9 @@ fn compile_bindings(
     llama_header_path: &Path,
 ) -> Result<(), Box<dyn std::error::Error>> {
     println!("Generating bindings..");
-
-    let includes = [
-        llama_header_path.join("ggml").join("include"),
-    ];
-
+
+    let includes = [llama_header_path.join("ggml").join("include")];
+
     let bindings = bindgen::Builder::default()
         .clang_args(includes.map(|path| format!("-I{}", path.to_string_lossy())))
         .header(
@@ -425,9 +423,7 @@ fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'stati
     //     nvcc.flag("-Wno-pedantic");
     // }
 
-    for lib in [
-        "cuda", "cublas", "cudart", "cublasLt"
-    ] {
+    for lib in ["cuda", "cublas", "cudart", "cublasLt"] {
         println!("cargo:rustc-link-lib={}", lib);
     }
     if !nvcc.get_compiler().is_like_msvc() {
@@ -623,31 +619,44 @@ fn gen_vulkan_shaders(out_path: impl AsRef<Path>) -> (impl AsRef<Path>, impl AsR
         .cpp(true)
         .get_compiler();
 
-    assert!(!cxx.is_like_msvc(), "Compiling Vulkan GGML with MSVC is not supported at this time.");
+    assert!(
+        !cxx.is_like_msvc(),
+        "Compiling Vulkan GGML with MSVC is not supported at this time."
+    );
 
     let vulkan_shaders_gen_bin = out_path.as_ref().join("vulkan-shaders-gen");
 
     cxx.to_command()
         .args([
-            vulkan_shaders_src.join("vulkan-shaders-gen.cpp").as_os_str(),
-            "-o".as_ref(), vulkan_shaders_gen_bin.as_os_str()
+            vulkan_shaders_src
+                .join("vulkan-shaders-gen.cpp")
+                .as_os_str(),
+            "-o".as_ref(),
+            vulkan_shaders_gen_bin.as_os_str(),
         ])
-        .output().expect("Could not compile Vulkan shader generator");
+        .output()
+        .expect("Could not compile Vulkan shader generator");
 
     let header = out_path.as_ref().join("ggml-vulkan-shaders.hpp");
     let source = out_path.as_ref().join("ggml-vulkan-shaders.cpp");
 
     Command::new(vulkan_shaders_gen_bin)
         .args([
-            "--glslc".as_ref(), "glslc".as_ref(),
-            "--input-dir".as_ref(), vulkan_shaders_src.as_os_str(),
-            "--output-dir".as_ref(), out_path.as_ref().join("vulkan-shaders.spv").as_os_str(),
-            "--target-hpp".as_ref(), header.as_os_str(),
-            "--target-cpp".as_ref(), source.as_os_str(),
-            "--no-clean".as_ref()
+            "--glslc".as_ref(),
+            "glslc".as_ref(),
+            "--input-dir".as_ref(),
+            vulkan_shaders_src.as_os_str(),
+            "--output-dir".as_ref(),
+            out_path.as_ref().join("vulkan-shaders.spv").as_os_str(),
+            "--target-hpp".as_ref(),
+            header.as_os_str(),
+            "--target-cpp".as_ref(),
+            source.as_os_str(),
+            "--no-clean".as_ref(),
         ])
-        .output().expect("Could not run Vulkan shader generator");
-
+        .output()
+        .expect("Could not run Vulkan shader generator");
+
     (out_path, source)
 }
 
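Note for reviewers: `encoding_rs` moves from the deleted `usage` crate into `[dev-dependencies]` of `llama-cpp-2` because it is only needed by the relocated example, which streams bytes through `Decoder::decode_to_string` since a single token returned by `token_to_bytes` may not be a complete UTF-8 sequence. Below is a minimal standalone sketch of that decoding pattern; it is not part of the patch, the byte chunks are invented for illustration, and it only assumes the `encoding_rs` crate is available (as a dev-dependency, the same way the example now picks it up).

```rust
use encoding_rs::UTF_8;

fn main() {
    // "é" (0xC3 0xA9) split across two chunks, the way successive
    // token_to_bytes() calls can split a multi-byte code point.
    let chunks: [&[u8]; 2] = [b"caf\xC3", b"\xA9 time"];

    let mut decoder = UTF_8.new_decoder();
    // decode_to_string() writes into the spare capacity of the destination
    // rather than growing it, so reserve space up front (the example reserves
    // 32 bytes per token for the same reason).
    let mut out = String::with_capacity(64);

    for (i, &chunk) in chunks.iter().enumerate() {
        let last = i == chunks.len() - 1;
        // The dangling 0xC3 is buffered inside the decoder and emitted as "é"
        // only once the continuation byte arrives, so no U+FFFD replacement
        // characters appear at chunk boundaries.
        let _ = decoder.decode_to_string(chunk, &mut out, last);
    }

    assert_eq!(out, "café time");
    println!("{out}");
}
```

A one-shot `String::from_utf8_lossy` per token would instead insert replacement characters whenever a code point straddles two tokens, which is why the example keeps a single long-lived decoder across loop iterations.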