diff --git a/Cargo.lock b/Cargo.lock
index efa53644..8f839616 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -494,6 +494,7 @@ version = "0.1.56"
 dependencies = [
  "bindgen",
  "cc",
+ "glob",
  "once_cell",
 ]
 
diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index 454276d1..c3b2e940 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -16,9 +16,10 @@ tracing = { workspace = true }
 [features]
 cuda = ["llama-cpp-sys-2/cuda"]
 metal = ["llama-cpp-sys-2/metal"]
+hipblas = ["llama-cpp-sys-2/hipblas"]
 sampler = []
 
-[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
+[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
 llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = "0.1.48" }
 
 [lints]
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 52d63c7f..049922fb 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -12,6 +12,7 @@
 //! # Feature Flags
 //!
 //! - `cuda` enables CUDA gpu support.
+//! - `hipblas` enables hipBLAS (ROCm) gpu support (experimental).
 //! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
 use std::ffi::NulError;
 use std::fmt::Debug;
diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 0dae9980..e6323465 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -51,8 +51,9 @@ include = [
 bindgen = { workspace = true }
 cc = { workspace = true, features = ["parallel"] }
 once_cell = "1.19.0"
+glob = "0.3.1"
 
 [features]
 cuda = []
 metal = []
-
+hipblas = []
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 07e0e4ff..720ad9e3 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -1,11 +1,13 @@
-use std::env;
+use std::env::{self, VarError};
 use std::fs::{read_dir, File};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::Command;
+use std::str::FromStr;
 
 use cc::Build;
 use once_cell::sync::Lazy;
+use glob::glob;
 
 // This build file is based on:
 // https://github.com/mdrokz/rust-llama.cpp/blob/master/build.rs
@@ -365,23 +367,16 @@ fn compile_blis(cx: &mut Build) {
 }
 
 fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static str {
-    const DEFAULT_ROCM_PATH_STR: &str = "/opt/rocm/";
+    let rocm_path_str = env::var("ROCM_PATH").or(Ok::<String, VarError>(String::from_str("/opt/rocm/").unwrap())).unwrap();
 
-    let rocm_path_str = env::var("ROCM_PATH")
-        .map_err(|_| DEFAULT_ROCM_PATH_STR.to_string())
-        .unwrap();
-    println!("Compiling HIPBLAS GGML. Using ROCm from {rocm_path_str}");
+    println!("Compiling hipBLAS GGML. Using ROCm from {rocm_path_str}");
 
     let rocm_path = PathBuf::from(rocm_path_str);
     let rocm_include = rocm_path.join("include");
     let rocm_lib = rocm_path.join("lib");
     let rocm_hip_bin = rocm_path.join("bin/hipcc");
 
-    let cuda_lib = "ggml-cuda";
-    let cuda_file = cuda_lib.to_string() + ".cu";
-    let cuda_header = cuda_lib.to_string() + ".h";
-
-    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUBLAS"];
+    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUDA"];
     for def in defines {
         cx.define(def, None);
         cxx.define(def, None);
@@ -390,24 +385,39 @@ fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static
     cx.include(&rocm_include);
     cxx.include(&rocm_include);
+    let ggml_cuda = glob(LLAMA_PATH.join("ggml-cuda").join("*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_fattn = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-vec*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_wmma = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-wmma*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_mmq = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("mmq*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+
     hip.compiler(rocm_hip_bin)
         .std("c++11")
-        .file(LLAMA_PATH.join(cuda_file))
-        .include(LLAMA_PATH.join(cuda_header))
+        .define("LLAMA_CUDA_DMMV_X", Some("32"))
+        .define("LLAMA_CUDA_MMV_Y", Some("1"))
+        .define("LLAMA_CUDA_KQUANTS_ITER", Some("2"))
+        .file(LLAMA_PATH.join("ggml-cuda.cu"))
+        .files(ggml_cuda)
+        .files(ggml_template_fattn)
+        .files(ggml_template_wmma)
+        .files(ggml_template_mmq)
+        .include(LLAMA_PATH.join(""))
+        .include(LLAMA_PATH.join("ggml-cuda"))
        .define("GGML_USE_HIPBLAS", None)
-        .compile(cuda_lib);
+        .define("GGML_USE_CUDA", None)
+        .compile("ggml-cuda");
 
-    println!(
-        "cargo:rustc-link-search=native={}",
-        rocm_lib.to_string_lossy()
-    );
+    println!("cargo:rustc-link-search=native={}", rocm_lib.to_string_lossy());
 
     let rocm_libs = ["hipblas", "rocblas", "amdhip64"];
 
     for lib in rocm_libs {
         println!("cargo:rustc-link-lib={lib}");
     }
 
-    cuda_lib
+    "ggml-cuda"
 }
 
 fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str {
diff --git a/simple/src/main.rs b/simple/src/main.rs
index 8e6700c8..2fc4d817 100644
--- a/simple/src/main.rs
+++ b/simple/src/main.rs
@@ -44,7 +44,7 @@ struct Args {
     #[arg(short = 'o', value_parser = parse_key_val)]
     key_value_overrides: Vec<(String, ParamOverrideValue)>,
     /// Disable offloading layers to the gpu
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hipblas"))]
     #[clap(long)]
     disable_gpu: bool,
     #[arg(short = 's', long, help = "RNG seed (default: 1234)")]
@@ -124,7 +124,7 @@ fn main() -> Result<()> {
         model,
         prompt,
         file,
-        #[cfg(feature = "cuda")]
+        #[cfg(any(feature = "cuda", feature = "hipblas"))]
         disable_gpu,
         key_value_overrides,
         seed,
@@ -138,13 +138,13 @@ fn main() -> Result<()> {
 
     // offload all layers to the gpu
     let model_params = {
-        #[cfg(feature = "cuda")]
+        #[cfg(any(feature = "cuda", feature = "hipblas"))]
         if !disable_gpu {
            LlamaModelParams::default().with_n_gpu_layers(1000)
         } else {
             LlamaModelParams::default()
         }
-        #[cfg(not(feature = "cuda"))]
+        #[cfg(not(any(feature = "cuda", feature = "hipblas")))]
         LlamaModelParams::default()
     };
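
Note on the ROCM_PATH handling introduced in compile_hipblas above: the env::var("ROCM_PATH").or(Ok::<String, VarError>(...)).unwrap() chain is infallible (the .or(Ok(...)) guarantees the final .unwrap() never panics), but it is an unusual way to spell "default on error". A minimal standalone sketch of the equivalent lookup, assuming the only intended behavior is "use $ROCM_PATH if set, otherwise fall back to /opt/rocm/":

    use std::env;

    fn main() {
        // Use $ROCM_PATH when it is set to a valid UTF-8 value; otherwise
        // fall back to the conventional ROCm install prefix, matching what
        // the build script does.
        let rocm_path_str = env::var("ROCM_PATH").unwrap_or_else(|_| String::from("/opt/rocm/"));
        println!("Compiling hipBLAS GGML. Using ROCm from {rocm_path_str}");
    }

Written this way, the std::str::FromStr and VarError imports would become unnecessary, since String::from covers the String::from_str("/opt/rocm/").unwrap() case and no error type needs to be named.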