diff --git a/Cargo.lock b/Cargo.lock
index efa53644..8f839616 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -494,6 +494,7 @@ version = "0.1.56"
 dependencies = [
  "bindgen",
  "cc",
+ "glob",
  "once_cell",
 ]
 
diff --git a/llama-cpp-2/Cargo.toml b/llama-cpp-2/Cargo.toml
index 454276d1..c3b2e940 100644
--- a/llama-cpp-2/Cargo.toml
+++ b/llama-cpp-2/Cargo.toml
@@ -16,9 +16,10 @@ tracing = { workspace = true }
 [features]
 cuda = ["llama-cpp-sys-2/cuda"]
 metal = ["llama-cpp-sys-2/metal"]
+hipblas = ["llama-cpp-sys-2/hipblas"]
 sampler = []
 
-[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
+[target.'cfg(all(target_os = "macos", any(target_arch = "aarch64", target_arch = "arm64")))'.dependencies]
 llama-cpp-sys-2 = { path = "../llama-cpp-sys-2", features=["metal"], version = "0.1.48" }
 
 [lints]
diff --git a/llama-cpp-2/src/lib.rs b/llama-cpp-2/src/lib.rs
index 52d63c7f..049922fb 100644
--- a/llama-cpp-2/src/lib.rs
+++ b/llama-cpp-2/src/lib.rs
@@ -12,6 +12,7 @@
 //! # Feature Flags
 //!
 //! - `cuda` enables CUDA gpu support.
+//! - `hipblas` enables hipBLAS (ROCm) gpu support (experimental).
 //! - `sampler` adds the [`context::sample::sampler`] struct for a more rusty way of sampling.
 use std::ffi::NulError;
 use std::fmt::Debug;
diff --git a/llama-cpp-sys-2/Cargo.toml b/llama-cpp-sys-2/Cargo.toml
index 0dae9980..e6323465 100644
--- a/llama-cpp-sys-2/Cargo.toml
+++ b/llama-cpp-sys-2/Cargo.toml
@@ -51,8 +51,9 @@ include = [
 bindgen = { workspace = true }
 cc = { workspace = true, features = ["parallel"] }
 once_cell = "1.19.0"
+glob = "0.3.1"
 
 [features]
 cuda = []
 metal = []
-
+hipblas = []
diff --git a/llama-cpp-sys-2/build.rs b/llama-cpp-sys-2/build.rs
index 07e0e4ff..720ad9e3 100644
--- a/llama-cpp-sys-2/build.rs
+++ b/llama-cpp-sys-2/build.rs
@@ -1,11 +1,13 @@
-use std::env;
+use std::env::{self, VarError};
 use std::fs::{read_dir, File};
 use std::io::Write;
 use std::path::{Path, PathBuf};
 use std::process::Command;
+use std::str::FromStr;
 
 use cc::Build;
 use once_cell::sync::Lazy;
+use glob::glob;
 
 // This build file is based on:
 // https://github.com/mdrokz/rust-llama.cpp/blob/master/build.rs
@@ -365,23 +367,16 @@ fn compile_blis(cx: &mut Build) {
 }
 
 fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static str {
-    const DEFAULT_ROCM_PATH_STR: &str = "/opt/rocm/";
+    let rocm_path_str = env::var("ROCM_PATH").or(Ok::<String, VarError>(String::from_str("/opt/rocm/").unwrap())).unwrap();
 
-    let rocm_path_str = env::var("ROCM_PATH")
-        .map_err(|_| DEFAULT_ROCM_PATH_STR.to_string())
-        .unwrap();
-    println!("Compiling HIPBLAS GGML. Using ROCm from {rocm_path_str}");
+    println!("Compiling hipBLAS GGML. Using ROCm from {rocm_path_str}");
 
     let rocm_path = PathBuf::from(rocm_path_str);
     let rocm_include = rocm_path.join("include");
     let rocm_lib = rocm_path.join("lib");
     let rocm_hip_bin = rocm_path.join("bin/hipcc");
 
-    let cuda_lib = "ggml-cuda";
-    let cuda_file = cuda_lib.to_string() + ".cu";
-    let cuda_header = cuda_lib.to_string() + ".h";
-
-    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUBLAS"];
+    let defines = ["GGML_USE_HIPBLAS", "GGML_USE_CUDA"];
     for def in defines {
         cx.define(def, None);
         cxx.define(def, None);
@@ -390,24 +385,39 @@ fn compile_hipblas(cx: &mut Build, cxx: &mut Build, mut hip: Build) -> &'static
     cx.include(&rocm_include);
     cxx.include(&rocm_include);
+    let ggml_cuda = glob(LLAMA_PATH.join("ggml-cuda").join("*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_fattn = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-vec*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_wmma = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("fattn-wmma*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+    let ggml_template_mmq = glob(LLAMA_PATH.join("ggml-cuda").join("template-instances").join("mmq*.cu").to_str().unwrap())
+        .unwrap().filter_map(Result::ok).collect::<Vec<_>>();
+
     hip.compiler(rocm_hip_bin)
         .std("c++11")
-        .file(LLAMA_PATH.join(cuda_file))
-        .include(LLAMA_PATH.join(cuda_header))
+        .define("LLAMA_CUDA_DMMV_X", Some("32"))
+        .define("LLAMA_CUDA_MMV_Y", Some("1"))
+        .define("LLAMA_CUDA_KQUANTS_ITER", Some("2"))
+        .file(LLAMA_PATH.join("ggml-cuda.cu"))
+        .files(ggml_cuda)
+        .files(ggml_template_fattn)
+        .files(ggml_template_wmma)
+        .files(ggml_template_mmq)
+        .include(LLAMA_PATH.join(""))
+        .include(LLAMA_PATH.join("ggml-cuda"))
        .define("GGML_USE_HIPBLAS", None)
-        .compile(cuda_lib);
+        .define("GGML_USE_CUDA", None)
+        .compile("ggml-cuda");
 
-    println!(
-        "cargo:rustc-link-search=native={}",
-        rocm_lib.to_string_lossy()
-    );
+    println!("cargo:rustc-link-search=native={}", rocm_lib.to_string_lossy());
 
     let rocm_libs = ["hipblas", "rocblas", "amdhip64"];
 
     for lib in rocm_libs {
         println!("cargo:rustc-link-lib={lib}");
     }
 
-    cuda_lib
+    "ggml-cuda"
 }
 
 fn compile_cuda(cx: &mut Build, cxx: &mut Build, featless_cxx: Build) -> &'static str {
diff --git a/simple/src/main.rs b/simple/src/main.rs
index 8e6700c8..2fc4d817 100644
--- a/simple/src/main.rs
+++ b/simple/src/main.rs
@@ -44,7 +44,7 @@ struct Args {
     #[arg(short = 'o', value_parser = parse_key_val)]
     key_value_overrides: Vec<(String, ParamOverrideValue)>,
     /// Disable offloading layers to the gpu
-    #[cfg(feature = "cuda")]
+    #[cfg(any(feature = "cuda", feature = "hipblas"))]
     #[clap(long)]
     disable_gpu: bool,
     #[arg(short = 's', long, help = "RNG seed (default: 1234)")]
@@ -124,7 +124,7 @@ fn main() -> Result<()> {
         model,
         prompt,
         file,
-        #[cfg(feature = "cuda")]
+        #[cfg(any(feature = "cuda", feature = "hipblas"))]
         disable_gpu,
         key_value_overrides,
         seed,
@@ -138,13 +138,13 @@ fn main() -> Result<()> {
 
     // offload all layers to the gpu
     let model_params = {
-        #[cfg(feature = "cuda")]
+        #[cfg(any(feature = "cuda", feature = "hipblas"))]
         if !disable_gpu {
            LlamaModelParams::default().with_n_gpu_layers(1000)
         } else {
             LlamaModelParams::default()
         }
-        #[cfg(not(feature = "cuda"))]
+        #[cfg(not(any(feature = "cuda", feature = "hipblas")))]
         LlamaModelParams::default()
     };
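
Note on the ROCM_PATH handling introduced in compile_hipblas above: the env::var("ROCM_PATH").or(Ok::<String, VarError>(...)).unwrap() chain is infallible (the .or(Ok(...)) guarantees the final .unwrap() never panics), but it is an unusual way to spell "default on error". A minimal standalone sketch of the equivalent lookup, assuming the only intended behavior is "use $ROCM_PATH if set, otherwise fall back to /opt/rocm/":

    use std::env;

    fn main() {
        // Use $ROCM_PATH when it is set to a valid UTF-8 value; otherwise
        // fall back to the conventional ROCm install prefix, matching what
        // the build script does.
        let rocm_path_str = env::var("ROCM_PATH").unwrap_or_else(|_| String::from("/opt/rocm/"));
        println!("Compiling hipBLAS GGML. Using ROCm from {rocm_path_str}");
    }

Written this way, the std::str::FromStr and VarError imports would become unnecessary, since String::from covers the String::from_str("/opt/rocm/").unwrap() case and no error type needs to be named.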