diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs index 33e7b28..472d393 100644 --- a/crates/cuda_std/src/lib.rs +++ b/crates/cuda_std/src/lib.rs @@ -36,17 +36,16 @@ extern crate alloc; +pub mod atomic; +pub mod cfg; pub mod float; #[allow(warnings)] pub mod intrinsics; pub mod io; pub mod mem; pub mod misc; -// WIP -// pub mod rt; -pub mod atomic; -pub mod cfg; pub mod ptr; +pub mod rt; pub mod shared; pub mod thread; pub mod warp; diff --git a/crates/cuda_std/src/rt/error.rs b/crates/cuda_std/src/rt/error.rs index 18992c2..024a9de 100644 --- a/crates/cuda_std/src/rt/error.rs +++ b/crates/cuda_std/src/rt/error.rs @@ -132,7 +132,7 @@ pub enum CudaError { /// Result type for most CUDA functions. pub type CudaResult = Result; -pub(crate) trait ToResult { +pub trait ToResult { fn to_result(self) -> CudaResult<()>; } impl ToResult for cudaError_t { diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs index 36ce621..edd54bb 100644 --- a/crates/cuda_std/src/rt/mod.rs +++ b/crates/cuda_std/src/rt/mod.rs @@ -31,7 +31,7 @@ bitflags::bitflags! { #[derive(Debug)] pub struct Stream { - raw: cuda::cudaStream_t, + pub raw: cuda::cudaStream_t, } impl Stream { @@ -48,8 +48,8 @@ impl Stream { } #[doc(hidden)] - pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { - unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() } + pub unsafe fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { + cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() } } @@ -64,38 +64,49 @@ impl Drop for Stream { #[macro_export] macro_rules! launch { ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ - let grid_dim = ::$crate::rt::GridDim::from($grid_dim); - let block_dim = ::$crate::rt::BlockDim::from($block_dim); - let mut buf = ::$crate::rt::sys::cudaGetParameterBufferV2( - &$func as *const _ as *const ::core::ffi::c_void, - ::$crate::rt::sys::dim3 { + use $crate::rt::ToResult; + use $crate::float::GpuFloat; + let grid_dim = $crate::rt::GridSize::from($grid_dim); + let block_dim = $crate::rt::BlockSize::from($block_dim); + + // Get a device buffer for kernel launch. + let fptr = $func as *const (); + let mut buf = $crate::rt::sys::cudaGetParameterBufferV2( + fptr as *const ::core::ffi::c_void, + $crate::rt::sys::dim3 { x: grid_dim.x, y: grid_dim.y, z: grid_dim.z }, - ::$crate::rt::sys::dim3 { + $crate::rt::sys::dim3 { x: block_dim.x, y: block_dim.y, z: block_dim.z }, - $smem_size - ) as *mut u8; - unsafe { - let mut offset = 0; - $( - let param = $param; - let size = ::core::mem::size_of_val(¶m) - let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; - offset = buf_idx * size; - let ptr = ¶m as *const _ as *const u8; - let dst = buf.add(offset); - ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); - )* + $smem_size, + ); + + // Ensure buffer is not a nil ptr. + if buf.is_null() { + return; } + + // Load data into buffer. + let mut offset = 0; + $( + let param = $param; + let size = ::core::mem::size_of_val(¶m); + let param_ptr = ¶m as *const _ as *const ::core::ffi::c_void; + let dst = buf.add(offset).copy_from(param_ptr, size); + offset += size; + )* if false { + // Ensure function call compatibility at compile time. $func($($param),*); } - $stream.launch(buf as *mut ::core::ffi::c_void).to_result() + + // Launch the kernel. + $stream.launch(buf) }}; } diff --git a/crates/cuda_std/src/rt/sys.rs b/crates/cuda_std/src/rt/sys.rs index 5a0bfc4..5fad1b9 100644 --- a/crates/cuda_std/src/rt/sys.rs +++ b/crates/cuda_std/src/rt/sys.rs @@ -14,6 +14,16 @@ pub use crate::rt::driver_types_sys::*; // to share this stuff with cust. extern "C" { + pub fn cudaGetParameterBuffer(alignment: usize, size: usize) -> *mut c_void; + pub fn cudaLaunchDevice( + func: *const c_void, + parameterBuffer: *const c_void, + gridDimension: dim3, + blockDimension: dim3, + sharedMemSize: c_uint, + stream: cudaStream_t, + ) -> cudaError_t; + pub fn cudaDeviceGetAttribute( value: *mut c_int, attr: cudaDeviceAttr, diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml index 39eab9c..d63a746 100644 --- a/crates/cust/Cargo.toml +++ b/crates/cust/Cargo.toml @@ -22,6 +22,7 @@ mint = { version = "^0.5", optional = true } num-complex = { version = "0.4", optional = true } vek = { version = "0.15.1", optional = true, default-features = false } bytemuck = { version = "1.7.3", optional = true } +find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" } [features] default= ["bytemuck"] diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs index fc231d9..53a1c75 100644 --- a/crates/cust/src/link.rs +++ b/crates/cust/src/link.rs @@ -3,9 +3,9 @@ use std::mem::MaybeUninit; use std::ptr::null_mut; +use crate::error::{CudaError, CudaResult, ToResult}; use crate::sys as cuda; - -use crate::error::{CudaResult, ToResult}; +use find_cuda_helper::find_lib_cudadevrt; static UNNAMED: &str = "\0"; @@ -25,6 +25,12 @@ impl Linker { // Therefore we use box to alloc the memory for us, then into_raw it so we now have ownership // of the memory (and dont have any aliasing requirements attached either). + // // Just take advantage of C memory model and just pass individual elements, as there is only 1. + // let num_options: u32 = 1; + // let opt = &mut cuda::CUjit_option::CU_JIT_TARGET as *mut _; + // let mut opt_val = + // &mut cuda::CUjit_target::CU_TARGET_COMPUTE_75 as *mut _ as *mut ::std::os::raw::c_void; + unsafe { let mut raw = MaybeUninit::uninit(); cuda::cuLinkCreate_v2(0, null_mut(), null_mut(), raw.as_mut_ptr()).to_result()?; @@ -114,6 +120,28 @@ impl Linker { } } + /// Link device runtime lib. + pub fn add_libcudadevrt(&mut self) -> CudaResult<()> { + let path = find_lib_cudadevrt().ok_or_else(|| CudaError::FileNotFound)?; + let mut bytes = std::fs::read(path) + // TODO: don't panic, update the result type instead. + .expect("error linking libcudadevrt.a"); + + unsafe { + cuda::cuLinkAddData_v2( + self.raw, + cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY, + bytes.as_mut_ptr() as *mut _, + bytes.len(), + UNNAMED.as_ptr().cast(), + 0, + std::ptr::null_mut(), + std::ptr::null_mut(), + ) + .to_result() + } + } + /// Runs the linker to generate the final cubin bytes. Also returns a duration /// for how long it took to run the linker. pub fn complete(self) -> CudaResult> { diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs index 9e71f57..0813aa5 100644 --- a/crates/cust/src/module.rs +++ b/crates/cust/src/module.rs @@ -338,7 +338,7 @@ impl Module { /// ``` #[deprecated( since = "0.3.0", - note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing + note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing an empty slice of options (usually) " )] diff --git a/crates/find_cuda_helper/src/lib.rs b/crates/find_cuda_helper/src/lib.rs index b3943be..53fd852 100644 --- a/crates/find_cuda_helper/src/lib.rs +++ b/crates/find_cuda_helper/src/lib.rs @@ -150,6 +150,16 @@ pub fn find_cuda_lib_dirs() -> Vec { valid_paths } +/// Find the location of `libcudadevrt.a`. +pub fn find_lib_cudadevrt() -> Option { + let root = find_cuda_root()?; + let lib = root.join("lib64").join("libcudadevrt.a"); + if lib.is_file() { + return Some(lib); + } + None +} + #[cfg(target_os = "windows")] pub fn find_optix_root() -> Option { // the optix SDK installer sets OPTIX_ROOT_DIR whenever it installs. diff --git a/crates/rustc_codegen_nvvm/build.rs b/crates/rustc_codegen_nvvm/build.rs index e5ed44f..3ea30bc 100644 --- a/crates/rustc_codegen_nvvm/build.rs +++ b/crates/rustc_codegen_nvvm/build.rs @@ -21,7 +21,7 @@ fn main() { // this is set by cuda_builder, but in case somebody is using the codegen // manually, default to 520 (which is what nvvm defaults to). if option_env!("CUDA_ARCH").is_none() { - println!("cargo:rustc-env=CUDA_ARCH=520") + println!("cargo:rustc-env=CUDA_ARCH=750") } }