From 3fd59498a25b28385a90e8984d67d66c2a6053fe Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Mon, 14 Nov 2022 19:39:40 -0600 Subject: [PATCH 1/3] Enable code for dynamic parallelism --- crates/cuda_std/src/lib.rs | 7 +++---- crates/cuda_std/src/rt/error.rs | 2 +- crates/cuda_std/src/rt/mod.rs | 16 +++++++++------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs index 33e7b28..472d393 100644 --- a/crates/cuda_std/src/lib.rs +++ b/crates/cuda_std/src/lib.rs @@ -36,17 +36,16 @@ extern crate alloc; +pub mod atomic; +pub mod cfg; pub mod float; #[allow(warnings)] pub mod intrinsics; pub mod io; pub mod mem; pub mod misc; -// WIP -// pub mod rt; -pub mod atomic; -pub mod cfg; pub mod ptr; +pub mod rt; pub mod shared; pub mod thread; pub mod warp; diff --git a/crates/cuda_std/src/rt/error.rs b/crates/cuda_std/src/rt/error.rs index 18992c2..024a9de 100644 --- a/crates/cuda_std/src/rt/error.rs +++ b/crates/cuda_std/src/rt/error.rs @@ -132,7 +132,7 @@ pub enum CudaError { /// Result type for most CUDA functions. pub type CudaResult = Result; -pub(crate) trait ToResult { +pub trait ToResult { fn to_result(self) -> CudaResult<()>; } impl ToResult for cudaError_t { diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs index 36ce621..98c263e 100644 --- a/crates/cuda_std/src/rt/mod.rs +++ b/crates/cuda_std/src/rt/mod.rs @@ -64,16 +64,18 @@ impl Drop for Stream { #[macro_export] macro_rules! launch { ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ - let grid_dim = ::$crate::rt::GridDim::from($grid_dim); - let block_dim = ::$crate::rt::BlockDim::from($block_dim); - let mut buf = ::$crate::rt::sys::cudaGetParameterBufferV2( + use $crate::rt::ToResult; + use $crate::float::GpuFloat; + let grid_dim = $crate::rt::GridSize::from($grid_dim); + let block_dim = $crate::rt::BlockSize::from($block_dim); + let mut buf = $crate::rt::sys::cudaGetParameterBufferV2( &$func as *const _ as *const ::core::ffi::c_void, - ::$crate::rt::sys::dim3 { + $crate::rt::sys::dim3 { x: grid_dim.x, y: grid_dim.y, z: grid_dim.z }, - ::$crate::rt::sys::dim3 { + $crate::rt::sys::dim3 { x: block_dim.x, y: block_dim.y, z: block_dim.z @@ -84,7 +86,7 @@ macro_rules! launch { let mut offset = 0; $( let param = $param; - let size = ::core::mem::size_of_val(¶m) + let size = ::core::mem::size_of_val(¶m); let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; offset = buf_idx * size; let ptr = ¶m as *const _ as *const u8; @@ -95,7 +97,7 @@ macro_rules! launch { if false { $func($($param),*); } - $stream.launch(buf as *mut ::core::ffi::c_void).to_result() + $stream.launch(buf as *mut ::core::ffi::c_void) }}; } From 7d622ca88fd13c0c34e4e94ef784a4bdcf7a131e Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Sun, 20 Nov 2022 00:03:15 -0600 Subject: [PATCH 2/3] WIP: working solution, but a lot of cruft. Clean up and unwind the unneeded stuff --- crates/cuda_std/src/rt/mod.rs | 115 +++++++++++++++++++++++------ crates/cuda_std/src/rt/sys.rs | 10 +++ crates/cust/src/link.rs | 22 ++++++ crates/cust/src/module.rs | 2 +- crates/rustc_codegen_nvvm/build.rs | 2 +- 5 files changed, 128 insertions(+), 23 deletions(-) diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs index 98c263e..675f886 100644 --- a/crates/cuda_std/src/rt/mod.rs +++ b/crates/cuda_std/src/rt/mod.rs @@ -31,10 +31,15 @@ bitflags::bitflags! { #[derive(Debug)] pub struct Stream { - raw: cuda::cudaStream_t, + pub raw: cuda::cudaStream_t, } impl Stream { + // /// Creates a new stream with flags. + // pub fn new(flags: StreamFlags) -> Self { + // Self {} + // } + /// Creates a new stream with flags. pub fn new(flags: StreamFlags) -> CudaResult { let mut stream = MaybeUninit::uninit(); @@ -47,10 +52,11 @@ impl Stream { } } - #[doc(hidden)] - pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { - unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() } - } + // #[doc(hidden)] + // pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { + // unsafe { cuda::cudaLaunchDeviceV2(param_buf, core::ptr::null_mut()).to_result() } + // // unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() } + // } } impl Drop for Stream { @@ -63,13 +69,17 @@ impl Drop for Stream { #[macro_export] macro_rules! launch { - ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ + // ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ + ($func:ident<<<$grid_dim:expr, $block_dim:expr, ($smem_size:expr)>>>($($param:expr),* $(,)?)) => {{ use $crate::rt::ToResult; use $crate::float::GpuFloat; let grid_dim = $crate::rt::GridSize::from($grid_dim); let block_dim = $crate::rt::BlockSize::from($block_dim); + + // Get a device buffer for kernel launch. + let fptr = $func as *const (); let mut buf = $crate::rt::sys::cudaGetParameterBufferV2( - &$func as *const _ as *const ::core::ffi::c_void, + fptr as *const ::core::ffi::c_void, $crate::rt::sys::dim3 { x: grid_dim.x, y: grid_dim.y, @@ -80,24 +90,87 @@ macro_rules! launch { y: block_dim.y, z: block_dim.z }, - $smem_size - ) as *mut u8; - unsafe { - let mut offset = 0; - $( - let param = $param; - let size = ::core::mem::size_of_val(¶m); - let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; - offset = buf_idx * size; - let ptr = ¶m as *const _ as *const u8; - let dst = buf.add(offset); - ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); - )* + $smem_size, + ); + + // Ensure buffer is not a nil ptr. + if buf.is_null() { + return; } + + // Load data into buffer. + let mut offset = 0; + $( + let param = $param; + let size = ::core::mem::size_of_val(¶m); + let param_ptr = ¶m as *const _ as *const ::core::ffi::c_void; + let dst = buf.add(offset).copy_from(param_ptr, size); + offset += size; + )* if false { $func($($param),*); } - $stream.launch(buf as *mut ::core::ffi::c_void) + // unsafe { + // let mut offset = 0; + // $( + // let param = $param; + // let size = ::core::mem::size_of_val(¶m); + // let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; + // offset = buf_idx * size; + // let ptr = ¶m as *const _ as *const u8; + // let dst = buf.add(offset); + // ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); + // )* + // } + // if false { + // $func($($param),*); + // } + + // Launch the kernel. + $crate::rt::sys::cudaLaunchDeviceV2(buf as *mut ::core::ffi::c_void, ::core::ptr::null_mut() as *mut _) + + // let mut buf = $crate::rt::sys::cudaGetParameterBuffer(alignment, size) as *mut u8; + + // // Populate the buffer with given arguments. + // let mut offset = 0; + // $( + // let param = $param; + // let size = ::core::mem::size_of_val(¶m); + // let buf_bytes_ptr = (buf as *mut u8).add(offset); + // ::core::ptr::copy_nonoverlapping($param as *const _, buf_bytes_ptr.into(), size); + // offset += size; + // )* + + // let mut offset = 0; + // $( + // let param = $param; + // let size = ::core::mem::size_of_val(¶m); + // let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; + // offset = buf_idx * size; + // let ptr = ¶m as *const _ as *const u8; + // let dst = buf.add(offset); + // ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); + // )* + + // // Launch the kernel. + // let fptr = $func as *const (); + // $crate::rt::sys::cudaLaunchDevice( + // fptr as *const ::core::ffi::c_void, + // buf as *mut ::core::ffi::c_void, + // $crate::rt::sys::dim3 { + // x: grid_dim.x, + // y: grid_dim.y, + // z: grid_dim.z + // }, + // $crate::rt::sys::dim3 { + // x: block_dim.x, + // y: block_dim.y, + // z: block_dim.z + // }, + // $smem_size, + // ::core::ptr::null_mut() as *mut _, + // // $stream.raw, + // ) }}; } diff --git a/crates/cuda_std/src/rt/sys.rs b/crates/cuda_std/src/rt/sys.rs index 5a0bfc4..5fad1b9 100644 --- a/crates/cuda_std/src/rt/sys.rs +++ b/crates/cuda_std/src/rt/sys.rs @@ -14,6 +14,16 @@ pub use crate::rt::driver_types_sys::*; // to share this stuff with cust. extern "C" { + pub fn cudaGetParameterBuffer(alignment: usize, size: usize) -> *mut c_void; + pub fn cudaLaunchDevice( + func: *const c_void, + parameterBuffer: *const c_void, + gridDimension: dim3, + blockDimension: dim3, + sharedMemSize: c_uint, + stream: cudaStream_t, + ) -> cudaError_t; + pub fn cudaDeviceGetAttribute( value: *mut c_int, attr: cudaDeviceAttr, diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs index fc231d9..6219aaa 100644 --- a/crates/cust/src/link.rs +++ b/crates/cust/src/link.rs @@ -114,6 +114,28 @@ impl Linker { } } + /// Link device runtime lib. + pub fn add_libcudadevrt(&mut self) -> CudaResult<()> { + let mut bytes = std::fs::read("/usr/local/cuda-11/lib64/libcudadevrt.a") + .expect("could not read libcudadevrt.a"); + + unsafe { + cuda::cuLinkAddData_v2( + self.raw, + cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY, + // cuda_sys wants *mut but from the API docs we know we retain ownership so + // this cast is sound. + bytes.as_mut_ptr() as *mut _, + bytes.len(), + UNNAMED.as_ptr().cast(), + 0, + std::ptr::null_mut(), + std::ptr::null_mut(), + ) + .to_result() + } + } + /// Runs the linker to generate the final cubin bytes. Also returns a duration /// for how long it took to run the linker. pub fn complete(self) -> CudaResult> { diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs index 9e71f57..0813aa5 100644 --- a/crates/cust/src/module.rs +++ b/crates/cust/src/module.rs @@ -338,7 +338,7 @@ impl Module { /// ``` #[deprecated( since = "0.3.0", - note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing + note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing an empty slice of options (usually) " )] diff --git a/crates/rustc_codegen_nvvm/build.rs b/crates/rustc_codegen_nvvm/build.rs index e5ed44f..3ea30bc 100644 --- a/crates/rustc_codegen_nvvm/build.rs +++ b/crates/rustc_codegen_nvvm/build.rs @@ -21,7 +21,7 @@ fn main() { // this is set by cuda_builder, but in case somebody is using the codegen // manually, default to 520 (which is what nvvm defaults to). if option_env!("CUDA_ARCH").is_none() { - println!("cargo:rustc-env=CUDA_ARCH=520") + println!("cargo:rustc-env=CUDA_ARCH=750") } } From 95f1066d5c07e1a99e8f71db32463a005ca98f3a Mon Sep 17 00:00:00 2001 From: Anthony Dodd Date: Wed, 29 Mar 2023 20:10:54 -0500 Subject: [PATCH 3/3] WIP: pop this and continue experimentation --- crates/cuda_std/src/rt/mod.rs | 78 +++--------------------------- crates/cust/Cargo.toml | 1 + crates/cust/src/link.rs | 18 ++++--- crates/find_cuda_helper/src/lib.rs | 10 ++++ 4 files changed, 30 insertions(+), 77 deletions(-) diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs index 675f886..edd54bb 100644 --- a/crates/cuda_std/src/rt/mod.rs +++ b/crates/cuda_std/src/rt/mod.rs @@ -35,11 +35,6 @@ pub struct Stream { } impl Stream { - // /// Creates a new stream with flags. - // pub fn new(flags: StreamFlags) -> Self { - // Self {} - // } - /// Creates a new stream with flags. pub fn new(flags: StreamFlags) -> CudaResult { let mut stream = MaybeUninit::uninit(); @@ -52,11 +47,10 @@ impl Stream { } } - // #[doc(hidden)] - // pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { - // unsafe { cuda::cudaLaunchDeviceV2(param_buf, core::ptr::null_mut()).to_result() } - // // unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() } - // } + #[doc(hidden)] + pub unsafe fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> { + cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() + } } impl Drop for Stream { @@ -69,8 +63,7 @@ impl Drop for Stream { #[macro_export] macro_rules! launch { - // ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ - ($func:ident<<<$grid_dim:expr, $block_dim:expr, ($smem_size:expr)>>>($($param:expr),* $(,)?)) => {{ + ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{ use $crate::rt::ToResult; use $crate::float::GpuFloat; let grid_dim = $crate::rt::GridSize::from($grid_dim); @@ -108,69 +101,12 @@ macro_rules! launch { offset += size; )* if false { + // Ensure function call compatibility at compile time. $func($($param),*); } - // unsafe { - // let mut offset = 0; - // $( - // let param = $param; - // let size = ::core::mem::size_of_val(¶m); - // let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; - // offset = buf_idx * size; - // let ptr = ¶m as *const _ as *const u8; - // let dst = buf.add(offset); - // ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); - // )* - // } - // if false { - // $func($($param),*); - // } // Launch the kernel. - $crate::rt::sys::cudaLaunchDeviceV2(buf as *mut ::core::ffi::c_void, ::core::ptr::null_mut() as *mut _) - - // let mut buf = $crate::rt::sys::cudaGetParameterBuffer(alignment, size) as *mut u8; - - // // Populate the buffer with given arguments. - // let mut offset = 0; - // $( - // let param = $param; - // let size = ::core::mem::size_of_val(¶m); - // let buf_bytes_ptr = (buf as *mut u8).add(offset); - // ::core::ptr::copy_nonoverlapping($param as *const _, buf_bytes_ptr.into(), size); - // offset += size; - // )* - - // let mut offset = 0; - // $( - // let param = $param; - // let size = ::core::mem::size_of_val(¶m); - // let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1; - // offset = buf_idx * size; - // let ptr = ¶m as *const _ as *const u8; - // let dst = buf.add(offset); - // ::core::ptr::copy_nonoverlapping(¶m as *const _ as *const u8, dst, size); - // )* - - // // Launch the kernel. - // let fptr = $func as *const (); - // $crate::rt::sys::cudaLaunchDevice( - // fptr as *const ::core::ffi::c_void, - // buf as *mut ::core::ffi::c_void, - // $crate::rt::sys::dim3 { - // x: grid_dim.x, - // y: grid_dim.y, - // z: grid_dim.z - // }, - // $crate::rt::sys::dim3 { - // x: block_dim.x, - // y: block_dim.y, - // z: block_dim.z - // }, - // $smem_size, - // ::core::ptr::null_mut() as *mut _, - // // $stream.raw, - // ) + $stream.launch(buf) }}; } diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml index 39eab9c..d63a746 100644 --- a/crates/cust/Cargo.toml +++ b/crates/cust/Cargo.toml @@ -22,6 +22,7 @@ mint = { version = "^0.5", optional = true } num-complex = { version = "0.4", optional = true } vek = { version = "0.15.1", optional = true, default-features = false } bytemuck = { version = "1.7.3", optional = true } +find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" } [features] default= ["bytemuck"] diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs index 6219aaa..53a1c75 100644 --- a/crates/cust/src/link.rs +++ b/crates/cust/src/link.rs @@ -3,9 +3,9 @@ use std::mem::MaybeUninit; use std::ptr::null_mut; +use crate::error::{CudaError, CudaResult, ToResult}; use crate::sys as cuda; - -use crate::error::{CudaResult, ToResult}; +use find_cuda_helper::find_lib_cudadevrt; static UNNAMED: &str = "\0"; @@ -25,6 +25,12 @@ impl Linker { // Therefore we use box to alloc the memory for us, then into_raw it so we now have ownership // of the memory (and dont have any aliasing requirements attached either). + // // Just take advantage of C memory model and just pass individual elements, as there is only 1. + // let num_options: u32 = 1; + // let opt = &mut cuda::CUjit_option::CU_JIT_TARGET as *mut _; + // let mut opt_val = + // &mut cuda::CUjit_target::CU_TARGET_COMPUTE_75 as *mut _ as *mut ::std::os::raw::c_void; + unsafe { let mut raw = MaybeUninit::uninit(); cuda::cuLinkCreate_v2(0, null_mut(), null_mut(), raw.as_mut_ptr()).to_result()?; @@ -116,15 +122,15 @@ impl Linker { /// Link device runtime lib. pub fn add_libcudadevrt(&mut self) -> CudaResult<()> { - let mut bytes = std::fs::read("/usr/local/cuda-11/lib64/libcudadevrt.a") - .expect("could not read libcudadevrt.a"); + let path = find_lib_cudadevrt().ok_or_else(|| CudaError::FileNotFound)?; + let mut bytes = std::fs::read(path) + // TODO: don't panic, update the result type instead. + .expect("error linking libcudadevrt.a"); unsafe { cuda::cuLinkAddData_v2( self.raw, cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY, - // cuda_sys wants *mut but from the API docs we know we retain ownership so - // this cast is sound. bytes.as_mut_ptr() as *mut _, bytes.len(), UNNAMED.as_ptr().cast(), diff --git a/crates/find_cuda_helper/src/lib.rs b/crates/find_cuda_helper/src/lib.rs index b3943be..53fd852 100644 --- a/crates/find_cuda_helper/src/lib.rs +++ b/crates/find_cuda_helper/src/lib.rs @@ -150,6 +150,16 @@ pub fn find_cuda_lib_dirs() -> Vec { valid_paths } +/// Find the location of `libcudadevrt.a`. +pub fn find_lib_cudadevrt() -> Option { + let root = find_cuda_root()?; + let lib = root.join("lib64").join("libcudadevrt.a"); + if lib.is_file() { + return Some(lib); + } + None +} + #[cfg(target_os = "windows")] pub fn find_optix_root() -> Option { // the optix SDK installer sets OPTIX_ROOT_DIR whenever it installs.