From 3fd59498a25b28385a90e8984d67d66c2a6053fe Mon Sep 17 00:00:00 2001
From: Anthony Dodd <dodd.anthonyjosiah@gmail.com>
Date: Mon, 14 Nov 2022 19:39:40 -0600
Subject: [PATCH 1/3] Enable code for dynamic parallelism

---
 crates/cuda_std/src/lib.rs      |  7 +++----
 crates/cuda_std/src/rt/error.rs |  2 +-
 crates/cuda_std/src/rt/mod.rs   | 16 +++++++++-------
 3 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 33e7b28..472d393 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -36,17 +36,16 @@
 
 extern crate alloc;
 
+pub mod atomic;
+pub mod cfg;
 pub mod float;
 #[allow(warnings)]
 pub mod intrinsics;
 pub mod io;
 pub mod mem;
 pub mod misc;
-// WIP
-// pub mod rt;
-pub mod atomic;
-pub mod cfg;
 pub mod ptr;
+pub mod rt;
 pub mod shared;
 pub mod thread;
 pub mod warp;
diff --git a/crates/cuda_std/src/rt/error.rs b/crates/cuda_std/src/rt/error.rs
index 18992c2..024a9de 100644
--- a/crates/cuda_std/src/rt/error.rs
+++ b/crates/cuda_std/src/rt/error.rs
@@ -132,7 +132,7 @@ pub enum CudaError {
 /// Result type for most CUDA functions.
 pub type CudaResult<T> = Result<T, CudaError>;
 
-pub(crate) trait ToResult {
+pub trait ToResult {
     fn to_result(self) -> CudaResult<()>;
 }
 impl ToResult for cudaError_t {
diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
index 36ce621..98c263e 100644
--- a/crates/cuda_std/src/rt/mod.rs
+++ b/crates/cuda_std/src/rt/mod.rs
@@ -64,16 +64,18 @@ impl Drop for Stream {
 #[macro_export]
 macro_rules! launch {
     ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
-        let grid_dim = ::$crate::rt::GridDim::from($grid_dim);
-        let block_dim = ::$crate::rt::BlockDim::from($block_dim);
-        let mut buf = ::$crate::rt::sys::cudaGetParameterBufferV2(
+        use $crate::rt::ToResult;
+        use $crate::float::GpuFloat;
+        let grid_dim = $crate::rt::GridSize::from($grid_dim);
+        let block_dim = $crate::rt::BlockSize::from($block_dim);
+        let mut buf = $crate::rt::sys::cudaGetParameterBufferV2(
             &$func as *const _ as *const ::core::ffi::c_void,
-            ::$crate::rt::sys::dim3 {
+            $crate::rt::sys::dim3 {
                 x: grid_dim.x,
                 y: grid_dim.y,
                 z: grid_dim.z
             },
-            ::$crate::rt::sys::dim3 {
+            $crate::rt::sys::dim3 {
                 x: block_dim.x,
                 y: block_dim.y,
                 z: block_dim.z
@@ -84,7 +86,7 @@ macro_rules! launch {
             let mut offset = 0;
             $(
                 let param = $param;
-                let size = ::core::mem::size_of_val(&param)
+                let size = ::core::mem::size_of_val(&param);
                 let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
                 offset = buf_idx * size;
                 let ptr = &param as *const _ as *const u8;
@@ -95,7 +97,7 @@ macro_rules! launch {
         if false {
             $func($($param),*);
         }
-        $stream.launch(buf as *mut ::core::ffi::c_void).to_result()
+        $stream.launch(buf as *mut ::core::ffi::c_void)
     }};
 }
 

From 7d622ca88fd13c0c34e4e94ef784a4bdcf7a131e Mon Sep 17 00:00:00 2001
From: Anthony Dodd <dodd.anthonyjosiah@gmail.com>
Date: Sun, 20 Nov 2022 00:03:15 -0600
Subject: [PATCH 2/3] WIP: working solution, but a lot of cruft. Clean up and
 unwind the unneeded stuff

---
 crates/cuda_std/src/rt/mod.rs      | 115 +++++++++++++++++++++++------
 crates/cuda_std/src/rt/sys.rs      |  10 +++
 crates/cust/src/link.rs            |  22 ++++++
 crates/cust/src/module.rs          |   2 +-
 crates/rustc_codegen_nvvm/build.rs |   2 +-
 5 files changed, 128 insertions(+), 23 deletions(-)

diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
index 98c263e..675f886 100644
--- a/crates/cuda_std/src/rt/mod.rs
+++ b/crates/cuda_std/src/rt/mod.rs
@@ -31,10 +31,15 @@ bitflags::bitflags! {
 
 #[derive(Debug)]
 pub struct Stream {
-    raw: cuda::cudaStream_t,
+    pub raw: cuda::cudaStream_t,
 }
 
 impl Stream {
+    // /// Creates a new stream with flags.
+    // pub fn new(flags: StreamFlags) -> Self {
+    //     Self {}
+    // }
+
     /// Creates a new stream with flags.
     pub fn new(flags: StreamFlags) -> CudaResult<Self> {
         let mut stream = MaybeUninit::uninit();
@@ -47,10 +52,11 @@ impl Stream {
         }
     }
 
-    #[doc(hidden)]
-    pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
-        unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() }
-    }
+    // #[doc(hidden)]
+    // pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
+    //     unsafe { cuda::cudaLaunchDeviceV2(param_buf, core::ptr::null_mut()).to_result() }
+    //     // unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() }
+    // }
 }
 
 impl Drop for Stream {
@@ -63,13 +69,17 @@ impl Drop for Stream {
 
 #[macro_export]
 macro_rules! launch {
-    ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
+    // ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
+    ($func:ident<<<$grid_dim:expr, $block_dim:expr, ($smem_size:expr)>>>($($param:expr),* $(,)?)) => {{
         use $crate::rt::ToResult;
         use $crate::float::GpuFloat;
         let grid_dim = $crate::rt::GridSize::from($grid_dim);
         let block_dim = $crate::rt::BlockSize::from($block_dim);
+
+        // Get a device buffer for kernel launch.
+        let fptr = $func as *const ();
         let mut buf = $crate::rt::sys::cudaGetParameterBufferV2(
-            &$func as *const _ as *const ::core::ffi::c_void,
+            fptr as *const ::core::ffi::c_void,
             $crate::rt::sys::dim3 {
                 x: grid_dim.x,
                 y: grid_dim.y,
@@ -80,24 +90,87 @@ macro_rules! launch {
                 y: block_dim.y,
                 z: block_dim.z
             },
-            $smem_size
-        ) as *mut u8;
-        unsafe {
-            let mut offset = 0;
-            $(
-                let param = $param;
-                let size = ::core::mem::size_of_val(&param);
-                let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-                offset = buf_idx * size;
-                let ptr = &param as *const _ as *const u8;
-                let dst = buf.add(offset);
-                ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-            )*
+            $smem_size,
+        );
+
+        // Ensure buffer is not a nil ptr.
+        if buf.is_null() {
+            return;
         }
+
+        // Load data into buffer.
+        let mut offset = 0;
+        $(
+            let param = $param;
+            let size = ::core::mem::size_of_val(&param);
+            let param_ptr = &param as *const _ as *const ::core::ffi::c_void;
+            let dst = buf.add(offset).copy_from(param_ptr, size);
+            offset += size;
+        )*
         if false {
             $func($($param),*);
         }
-        $stream.launch(buf as *mut ::core::ffi::c_void)
+        // unsafe {
+        //     let mut offset = 0;
+        //     $(
+        //         let param = $param;
+        //         let size = ::core::mem::size_of_val(&param);
+        //         let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
+        //         offset = buf_idx * size;
+        //         let ptr = &param as *const _ as *const u8;
+        //         let dst = buf.add(offset);
+        //         ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
+        //     )*
+        // }
+        // if false {
+        //     $func($($param),*);
+        // }
+
+        // Launch the kernel.
+        $crate::rt::sys::cudaLaunchDeviceV2(buf as *mut ::core::ffi::c_void, ::core::ptr::null_mut() as *mut _)
+
+        // let mut buf = $crate::rt::sys::cudaGetParameterBuffer(alignment, size) as *mut u8;
+
+        // // Populate the buffer with given arguments.
+        // let mut offset = 0;
+        // $(
+        //     let param = $param;
+        //     let size = ::core::mem::size_of_val(&param);
+        //     let buf_bytes_ptr = (buf as *mut u8).add(offset);
+        //     ::core::ptr::copy_nonoverlapping($param as *const _, buf_bytes_ptr.into(), size);
+        //     offset += size;
+        // )*
+
+        // let mut offset = 0;
+        // $(
+        //     let param = $param;
+        //     let size = ::core::mem::size_of_val(&param);
+        //     let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
+        //     offset = buf_idx * size;
+        //     let ptr = &param as *const _ as *const u8;
+        //     let dst = buf.add(offset);
+        //     ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
+        // )*
+
+        // // Launch the kernel.
+        // let fptr = $func as *const ();
+        // $crate::rt::sys::cudaLaunchDevice(
+        //     fptr as *const ::core::ffi::c_void,
+        //     buf as *mut ::core::ffi::c_void,
+        //     $crate::rt::sys::dim3 {
+        //         x: grid_dim.x,
+        //         y: grid_dim.y,
+        //         z: grid_dim.z
+        //     },
+        //     $crate::rt::sys::dim3 {
+        //         x: block_dim.x,
+        //         y: block_dim.y,
+        //         z: block_dim.z
+        //     },
+        //     $smem_size,
+        //     ::core::ptr::null_mut() as *mut _,
+        //     // $stream.raw,
+        // )
     }};
 }
 
diff --git a/crates/cuda_std/src/rt/sys.rs b/crates/cuda_std/src/rt/sys.rs
index 5a0bfc4..5fad1b9 100644
--- a/crates/cuda_std/src/rt/sys.rs
+++ b/crates/cuda_std/src/rt/sys.rs
@@ -14,6 +14,16 @@ pub use crate::rt::driver_types_sys::*;
 // to share this stuff with cust.
 
 extern "C" {
+    pub fn cudaGetParameterBuffer(alignment: usize, size: usize) -> *mut c_void;
+    pub fn cudaLaunchDevice(
+        func: *const c_void,
+        parameterBuffer: *const c_void,
+        gridDimension: dim3,
+        blockDimension: dim3,
+        sharedMemSize: c_uint,
+        stream: cudaStream_t,
+    ) -> cudaError_t;
+
     pub fn cudaDeviceGetAttribute(
         value: *mut c_int,
         attr: cudaDeviceAttr,
diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs
index fc231d9..6219aaa 100644
--- a/crates/cust/src/link.rs
+++ b/crates/cust/src/link.rs
@@ -114,6 +114,28 @@ impl Linker {
         }
     }
 
+    /// Link device runtime lib.
+    pub fn add_libcudadevrt(&mut self) -> CudaResult<()> {
+        let mut bytes = std::fs::read("/usr/local/cuda-11/lib64/libcudadevrt.a")
+            .expect("could not read libcudadevrt.a");
+
+        unsafe {
+            cuda::cuLinkAddData_v2(
+                self.raw,
+                cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY,
+                // cuda_sys wants *mut but from the API docs we know we retain ownership so
+                // this cast is sound.
+                bytes.as_mut_ptr() as *mut _,
+                bytes.len(),
+                UNNAMED.as_ptr().cast(),
+                0,
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+            )
+            .to_result()
+        }
+    }
+
     /// Runs the linker to generate the final cubin bytes. Also returns a duration
     /// for how long it took to run the linker.
     pub fn complete(self) -> CudaResult<Vec<u8>> {
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
index 9e71f57..0813aa5 100644
--- a/crates/cust/src/module.rs
+++ b/crates/cust/src/module.rs
@@ -338,7 +338,7 @@ impl Module {
     /// ```
     #[deprecated(
         since = "0.3.0",
-        note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing 
+        note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing
     an empty slice of options (usually)
     "
     )]
diff --git a/crates/rustc_codegen_nvvm/build.rs b/crates/rustc_codegen_nvvm/build.rs
index e5ed44f..3ea30bc 100644
--- a/crates/rustc_codegen_nvvm/build.rs
+++ b/crates/rustc_codegen_nvvm/build.rs
@@ -21,7 +21,7 @@ fn main() {
     // this is set by cuda_builder, but in case somebody is using the codegen
     // manually, default to 520 (which is what nvvm defaults to).
     if option_env!("CUDA_ARCH").is_none() {
-        println!("cargo:rustc-env=CUDA_ARCH=520")
+        println!("cargo:rustc-env=CUDA_ARCH=750")
     }
 }
 

From 95f1066d5c07e1a99e8f71db32463a005ca98f3a Mon Sep 17 00:00:00 2001
From: Anthony Dodd <dodd.anthonyjosiah@gmail.com>
Date: Wed, 29 Mar 2023 20:10:54 -0500
Subject: [PATCH 3/3] WIP: pop this and continue experimentation

---
 crates/cuda_std/src/rt/mod.rs      | 78 +++---------------------------
 crates/cust/Cargo.toml             |  1 +
 crates/cust/src/link.rs            | 18 ++++---
 crates/find_cuda_helper/src/lib.rs | 10 ++++
 4 files changed, 30 insertions(+), 77 deletions(-)

diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
index 675f886..edd54bb 100644
--- a/crates/cuda_std/src/rt/mod.rs
+++ b/crates/cuda_std/src/rt/mod.rs
@@ -35,11 +35,6 @@ pub struct Stream {
 }
 
 impl Stream {
-    // /// Creates a new stream with flags.
-    // pub fn new(flags: StreamFlags) -> Self {
-    //     Self {}
-    // }
-
     /// Creates a new stream with flags.
     pub fn new(flags: StreamFlags) -> CudaResult<Self> {
         let mut stream = MaybeUninit::uninit();
@@ -52,11 +47,10 @@ impl Stream {
         }
     }
 
-    // #[doc(hidden)]
-    // pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
-    //     unsafe { cuda::cudaLaunchDeviceV2(param_buf, core::ptr::null_mut()).to_result() }
-    //     // unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() }
-    // }
+    #[doc(hidden)]
+    pub unsafe fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
+        cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result()
+    }
 }
 
 impl Drop for Stream {
@@ -69,8 +63,7 @@ impl Drop for Stream {
 
 #[macro_export]
 macro_rules! launch {
-    // ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
-    ($func:ident<<<$grid_dim:expr, $block_dim:expr, ($smem_size:expr)>>>($($param:expr),* $(,)?)) => {{
+    ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
         use $crate::rt::ToResult;
         use $crate::float::GpuFloat;
         let grid_dim = $crate::rt::GridSize::from($grid_dim);
@@ -108,69 +101,12 @@ macro_rules! launch {
             offset += size;
         )*
         if false {
+            // Ensure function call compatibility at compile time.
             $func($($param),*);
         }
-        // unsafe {
-        //     let mut offset = 0;
-        //     $(
-        //         let param = $param;
-        //         let size = ::core::mem::size_of_val(&param);
-        //         let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-        //         offset = buf_idx * size;
-        //         let ptr = &param as *const _ as *const u8;
-        //         let dst = buf.add(offset);
-        //         ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-        //     )*
-        // }
-        // if false {
-        //     $func($($param),*);
-        // }
 
         // Launch the kernel.
-        $crate::rt::sys::cudaLaunchDeviceV2(buf as *mut ::core::ffi::c_void, ::core::ptr::null_mut() as *mut _)
-
-        // let mut buf = $crate::rt::sys::cudaGetParameterBuffer(alignment, size) as *mut u8;
-
-        // // Populate the buffer with given arguments.
-        // let mut offset = 0;
-        // $(
-        //     let param = $param;
-        //     let size = ::core::mem::size_of_val(&param);
-        //     let buf_bytes_ptr = (buf as *mut u8).add(offset);
-        //     ::core::ptr::copy_nonoverlapping($param as *const _, buf_bytes_ptr.into(), size);
-        //     offset += size;
-        // )*
-
-        // let mut offset = 0;
-        // $(
-        //     let param = $param;
-        //     let size = ::core::mem::size_of_val(&param);
-        //     let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-        //     offset = buf_idx * size;
-        //     let ptr = &param as *const _ as *const u8;
-        //     let dst = buf.add(offset);
-        //     ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-        // )*
-
-        // // Launch the kernel.
-        // let fptr = $func as *const ();
-        // $crate::rt::sys::cudaLaunchDevice(
-        //     fptr as *const ::core::ffi::c_void,
-        //     buf as *mut ::core::ffi::c_void,
-        //     $crate::rt::sys::dim3 {
-        //         x: grid_dim.x,
-        //         y: grid_dim.y,
-        //         z: grid_dim.z
-        //     },
-        //     $crate::rt::sys::dim3 {
-        //         x: block_dim.x,
-        //         y: block_dim.y,
-        //         z: block_dim.z
-        //     },
-        //     $smem_size,
-        //     ::core::ptr::null_mut() as *mut _,
-        //     // $stream.raw,
-        // )
+        $stream.launch(buf)
     }};
 }
 
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index 39eab9c..d63a746 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -22,6 +22,7 @@ mint = { version = "^0.5", optional = true }
 num-complex = { version = "0.4", optional = true }
 vek = { version = "0.15.1", optional = true, default-features = false }
 bytemuck = { version = "1.7.3", optional = true }
+find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" }
 
 [features]
 default= ["bytemuck"]
diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs
index 6219aaa..53a1c75 100644
--- a/crates/cust/src/link.rs
+++ b/crates/cust/src/link.rs
@@ -3,9 +3,9 @@
 use std::mem::MaybeUninit;
 use std::ptr::null_mut;
 
+use crate::error::{CudaError, CudaResult, ToResult};
 use crate::sys as cuda;
-
-use crate::error::{CudaResult, ToResult};
+use find_cuda_helper::find_lib_cudadevrt;
 
 static UNNAMED: &str = "\0";
 
@@ -25,6 +25,12 @@ impl Linker {
         // Therefore we use box to alloc the memory for us, then into_raw it so we now have ownership
         // of the memory (and dont have any aliasing requirements attached either).
 
+        // // Just take advantage of C memory model and just pass individual elements, as there is only 1.
+        // let num_options: u32 = 1;
+        // let opt = &mut cuda::CUjit_option::CU_JIT_TARGET as *mut _;
+        // let mut opt_val =
+        //     &mut cuda::CUjit_target::CU_TARGET_COMPUTE_75 as *mut _ as *mut ::std::os::raw::c_void;
+
         unsafe {
             let mut raw = MaybeUninit::uninit();
             cuda::cuLinkCreate_v2(0, null_mut(), null_mut(), raw.as_mut_ptr()).to_result()?;
@@ -116,15 +122,15 @@ impl Linker {
 
     /// Link device runtime lib.
     pub fn add_libcudadevrt(&mut self) -> CudaResult<()> {
-        let mut bytes = std::fs::read("/usr/local/cuda-11/lib64/libcudadevrt.a")
-            .expect("could not read libcudadevrt.a");
+        let path = find_lib_cudadevrt().ok_or_else(|| CudaError::FileNotFound)?;
+        let mut bytes = std::fs::read(path)
+            // TODO: don't panic, update the result type instead.
+            .expect("error linking libcudadevrt.a");
 
         unsafe {
             cuda::cuLinkAddData_v2(
                 self.raw,
                 cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY,
-                // cuda_sys wants *mut but from the API docs we know we retain ownership so
-                // this cast is sound.
                 bytes.as_mut_ptr() as *mut _,
                 bytes.len(),
                 UNNAMED.as_ptr().cast(),
diff --git a/crates/find_cuda_helper/src/lib.rs b/crates/find_cuda_helper/src/lib.rs
index b3943be..53fd852 100644
--- a/crates/find_cuda_helper/src/lib.rs
+++ b/crates/find_cuda_helper/src/lib.rs
@@ -150,6 +150,16 @@ pub fn find_cuda_lib_dirs() -> Vec<PathBuf> {
     valid_paths
 }
 
+/// Find the location of `libcudadevrt.a`.
+pub fn find_lib_cudadevrt() -> Option<PathBuf> {
+    let root = find_cuda_root()?;
+    let lib = root.join("lib64").join("libcudadevrt.a");
+    if lib.is_file() {
+        return Some(lib);
+    }
+    None
+}
+
 #[cfg(target_os = "windows")]
 pub fn find_optix_root() -> Option<PathBuf> {
     // the optix SDK installer sets OPTIX_ROOT_DIR whenever it installs.