diff --git a/crates/cuda_std/src/lib.rs b/crates/cuda_std/src/lib.rs
index 33e7b28..472d393 100644
--- a/crates/cuda_std/src/lib.rs
+++ b/crates/cuda_std/src/lib.rs
@@ -36,17 +36,16 @@
 
 extern crate alloc;
 
+pub mod atomic;
+pub mod cfg;
 pub mod float;
 #[allow(warnings)]
 pub mod intrinsics;
 pub mod io;
 pub mod mem;
 pub mod misc;
-// WIP
-// pub mod rt;
-pub mod atomic;
-pub mod cfg;
 pub mod ptr;
+pub mod rt;
 pub mod shared;
 pub mod thread;
 pub mod warp;
diff --git a/crates/cuda_std/src/rt/error.rs b/crates/cuda_std/src/rt/error.rs
index 18992c2..024a9de 100644
--- a/crates/cuda_std/src/rt/error.rs
+++ b/crates/cuda_std/src/rt/error.rs
@@ -132,7 +132,7 @@ pub enum CudaError {
 /// Result type for most CUDA functions.
 pub type CudaResult<T> = Result<T, CudaError>;
 
-pub(crate) trait ToResult {
+pub trait ToResult {
     fn to_result(self) -> CudaResult<()>;
 }
 impl ToResult for cudaError_t {
diff --git a/crates/cuda_std/src/rt/mod.rs b/crates/cuda_std/src/rt/mod.rs
index 36ce621..edd54bb 100644
--- a/crates/cuda_std/src/rt/mod.rs
+++ b/crates/cuda_std/src/rt/mod.rs
@@ -31,7 +31,7 @@ bitflags::bitflags! {
 
 #[derive(Debug)]
 pub struct Stream {
-    raw: cuda::cudaStream_t,
+    pub raw: cuda::cudaStream_t,
 }
 
 impl Stream {
@@ -48,8 +48,8 @@ impl Stream {
     }
 
     #[doc(hidden)]
-    pub fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
-        unsafe { cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result() }
+    pub unsafe fn launch(&self, param_buf: *mut c_void) -> CudaResult<()> {
+        cuda::cudaLaunchDeviceV2(param_buf, self.raw).to_result()
     }
 }
 
@@ -64,38 +64,49 @@ impl Drop for Stream {
 #[macro_export]
 macro_rules! launch {
     ($func:ident<<<$grid_dim:expr, $block_dim:expr, $smem_size:expr, $stream:ident>>>($($param:expr),* $(,)?)) => {{
-        let grid_dim = ::$crate::rt::GridDim::from($grid_dim);
-        let block_dim = ::$crate::rt::BlockDim::from($block_dim);
-        let mut buf = ::$crate::rt::sys::cudaGetParameterBufferV2(
-            &$func as *const _ as *const ::core::ffi::c_void,
-            ::$crate::rt::sys::dim3 {
+        use $crate::rt::ToResult;
+        use $crate::float::GpuFloat;
+        let grid_dim = $crate::rt::GridSize::from($grid_dim);
+        let block_dim = $crate::rt::BlockSize::from($block_dim);
+
+        // Get a device buffer for kernel launch.
+        let fptr = $func as *const ();
+        let mut buf = $crate::rt::sys::cudaGetParameterBufferV2(
+            fptr as *const ::core::ffi::c_void,
+            $crate::rt::sys::dim3 {
                 x: grid_dim.x,
                 y: grid_dim.y,
                 z: grid_dim.z
             },
-            ::$crate::rt::sys::dim3 {
+            $crate::rt::sys::dim3 {
                 x: block_dim.x,
                 y: block_dim.y,
                 z: block_dim.z
             },
-            $smem_size
-        ) as *mut u8;
-        unsafe {
-            let mut offset = 0;
-            $(
-                let param = $param;
-                let size = ::core::mem::size_of_val(&param)
-                let mut buf_idx = (offset as f32 / size as f32).ceil() as usize + 1;
-                offset = buf_idx * size;
-                let ptr = &param as *const _ as *const u8;
-                let dst = buf.add(offset);
-                ::core::ptr::copy_nonoverlapping(&param as *const _ as *const u8, dst, size);
-            )*
+            $smem_size,
+        );
+
+        // Ensure buffer is not a nil ptr.
+        if buf.is_null() {
+            return;
         }
+
+        // Load data into buffer.
+        let mut offset = 0;
+        $(
+            let param = $param;
+            let size = ::core::mem::size_of_val(&param);
+            let param_ptr = &param as *const _ as *const ::core::ffi::c_void;
+            let dst = buf.add(offset).copy_from(param_ptr, size);
+            offset += size;
+        )*
         if false {
+            // Ensure function call compatibility at compile time.
             $func($($param),*);
         }
-        $stream.launch(buf as *mut ::core::ffi::c_void).to_result()
+
+        // Launch the kernel.
+        $stream.launch(buf)
     }};
 }
 
diff --git a/crates/cuda_std/src/rt/sys.rs b/crates/cuda_std/src/rt/sys.rs
index 5a0bfc4..5fad1b9 100644
--- a/crates/cuda_std/src/rt/sys.rs
+++ b/crates/cuda_std/src/rt/sys.rs
@@ -14,6 +14,16 @@ pub use crate::rt::driver_types_sys::*;
 // to share this stuff with cust.
 
 extern "C" {
+    pub fn cudaGetParameterBuffer(alignment: usize, size: usize) -> *mut c_void;
+    pub fn cudaLaunchDevice(
+        func: *const c_void,
+        parameterBuffer: *const c_void,
+        gridDimension: dim3,
+        blockDimension: dim3,
+        sharedMemSize: c_uint,
+        stream: cudaStream_t,
+    ) -> cudaError_t;
+
     pub fn cudaDeviceGetAttribute(
         value: *mut c_int,
         attr: cudaDeviceAttr,
diff --git a/crates/cust/Cargo.toml b/crates/cust/Cargo.toml
index 39eab9c..d63a746 100644
--- a/crates/cust/Cargo.toml
+++ b/crates/cust/Cargo.toml
@@ -22,6 +22,7 @@ mint = { version = "^0.5", optional = true }
 num-complex = { version = "0.4", optional = true }
 vek = { version = "0.15.1", optional = true, default-features = false }
 bytemuck = { version = "1.7.3", optional = true }
+find_cuda_helper = { path = "../find_cuda_helper", version = "0.2" }
 
 [features]
 default= ["bytemuck"]
diff --git a/crates/cust/src/link.rs b/crates/cust/src/link.rs
index fc231d9..53a1c75 100644
--- a/crates/cust/src/link.rs
+++ b/crates/cust/src/link.rs
@@ -3,9 +3,9 @@
 use std::mem::MaybeUninit;
 use std::ptr::null_mut;
 
+use crate::error::{CudaError, CudaResult, ToResult};
 use crate::sys as cuda;
-
-use crate::error::{CudaResult, ToResult};
+use find_cuda_helper::find_lib_cudadevrt;
 
 static UNNAMED: &str = "\0";
 
@@ -25,6 +25,12 @@ impl Linker {
         // Therefore we use box to alloc the memory for us, then into_raw it so we now have ownership
         // of the memory (and dont have any aliasing requirements attached either).
 
+        // // Just take advantage of C memory model and just pass individual elements, as there is only 1.
+        // let num_options: u32 = 1;
+        // let opt = &mut cuda::CUjit_option::CU_JIT_TARGET as *mut _;
+        // let mut opt_val =
+        //     &mut cuda::CUjit_target::CU_TARGET_COMPUTE_75 as *mut _ as *mut ::std::os::raw::c_void;
+
         unsafe {
             let mut raw = MaybeUninit::uninit();
             cuda::cuLinkCreate_v2(0, null_mut(), null_mut(), raw.as_mut_ptr()).to_result()?;
@@ -114,6 +120,28 @@ impl Linker {
         }
     }
 
+    /// Link device runtime lib.
+    pub fn add_libcudadevrt(&mut self) -> CudaResult<()> {
+        let path = find_lib_cudadevrt().ok_or_else(|| CudaError::FileNotFound)?;
+        let mut bytes = std::fs::read(path)
+            // TODO: don't panic, update the result type instead.
+            .expect("error linking libcudadevrt.a");
+
+        unsafe {
+            cuda::cuLinkAddData_v2(
+                self.raw,
+                cuda::CUjitInputType::CU_JIT_INPUT_LIBRARY,
+                bytes.as_mut_ptr() as *mut _,
+                bytes.len(),
+                UNNAMED.as_ptr().cast(),
+                0,
+                std::ptr::null_mut(),
+                std::ptr::null_mut(),
+            )
+            .to_result()
+        }
+    }
+
     /// Runs the linker to generate the final cubin bytes. Also returns a duration
     /// for how long it took to run the linker.
     pub fn complete(self) -> CudaResult<Vec<u8>> {
diff --git a/crates/cust/src/module.rs b/crates/cust/src/module.rs
index 9e71f57..0813aa5 100644
--- a/crates/cust/src/module.rs
+++ b/crates/cust/src/module.rs
@@ -338,7 +338,7 @@ impl Module {
     /// ```
     #[deprecated(
         since = "0.3.0",
-        note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing 
+        note = "load_from_string was an inconsistent name with inconsistent params, use from_ptx/from_ptx_cstr, passing
     an empty slice of options (usually)
     "
     )]
diff --git a/crates/find_cuda_helper/src/lib.rs b/crates/find_cuda_helper/src/lib.rs
index b3943be..53fd852 100644
--- a/crates/find_cuda_helper/src/lib.rs
+++ b/crates/find_cuda_helper/src/lib.rs
@@ -150,6 +150,16 @@ pub fn find_cuda_lib_dirs() -> Vec<PathBuf> {
     valid_paths
 }
 
+/// Find the location of `libcudadevrt.a`.
+pub fn find_lib_cudadevrt() -> Option<PathBuf> {
+    let root = find_cuda_root()?;
+    let lib = root.join("lib64").join("libcudadevrt.a");
+    if lib.is_file() {
+        return Some(lib);
+    }
+    None
+}
+
 #[cfg(target_os = "windows")]
 pub fn find_optix_root() -> Option<PathBuf> {
     // the optix SDK installer sets OPTIX_ROOT_DIR whenever it installs.
diff --git a/crates/rustc_codegen_nvvm/build.rs b/crates/rustc_codegen_nvvm/build.rs
index e5ed44f..3ea30bc 100644
--- a/crates/rustc_codegen_nvvm/build.rs
+++ b/crates/rustc_codegen_nvvm/build.rs
@@ -21,7 +21,7 @@ fn main() {
     // this is set by cuda_builder, but in case somebody is using the codegen
     // manually, default to 520 (which is what nvvm defaults to).
     if option_env!("CUDA_ARCH").is_none() {
-        println!("cargo:rustc-env=CUDA_ARCH=520")
+        println!("cargo:rustc-env=CUDA_ARCH=750")
     }
 }