Adds cudnn feature flag. Removes "test-cuda" feature flag. Using cuDNN for convolutions (#651)

* Using cuDNN for convolutions

* update to latest cudarc

* Rework feature flags

* Revert default feature flag change

* Removing test-cuda feature flag

* Reducing precision for batched conv2d
coreylowman authored Apr 5, 2023
1 parent 3ec1042 commit 33254c5
Showing 8 changed files with 215 additions and 12 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cargo-check-features.yml
@@ -9,9 +9,9 @@ jobs:
matrix:
config:
- toolchain: stable
command: cargo hack check --feature-powerset --no-dev-deps --depth 2 --skip default,nightly,cpu-mkl-matmul,cuda,test-cuda
command: cargo hack check --feature-powerset --no-dev-deps --depth 2 --skip default,nightly,cpu-mkl-matmul,cuda,cudnn
- toolchain: nightly
command: cargo hack check --each-feature --no-dev-deps --features nightly --skip default,cpu-mkl-matmul,cuda,test-cuda
command: cargo hack check --each-feature --no-dev-deps --features nightly --skip default,cpu-mkl-matmul,cuda,cudnn

steps:
- uses: actions/checkout@v2
7 changes: 6 additions & 1 deletion .github/workflows/cargo-check.yml
@@ -21,4 +21,9 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: check
args: --features test-cuda,ci-check
args: --features cuda,ci-check
- name: Check CUDNN
uses: actions-rs/cargo@v1
with:
command: check
args: --features cudnn,ci-check
5 changes: 3 additions & 2 deletions Cargo.toml
@@ -32,7 +32,7 @@ matrixmultiply = { version = "0.3.2", default-features = false, optional = true
zip = { version = "0.6.2", default-features = false, optional = true }
cblas-sys = { version = "0.1.4", default-features = false, optional = true }
libc = { version = "0.2", default-features = false, optional = true }
cudarc = { version = "0.9.5", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
cudarc = { version = "0.9.6", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
num-traits = { version = "0.2.15", default-features = false }
safetensors = { version = "0.3", default-features = false, optional = true }
memmap2 = { version = "0.5", default-features = false, optional = true }
@@ -56,12 +56,13 @@ no-std = ["no-std-compat", "dep:spin", "cudarc?/no-std"]
cpu-seq-matmul = ["dep:matrixmultiply"]
cpu-par-matmul = ["std", "dep:matrixmultiply", "matrixmultiply?/threading"]
cpu-mkl-matmul = ["dep:cblas-sys", "dep:libc"]

cuda = ["dep:cudarc", "dep:glob"]
cudnn = ["cuda", "cudarc?/cudnn"]

numpy = ["dep:zip", "std"]
safetensors = ["dep:safetensors", "std", "dep:memmap2"]

test-cuda = ["cuda"]
test-f64 = []
test-integrations = []
ci-check = ["cudarc?/ci-check"]
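
Since `cudnn = ["cuda", "cudarc?/cudnn"]`, turning on `cudnn` always turns on `cuda` as well, so kernel code can select exactly one backend with `cfg` attributes. A minimal sketch of that gating pattern (the module names here are illustrative; the real selection for convolutions is in `src/tensor_ops/conv2d/mod.rs` below):

    // `cudnn` implies `cuda`, so the cuDNN path wins whenever both are enabled.
    #[cfg(all(feature = "cuda", not(feature = "cudnn")))]
    mod custom_cuda_backend; // hand-written CUDA kernels

    #[cfg(feature = "cudnn")]
    mod cudnn_backend; // cuDNN-backed kernels added in this commit
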
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -242,10 +242,10 @@ pub fn keep_denormals() {
#[cfg(test)]
pub(crate) mod tests {

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
pub type TestDevice = crate::tensor::Cpu;

#[cfg(feature = "test-cuda")]
#[cfg(feature = "cuda")]
pub type TestDevice = crate::tensor::Cuda;

#[cfg(not(feature = "test-f64"))]
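
With `test-cuda` removed, the plain `cuda` feature now decides what `TestDevice` resolves to, so the whole test suite targets the GPU whenever CUDA is enabled. A minimal sketch of a device-agnostic test written against these aliases (the ops used are illustrative, not part of this diff):

    #[cfg(test)]
    mod example_tests {
        use crate::{shapes::*, tensor::*, tensor_ops::*, tests::*};

        #[test]
        fn runs_on_cpu_or_cuda() {
            // Cpu by default; Cuda when built with `--features cuda` (or `cudnn`).
            let dev = TestDevice::default();
            let t: Tensor<Rank2<2, 3>, TestDtype, _> = dev.sample_normal();
            let grads = t.leaky_trace().square().mean().backward();
            // The gradient of mean(t^2) has t's shape on either device.
            assert_eq!(grads.get(&t).shape(), t.shape());
        }
    }
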
15 changes: 15 additions & 0 deletions src/tensor/cuda/device.rs
@@ -20,6 +20,8 @@ pub struct Cuda {
pub(crate) cpu: Cpu,
pub(crate) dev: Arc<CudaDevice>,
pub(crate) blas: Arc<CudaBlas>,
#[cfg(feature = "cudnn")]
pub(crate) cudnn: Arc<cudarc::cudnn::Cudnn>,
/// A second stream for kernels to optionally execute on.
pub(crate) par_stream: Arc<CudaStream>,
pub(crate) workspace: Arc<Mutex<CudaSlice<u8>>>,
@@ -28,6 +30,8 @@
#[derive(Debug)]
pub enum CudaError {
Blas(CublasError),
#[cfg(feature = "cudnn")]
Cudnn(cudarc::cudnn::CudnnError),
Driver(DriverError),
Cpu(CpuError),
}
@@ -50,6 +54,13 @@ impl From<DriverError> for CudaError {
}
}

#[cfg(feature = "cudnn")]
impl From<cudarc::cudnn::CudnnError> for CudaError {
fn from(value: cudarc::cudnn::CudnnError) -> Self {
Self::Cudnn(value)
}
}

impl Default for Cuda {
fn default() -> Self {
Self::seed_from_u64(0)
@@ -72,12 +83,16 @@ impl Cuda {
let cpu = Cpu::seed_from_u64(seed);
let dev = CudaDevice::new(ordinal)?;
let blas = Arc::new(CudaBlas::new(dev.clone())?);
#[cfg(feature = "cudnn")]
let cudnn = cudarc::cudnn::Cudnn::new(dev.clone())?;
let par_stream = Arc::new(dev.fork_default_stream()?);
let workspace = Arc::new(Mutex::new(dev.alloc_zeros::<u8>(0)?));
Ok(Self {
cpu,
dev,
blas,
#[cfg(feature = "cudnn")]
cudnn,
par_stream,
workspace,
})
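
The `From<cudarc::cudnn::CudnnError>` impl above is what lets the constructor use `?` directly on `Cudnn::new`. The same conversion in isolation, as a small sketch (the helper name is hypothetical):

    #[cfg(feature = "cudnn")]
    fn init_cudnn(
        dev: &std::sync::Arc<cudarc::driver::CudaDevice>,
    ) -> Result<std::sync::Arc<cudarc::cudnn::Cudnn>, CudaError> {
        // Cudnn::new returns Result<Arc<Cudnn>, CudnnError>; `?` converts the
        // error through the new From<CudnnError> for CudaError impl.
        Ok(cudarc::cudnn::Cudnn::new(dev.clone())?)
    }
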
177 changes: 177 additions & 0 deletions src/tensor_ops/conv2d/cudnn_kernel.rs
@@ -0,0 +1,177 @@
use cudarc::cudnn::{self, Conv2dBackwardData, Conv2dBackwardFilter, Conv2dForward, CudnnDataType};
use cudarc::driver::DeviceSlice;

use crate::{
shapes::*,
tensor::{unique_id, Cuda, GhostTensor, Tensor},
};

use std::sync::Arc;

trait HasCudnnKernel<E> {}
impl HasCudnnKernel<f32> for Cuda {}
impl HasCudnnKernel<f64> for Cuda {}

fn make_4d<S: Shape>(strides: S::Concrete, pad: usize) -> [usize; 4] {
match S::NUM_DIMS {
3 => [pad, strides[0], strides[1], strides[2]],
4 => [strides[0], strides[1], strides[2], strides[3]],
_ => unreachable!("Only implemented for 3d & 4d arrays"),
}
}

impl<E: Dtype + CudnnDataType> super::Conv2DKernel<E> for Cuda
where
Self: HasCudnnKernel<E>,
{
fn alloc<S: Shape>(&self, shape: S) -> Result<Tensor<S, E, Self>, Self::Err> {
let data = Arc::new(unsafe { self.dev.alloc::<E>(shape.num_elements()) }?);
Ok(Tensor {
id: unique_id(),
data,
shape,
strides: shape.strides(),
device: self.clone(),
tape: Default::default(),
})
}
fn forward<L: Shape, R: Shape, O: Shape>(
&self,
op: super::Conv2DOp,
lhs: &Tensor<L, E, Self>,
rhs: &Tensor<R, E, Self>,
out: &mut Tensor<O, E, Self>,
) -> Result<(), Self::Err> {
let conv = self.cudnn.create_conv2d::<E>(
[op.padding as i32, op.padding as i32],
[op.stride as i32, op.stride as i32],
[1, 1],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let img = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<L>(lhs.shape.concrete(), 1).map(|x| x as i32),
make_4d::<L>(lhs.strides, 0).map(|x| x as i32),
)?;
let filter = self.cudnn.create_4d_filter::<E>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
make_4d::<R>(rhs.shape.concrete(), 1).map(|x| x as i32),
)?;
let y = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<O>(out.shape.concrete(), 1).map(|x| x as i32),
make_4d::<O>(out.strides, 0).map(|x| x as i32),
)?;
let op = Conv2dForward {
conv: &conv,
x: &img,
w: &filter,
y: &y,
};

let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
lhs.data.as_ref(),
rhs.data.as_ref(),
Arc::get_mut(&mut out.data).unwrap(),
)?;
}

Ok(())
}

fn backward<L: Shape, R: Shape, O: Shape>(
&self,
op: super::Conv2DOp,
lhs: &Tensor<L, E, Self>,
grad_lhs: &mut Self::Vec<E>,
rhs: &Tensor<R, E, Self>,
grad_rhs: &mut Self::Vec<E>,
out: &GhostTensor<O, E, Self>,
grad_out: &Self::Vec<E>,
) -> Result<(), Self::Err> {
let conv = self.cudnn.create_conv2d::<E>(
[op.padding as i32, op.padding as i32],
[op.stride as i32, op.stride as i32],
[1, 1],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let img = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<L>(lhs.shape.concrete(), 1).map(|x| x as i32),
make_4d::<L>(lhs.strides, 0).map(|x| x as i32),
)?;
let filter = self.cudnn.create_4d_filter::<E>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
make_4d::<R>(rhs.shape.concrete(), 1).map(|x| x as i32),
)?;
let out = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<O>(out.shape.concrete(), 1).map(|x| x as i32),
make_4d::<O>(out.strides, 0).map(|x| x as i32),
)?;

{
let op = Conv2dBackwardData {
conv: &conv,
dx: &img,
w: &filter,
dy: &out,
};
let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
grad_lhs,
rhs.data.as_ref(),
grad_out,
)
}?;
}

{
let op = Conv2dBackwardFilter {
conv: &conv,
x: &img,
dw: &filter,
dy: &out,
};

let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
lhs.data.as_ref(),
grad_rhs,
grad_out,
)
}?;
}
Ok(())
}
}
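
At the call site nothing changes: with the `cudnn` feature enabled, conv2d routes through this kernel instead of the hand-written CUDA one. A minimal end-to-end sketch, assuming a working CUDA/cuDNN setup (the shapes are illustrative):

    use dfdx::{shapes::*, tensor::*, tensor_ops::*};

    fn main() {
        let dev = Cuda::default();
        // An unbatched CHW image works too: make_4d above hands it to cuDNN as
        // NCHW with a leading batch dimension of 1.
        let x: Tensor<Rank3<3, 28, 28>, f32, _> = dev.sample_normal();
        let w: Tensor<Rank4<5, 3, 3, 3>, f32, _> = dev.sample_normal();
        // Stride 1, padding 0 -> 5x26x26 output; forward uses Conv2dForward.
        let y = x.leaky_trace().conv2d::<1, 0>(w.clone());
        // Backward exercises Conv2dBackwardData and Conv2dBackwardFilter.
        let _grads = y.square().mean().backward();
    }
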
9 changes: 7 additions & 2 deletions src/tensor_ops/conv2d/mod.rs
@@ -1,8 +1,11 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
#[cfg(all(not(feature = "cudnn"), feature = "cuda"))]
mod cuda_kernel;

#[cfg(feature = "cudnn")]
mod cudnn_kernel;

use crate::{shapes::*, tensor::*};

#[repr(C)]
@@ -226,6 +229,7 @@ impl<
mod tests {
use super::*;
use crate::{tensor_ops::*, tests::*};
use num_traits::FromPrimitive;

#[test]
/// Produced by
@@ -434,6 +438,7 @@ mod tests {
let x = x
.broadcast::<Rank4<10, 3, 28, 28>, _>()
.reshape::<Rank4<10, 3, 28, 28>>();
assert_eq!(x.strides, x.shape.strides());

let y: Tensor<Rank4<10, 5, 9, 9>, _, _, _> = x.leaky_trace().conv2d::<3, 2>(w.clone());
for i in 0..10 {
@@ -442,7 +447,7 @@

let grads = y.square().mean().backward();

assert_close(&w0, &(grads.get(&w)).array());
w0.assert_close(&(grads.get(&w)).array(), TestDtype::from_f32(1e-3).unwrap());

let x_grad = grads.get(&x) * 10.0;
for i in 0..10 {
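
The looser tolerance here reflects that cuDNN may pick different convolution algorithms (and thus accumulation orders) for the batched case, so results are only expected to match within about 1e-3. A sketch of the same comparison pattern for other tests that hit this path (`expected` is a placeholder for the reference values):

    // Compare against reference values with an explicit absolute tolerance
    // instead of the default, tighter one.
    expected.assert_close(&grads.get(&w).array(), TestDtype::from_f32(1e-3).unwrap());
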
6 changes: 3 additions & 3 deletions src/tensor_ops/select_and_gather/mod.rs
@@ -214,7 +214,7 @@ mod tests {
let _ = t.leaky_trace().select(dev.zeros_like(&(7, 4)));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[7]"]
fn test_select_index_out_of_bounds() {
@@ -241,7 +241,7 @@
let _ = t.leaky_trace().gather(dev.zeros_like(&(5, 4, 2)));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[7]"]
fn test_gather_index_out_of_bounds() {
@@ -250,7 +250,7 @@
let _ = t.leaky_trace().gather(dev.tensor([7, 6, 1, 2]));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[5, 0]"]
fn test_gather_batch_out_of_bounds() {
