Adds cudnn feature flag. Removes "test-cuda" feature flag. Using cuDNN for convolutions #651

Merged · 7 commits · Apr 5, 2023
4 changes: 2 additions & 2 deletions .github/workflows/cargo-check-features.yml
@@ -9,9 +9,9 @@ jobs:
matrix:
config:
- toolchain: stable
-command: cargo hack check --feature-powerset --no-dev-deps --depth 2 --skip default,nightly,cpu-mkl-matmul,cuda,test-cuda
+command: cargo hack check --feature-powerset --no-dev-deps --depth 2 --skip default,nightly,cpu-mkl-matmul,cuda,cudnn
- toolchain: nightly
-command: cargo hack check --each-feature --no-dev-deps --features nightly --skip default,cpu-mkl-matmul,cuda,test-cuda
+command: cargo hack check --each-feature --no-dev-deps --features nightly --skip default,cpu-mkl-matmul,cuda,cudnn

steps:
- uses: actions/checkout@v2
7 changes: 6 additions & 1 deletion .github/workflows/cargo-check.yml
@@ -21,4 +21,9 @@ jobs:
uses: actions-rs/cargo@v1
with:
command: check
-args: --features test-cuda,ci-check
+args: --features cuda,ci-check
+- name: Check CUDNN
+uses: actions-rs/cargo@v1
+with:
+command: check
+args: --features cudnn,ci-check
5 changes: 3 additions & 2 deletions Cargo.toml
@@ -32,7 +32,7 @@ matrixmultiply = { version = "0.3.2", default-features = false, optional = true
zip = { version = "0.6.2", default-features = false, optional = true }
cblas-sys = { version = "0.1.4", default-features = false, optional = true }
libc = { version = "0.2", default-features = false, optional = true }
cudarc = { version = "0.9.5", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
cudarc = { version = "0.9.6", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
num-traits = { version = "0.2.15", default-features = false }
safetensors = { version = "0.3", default-features = false, optional = true }
memmap2 = { version = "0.5", default-features = false, optional = true }
@@ -56,12 +56,13 @@ no-std = ["no-std-compat", "dep:spin", "cudarc?/no-std"]
cpu-seq-matmul = ["dep:matrixmultiply"]
cpu-par-matmul = ["std", "dep:matrixmultiply", "matrixmultiply?/threading"]
cpu-mkl-matmul = ["dep:cblas-sys", "dep:libc"]

cuda = ["dep:cudarc", "dep:glob"]
cudnn = ["cuda", "cudarc?/cudnn"]

numpy = ["dep:zip", "std"]
safetensors = ["dep:safetensors", "std", "dep:memmap2"]

test-cuda = ["cuda"]
test-f64 = []
test-integrations = []
ci-check = ["cudarc?/ci-check"]
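A quick, hedged sketch of what the new feature wiring means in practice: `cudnn = ["cuda", "cudarc?/cudnn"]` makes enabling `cudnn` also enable `cuda`, and the `cudarc?/cudnn` syntax forwards cudarc's own `cudnn` feature only when the optional cudarc dependency is active. A hypothetical downstream program (not part of this PR) can therefore branch on the flags in order, assuming features declared as above:

fn main() {
    // `cudnn` is never on without `cuda`, so these checks can cascade
    // from most to least specific.
    if cfg!(feature = "cudnn") {
        println!("convolutions: cuDNN");
    } else if cfg!(feature = "cuda") {
        println!("convolutions: handwritten CUDA kernels");
    } else {
        println!("convolutions: CPU");
    }
}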
4 changes: 2 additions & 2 deletions src/lib.rs
@@ -242,10 +242,10 @@ pub fn keep_denormals() {
#[cfg(test)]
pub(crate) mod tests {

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
pub type TestDevice = crate::tensor::Cpu;

#[cfg(feature = "test-cuda")]
#[cfg(feature = "cuda")]
pub type TestDevice = crate::tensor::Cuda;

#[cfg(not(feature = "test-f64"))]
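Keying `TestDevice` off `cuda` directly means the whole test suite targets the GPU whenever the `cuda` feature is on, with no separate `test-cuda` flag. A minimal sketch of the pattern this alias enables (the test body is illustrative, not from this PR):

#[cfg(test)]
mod example {
    use crate::tensor::*;
    use crate::tests::TestDevice;

    #[test]
    fn device_generic_test() {
        // Resolves to Cpu by default, or Cuda when built with `--features cuda`.
        let dev = TestDevice::default();
        let t = dev.tensor([1.0f32, 2.0, 3.0]);
        assert_eq!(t.array(), [1.0, 2.0, 3.0]);
    }
}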
15 changes: 15 additions & 0 deletions src/tensor/cuda/device.rs
@@ -20,6 +20,8 @@ pub struct Cuda {
pub(crate) cpu: Cpu,
pub(crate) dev: Arc<CudaDevice>,
pub(crate) blas: Arc<CudaBlas>,
#[cfg(feature = "cudnn")]
pub(crate) cudnn: Arc<cudarc::cudnn::Cudnn>,
/// A second stream for kernels to optionally execute on.
pub(crate) par_stream: Arc<CudaStream>,
pub(crate) workspace: Arc<Mutex<CudaSlice<u8>>>,
@@ -28,6 +30,8 @@
#[derive(Debug)]
pub enum CudaError {
Blas(CublasError),
#[cfg(feature = "cudnn")]
Cudnn(cudarc::cudnn::CudnnError),
Driver(DriverError),
Cpu(CpuError),
}
@@ -50,6 +54,13 @@ impl From<DriverError> for CudaError {
}
}

#[cfg(feature = "cudnn")]
impl From<cudarc::cudnn::CudnnError> for CudaError {
fn from(value: cudarc::cudnn::CudnnError) -> Self {
Self::Cudnn(value)
}
}

impl Default for Cuda {
fn default() -> Self {
Self::seed_from_u64(0)
@@ -72,12 +83,16 @@ impl Cuda {
let cpu = Cpu::seed_from_u64(seed);
let dev = CudaDevice::new(ordinal)?;
let blas = Arc::new(CudaBlas::new(dev.clone())?);
#[cfg(feature = "cudnn")]
let cudnn = cudarc::cudnn::Cudnn::new(dev.clone())?;
let par_stream = Arc::new(dev.fork_default_stream()?);
let workspace = Arc::new(Mutex::new(dev.alloc_zeros::<u8>(0)?));
Ok(Self {
cpu,
dev,
blas,
#[cfg(feature = "cudnn")]
cudnn,
par_stream,
workspace,
})
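The `From<cudarc::cudnn::CudnnError>` impl above is what lets the constructor apply `?` to `Cudnn::new(dev.clone())`. A self-contained sketch of that error-plumbing pattern, with stand-in types in place of the real cudarc ones:

// Stand-in for cudarc::cudnn::CudnnError.
#[derive(Debug)]
struct CudnnError(i32);

#[derive(Debug)]
enum CudaError {
    Cudnn(CudnnError),
}

impl From<CudnnError> for CudaError {
    fn from(value: CudnnError) -> Self {
        Self::Cudnn(value)
    }
}

// Pretend handle creation that fails with the library's own error type.
fn create_cudnn_handle() -> Result<(), CudnnError> {
    Ok(())
}

fn build_device() -> Result<(), CudaError> {
    // `?` converts CudnnError into CudaError via the From impl.
    create_cudnn_handle()?;
    Ok(())
}

fn main() {
    build_device().unwrap();
}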
177 changes: 177 additions & 0 deletions src/tensor_ops/conv2d/cudnn_kernel.rs
@@ -0,0 +1,177 @@
use cudarc::cudnn::{self, Conv2dBackwardData, Conv2dBackwardFilter, Conv2dForward, CudnnDataType};
use cudarc::driver::DeviceSlice;

use crate::{
shapes::*,
tensor::{unique_id, Cuda, GhostTensor, Tensor},
};

use std::sync::Arc;

trait HasCudnnKernel<E> {}
impl HasCudnnKernel<f32> for Cuda {}
impl HasCudnnKernel<f64> for Cuda {}

fn make_4d<S: Shape>(strides: S::Concrete, pad: usize) -> [usize; 4] {
match S::NUM_DIMS {
3 => [pad, strides[0], strides[1], strides[2]],
4 => [strides[0], strides[1], strides[2], strides[3]],
_ => unreachable!("Only implemented for 3d & 4d arrays"),
}
}

impl<E: Dtype + CudnnDataType> super::Conv2DKernel<E> for Cuda
where
Self: HasCudnnKernel<E>,
{
fn alloc<S: Shape>(&self, shape: S) -> Result<Tensor<S, E, Self>, Self::Err> {
let data = Arc::new(unsafe { self.dev.alloc::<E>(shape.num_elements()) }?);
Ok(Tensor {
id: unique_id(),
data,
shape,
strides: shape.strides(),
device: self.clone(),
tape: Default::default(),
})
}
fn forward<L: Shape, R: Shape, O: Shape>(
&self,
op: super::Conv2DOp,
lhs: &Tensor<L, E, Self>,
rhs: &Tensor<R, E, Self>,
out: &mut Tensor<O, E, Self>,
) -> Result<(), Self::Err> {
let conv = self.cudnn.create_conv2d::<E>(
[op.padding as i32, op.padding as i32],
[op.stride as i32, op.stride as i32],
[1, 1],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let img = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<L>(lhs.shape.concrete(), 1).map(|x| x as i32),
make_4d::<L>(lhs.strides, 0).map(|x| x as i32),
)?;
let filter = self.cudnn.create_4d_filter::<E>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
make_4d::<R>(rhs.shape.concrete(), 1).map(|x| x as i32),
)?;
let y = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<O>(out.shape.concrete(), 1).map(|x| x as i32),
make_4d::<O>(out.strides, 0).map(|x| x as i32),
)?;
let op = Conv2dForward {
conv: &conv,
x: &img,
w: &filter,
y: &y,
};

let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
lhs.data.as_ref(),
rhs.data.as_ref(),
Arc::get_mut(&mut out.data).unwrap(),
)?;
}

Ok(())
}

fn backward<L: Shape, R: Shape, O: Shape>(
&self,
op: super::Conv2DOp,
lhs: &Tensor<L, E, Self>,
grad_lhs: &mut Self::Vec<E>,
rhs: &Tensor<R, E, Self>,
grad_rhs: &mut Self::Vec<E>,
out: &GhostTensor<O, E, Self>,
grad_out: &Self::Vec<E>,
) -> Result<(), Self::Err> {
let conv = self.cudnn.create_conv2d::<E>(
[op.padding as i32, op.padding as i32],
[op.stride as i32, op.stride as i32],
[1, 1],
cudnn::sys::cudnnConvolutionMode_t::CUDNN_CROSS_CORRELATION,
)?;
let img = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<L>(lhs.shape.concrete(), 1).map(|x| x as i32),
make_4d::<L>(lhs.strides, 0).map(|x| x as i32),
)?;
let filter = self.cudnn.create_4d_filter::<E>(
cudnn::sys::cudnnTensorFormat_t::CUDNN_TENSOR_NCHW,
make_4d::<R>(rhs.shape.concrete(), 1).map(|x| x as i32),
)?;
let out = self.cudnn.create_4d_tensor_ex::<E>(
make_4d::<O>(out.shape.concrete(), 1).map(|x| x as i32),
make_4d::<O>(out.strides, 0).map(|x| x as i32),
)?;

{
let op = Conv2dBackwardData {
conv: &conv,
dx: &img,
w: &filter,
dy: &out,
};
let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
grad_lhs,
rhs.data.as_ref(),
grad_out,
)
}?;
}

{
let op = Conv2dBackwardFilter {
conv: &conv,
x: &img,
dw: &filter,
dy: &out,
};

let algo = op.pick_algorithm()?;
let workspace_size_in_bytes = op.get_workspace_size(algo)?;

unsafe {
let mut workspace = self.get_workspace::<u8>(workspace_size_in_bytes)?;
let mut workspace = workspace
.transmute_mut::<u8>(workspace_size_in_bytes)
.unwrap();
assert_eq!(workspace.len(), workspace_size_in_bytes);
op.launch(
algo,
Some(&mut workspace),
(E::ONE, Default::default()),
lhs.data.as_ref(),
grad_rhs,
grad_out,
)
}?;
}
Ok(())
}
}
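For reference, the `make_4d` helper at the top of this file normalizes unbatched 3d tensors into the 4d NCHW layout that cuDNN descriptors expect: shapes are padded with a leading 1, strides with a leading 0 (the batch size is 1, so the batch stride is never stepped). A standalone sketch using plain slices instead of the crate's `Shape` machinery:

fn make_4d(dims: &[usize], pad: usize) -> [usize; 4] {
    match dims.len() {
        3 => [pad, dims[0], dims[1], dims[2]],
        4 => [dims[0], dims[1], dims[2], dims[3]],
        _ => unreachable!("only implemented for 3d & 4d arrays"),
    }
}

fn main() {
    // A [C, H, W] = [3, 28, 28] image becomes [1, 3, 28, 28]...
    assert_eq!(make_4d(&[3, 28, 28], 1), [1, 3, 28, 28]);
    // ...and its contiguous strides [784, 28, 1] gain a 0 in the batch slot.
    assert_eq!(make_4d(&[784, 28, 1], 0), [0, 784, 28, 1]);
}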
9 changes: 7 additions & 2 deletions src/tensor_ops/conv2d/mod.rs
@@ -1,8 +1,11 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
#[cfg(all(not(feature = "cudnn"), feature = "cuda"))]
mod cuda_kernel;

#[cfg(feature = "cudnn")]
mod cudnn_kernel;

use crate::{shapes::*, tensor::*};

#[repr(C)]
@@ -226,6 +229,7 @@ impl<
mod tests {
use super::*;
use crate::{tensor_ops::*, tests::*};
+use num_traits::FromPrimitive;

#[test]
/// Produced by
@@ -434,6 +438,7 @@ mod tests {
let x = x
.broadcast::<Rank4<10, 3, 28, 28>, _>()
.reshape::<Rank4<10, 3, 28, 28>>();
+assert_eq!(x.strides, x.shape.strides());

let y: Tensor<Rank4<10, 5, 9, 9>, _, _, _> = x.leaky_trace().conv2d::<3, 2>(w.clone());
for i in 0..10 {
@@ -442,7 +447,7 @@

let grads = y.square().mean().backward();

assert_close(&w0, &(grads.get(&w)).array());
w0.assert_close(&(grads.get(&w)).array(), TestDtype::from_f32(1e-3).unwrap());

let x_grad = grads.get(&x) * 10.0;
for i in 0..10 {
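The `all(not(feature = "cudnn"), feature = "cuda")` guard above exists because `cudnn` implies `cuda`: gating `cuda_kernel` on `cuda` alone would compile both kernel modules in a cudnn build and yield two conflicting `Conv2DKernel` impls for `Cuda`. A minimal sketch of the exclusivity, assuming the feature graph from the Cargo.toml changes earlier:

// Exactly one of these items compiles under any valid feature combination.
#[cfg(all(not(feature = "cudnn"), feature = "cuda"))]
const CONV2D_BACKEND: &str = "cuda_kernel";

#[cfg(feature = "cudnn")]
const CONV2D_BACKEND: &str = "cudnn_kernel";

#[cfg(not(feature = "cuda"))]
const CONV2D_BACKEND: &str = "cpu_kernel";

fn main() {
    println!("conv2d backend: {}", CONV2D_BACKEND);
}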
6 changes: 3 additions & 3 deletions src/tensor_ops/select_and_gather/mod.rs
@@ -214,7 +214,7 @@ mod tests {
let _ = t.leaky_trace().select(dev.zeros_like(&(7, 4)));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[7]"]
fn test_select_index_out_of_bounds() {
@@ -241,7 +241,7 @@
let _ = t.leaky_trace().gather(dev.zeros_like(&(5, 4, 2)));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[7]"]
fn test_gather_index_out_of_bounds() {
@@ -250,7 +250,7 @@
let _ = t.leaky_trace().gather(dev.tensor([7, 6, 1, 2]));
}

#[cfg(not(feature = "test-cuda"))]
#[cfg(not(feature = "cuda"))]
#[test]
#[should_panic = "Index out of bounds: index=[5, 0]"]
fn test_gather_batch_out_of_bounds() {