Skip to content

Commit

Permalink
Adding Cuda device and skeleton cuda kernel impls (#322)
Browse files Browse the repository at this point in the history
* WIP commit for cuda device

* Updating to latest cudarc, using take_async

* Rework Cuda allocation to use cpu

* Use TestDevice in tensor

* Using SampleTensor

* Clean up cuda device

* Adding cuda kernel to all ops

* Adding cuda kernels to optims and fixing nn TestDeviceUsage

* Remove cuda from default features

* Update cudarc version

* Adding Unpin to Unit

* Adding std feature for cudarc dependency

* Updating cudarc to 0.5.0
  • Loading branch information
coreylowman authored Jan 6, 2023
1 parent bb36005 commit cb2e687
Show file tree
Hide file tree
Showing 86 changed files with 1,565 additions and 65 deletions.
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,17 @@ matrixmultiply = { version = "0.3.2", default-features = false }
zip = { version = "0.6.2", default-features = false, optional = true }
cblas-sys = { version = "0.1.4", default-features = false, optional = true }
libc = { version = "0.2", default-features = false, optional = true }
cudarc = { version = "0.5.0", default-features = false, optional = true }

[features]
default = ["std", "numpy"]
std = ["no-std-compat/std", "rand/std", "rand_distr/std"]
std = ["no-std-compat/std", "rand/std", "rand_distr/std", "cudarc?/std"]
nightly = []
numpy = ["dep:zip", "std"]
cblas = ["dep:cblas-sys", "dep:libc"]
intel-mkl = ["cblas"]
cuda = ["dep:cudarc"]
test-cuda = ["cuda"]

[dev-dependencies]
rand = "0.8.5"
Expand Down
4 changes: 4 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,12 @@ pub fn keep_denormals() {
pub(crate) mod tests {
const TOLERANCE: f32 = 1e-6;

#[cfg(not(feature = "test-cuda"))]
pub type TestDevice = crate::tensor::Cpu;

#[cfg(feature = "test-cuda")]
pub type TestDevice = crate::tensor::Cuda;

pub trait AssertClose {
fn get_far_pair(&self, rhs: &Self, tolerance: f32) -> Option<(f32, f32)>;
fn assert_close(&self, rhs: &Self, tolerance: f32)
Expand Down
42 changes: 22 additions & 20 deletions src/nn/add_into.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ mod tests {
#[test]
fn test_add_into_3() {
let dev: TestDevice = Default::default();
let m: AddInto<(Linear<2, 5>, Linear<3, 5>, Linear<4, 5>)> = dev.build_module();
let m: AddInto<(Linear<2, 5, _>, Linear<3, 5, _>, Linear<4, 5, _>)> = dev.build_module();
let _: Tensor<Rank1<5>, _, _, OwnedTape<_>> = m.forward((
dev.zeros::<Rank1<2>>().traced(),
dev.zeros::<Rank1<3>>().traced(),
Expand All @@ -127,8 +127,12 @@ mod tests {
#[test]
fn test_add_into_4() {
let dev: TestDevice = Default::default();
type Model = AddInto<(Linear<2, 5>, Linear<3, 5>, Linear<4, 5>, Linear<5, 5>)>;
let m: Model = dev.build_module();
let m: AddInto<(
Linear<2, 5, _>,
Linear<3, 5, _>,
Linear<4, 5, _>,
Linear<5, 5, _>,
)> = dev.build_module();
let _: Tensor<Rank1<5>, _, _, OwnedTape<_>> = m.forward((
dev.zeros::<Rank1<2>>().traced(),
dev.zeros::<Rank1<3>>().traced(),
Expand All @@ -146,14 +150,13 @@ mod tests {
#[test]
fn test_add_into_5() {
let dev: TestDevice = Default::default();
type Model = AddInto<(
Linear<2, 5>,
Linear<3, 5>,
Linear<4, 5>,
Linear<5, 5>,
Linear<6, 5>,
)>;
let m: Model = dev.build_module();
let m: AddInto<(
Linear<2, 5, _>,
Linear<3, 5, _>,
Linear<4, 5, _>,
Linear<5, 5, _>,
Linear<6, 5, _>,
)> = dev.build_module();
let _: Tensor<Rank1<5>, _, _, OwnedTape<_>> = m.forward((
dev.zeros::<Rank1<2>>().traced(),
dev.zeros::<Rank1<3>>().traced(),
Expand All @@ -173,15 +176,14 @@ mod tests {
#[test]
fn test_add_into_6() {
let dev: TestDevice = Default::default();
type Model = AddInto<(
Linear<2, 5>,
Linear<3, 5>,
Linear<4, 5>,
Linear<5, 5>,
Linear<6, 5>,
Linear<7, 5>,
)>;
let m: Model = dev.build_module();
let m: AddInto<(
Linear<2, 5, _>,
Linear<3, 5, _>,
Linear<4, 5, _>,
Linear<5, 5, _>,
Linear<6, 5, _>,
Linear<7, 5, _>,
)> = dev.build_module();
let _: Tensor<Rank1<5>, _, _, OwnedTape<_>> = m.forward((
dev.zeros::<Rank1<2>>().traced(),
dev.zeros::<Rank1<3>>().traced(),
Expand Down
2 changes: 1 addition & 1 deletion src/nn/batchnorm2d.rs
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ mod tests {
fn test_batchnorm2d_3d_forward_mut() {
let dev = TestDevice::seed_from_u64(0);

let x1: Tensor<Rank3<3, 2, 2>> = dev.sample(rand_distr::StandardNormal);
let x1: Tensor<Rank3<3, 2, 2>, f32, _> = dev.sample(rand_distr::StandardNormal);
let mut bn: BatchNorm2D<3, _> = dev.build_module();

let y1 = bn.forward_mut(x1.trace());
Expand Down
12 changes: 6 additions & 6 deletions src/nn/impl_module_for_tuples.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ mod tests {

#[test]
fn test_set_to_1() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
assert_eq!(
SetTo1::<0, 5>::default().forward(dev.zeros()).array(),
[1.0, 0.0, 0.0, 0.0, 0.0]
Expand Down Expand Up @@ -184,31 +184,31 @@ mod tests {

#[test]
fn test_2_tuple_forward() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
let model: (SetTo1<0, 2>, SetTo1<1, 2>) = Default::default();
let y = model.forward(dev.zeros());
assert_eq!(y.array(), [1.0, 1.0]);
}

#[test]
fn test_3_tuple_forward() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
let model: (SetTo1<0, 3>, SetTo1<1, 3>, SetTo1<2, 3>) = Default::default();
let y = model.forward(dev.zeros());
assert_eq!(y.array(), [1.0, 1.0, 1.0]);
}

#[test]
fn test_4_tuple_forward() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
let model: (SetTo1<0, 4>, SetTo1<1, 4>, SetTo1<2, 4>, SetTo1<3, 4>) = Default::default();
let y = model.forward(dev.zeros());
assert_eq!(y.array(), [1.0, 1.0, 1.0, 1.0]);
}

#[test]
fn test_5_tuple_forward() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
let model: (
SetTo1<0, 5>,
SetTo1<1, 5>,
Expand All @@ -222,7 +222,7 @@ mod tests {

#[test]
fn test_6_tuple_forward() {
let dev: TestDevice = Default::default();
let dev: Cpu = Default::default();
let model: (
SetTo1<0, 6>,
SetTo1<1, 6>,
Expand Down
4 changes: 2 additions & 2 deletions src/nn/npz_impls.rs
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,8 @@ mod tests {
let x = dev.sample_normal::<Rank3<3, 4, 5>>();
let file = NamedTempFile::new().expect("failed to create tempfile");

let mut saved: BatchNorm2D<3> = dev.build_module();
let mut loaded: BatchNorm2D<3> = dev.build_module();
let mut saved: BatchNorm2D<3, _> = dev.build_module();
let mut loaded: BatchNorm2D<3, _> = dev.build_module();

saved.running_mean.fill_with_distr(Standard);
saved.running_var.fill_with_distr(Standard);
Expand Down
46 changes: 23 additions & 23 deletions src/nn/split_into.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,7 @@ mod tests {
#[test]
fn test_split_into_2() {
let dev: TestDevice = Default::default();
type Model = SplitInto<(Linear<5, 1>, Linear<5, 2>)>;
let m: Model = dev.build_module();
let m: SplitInto<(Linear<5, 1, _>, Linear<5, 2, _>)> = dev.build_module();
let _: (Tensor<Rank1<1>, _, _>, Tensor<Rank1<2>, _, _, OwnedTape<_>>) =
m.forward(dev.zeros::<Rank1<5>>().traced());
let _: (
Expand All @@ -134,8 +133,7 @@ mod tests {
#[test]
fn test_split_into_3() {
let dev: TestDevice = Default::default();
type Model = SplitInto<(Linear<5, 1>, Linear<5, 2>, Linear<5, 3>)>;
let m: Model = dev.build_module();
let m: SplitInto<(Linear<5, 1, _>, Linear<5, 2, _>, Linear<5, 3, _>)> = dev.build_module();
let _: (
Tensor<Rank1<1>, _, _>,
Tensor<Rank1<2>, _, _>,
Expand All @@ -150,9 +148,13 @@ mod tests {

#[test]
fn test_split_into_4() {
type Model = SplitInto<(Linear<5, 1>, Linear<5, 2>, Linear<5, 3>, Linear<5, 4>)>;
let dev: TestDevice = Default::default();
let m: Model = dev.build_module();
let m: SplitInto<(
Linear<5, 1, _>,
Linear<5, 2, _>,
Linear<5, 3, _>,
Linear<5, 4, _>,
)> = dev.build_module();
let _: (
Tensor<Rank1<1>, _, _>,
Tensor<Rank1<2>, _, _>,
Expand All @@ -169,15 +171,14 @@ mod tests {

#[test]
fn test_split_into_5() {
type Model = SplitInto<(
Linear<5, 1>,
Linear<5, 2>,
Linear<5, 3>,
Linear<5, 4>,
Linear<5, 5>,
)>;
let dev: TestDevice = Default::default();
let m: Model = dev.build_module();
let m: SplitInto<(
Linear<5, 1, _>,
Linear<5, 2, _>,
Linear<5, 3, _>,
Linear<5, 4, _>,
Linear<5, 5, _>,
)> = dev.build_module();
let _: (
Tensor<Rank1<1>, _, _>,
Tensor<Rank1<2>, _, _>,
Expand All @@ -196,16 +197,15 @@ mod tests {

#[test]
fn test_split_into_6() {
type Model = SplitInto<(
Linear<5, 1>,
Linear<5, 2>,
Linear<5, 3>,
Linear<5, 4>,
Linear<5, 5>,
Linear<5, 6>,
)>;
let dev: TestDevice = Default::default();
let m: Model = dev.build_module();
let m: SplitInto<(
Linear<5, 1, _>,
Linear<5, 2, _>,
Linear<5, 3, _>,
Linear<5, 4, _>,
Linear<5, 5, _>,
Linear<5, 6, _>,
)> = dev.build_module();
let _: (
Tensor<Rank1<1>, _, _>,
Tensor<Rank1<2>, _, _>,
Expand Down
14 changes: 14 additions & 0 deletions src/optim/adam/cuda_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
use crate::{shapes::Shape, tensor::Cuda};

/// Skeleton CUDA implementation of the Adam optimizer update.
///
/// This is a placeholder: calling [`update`](super::AdamKernel::update) on the
/// `Cuda` device currently panics via `todo!()`. The real GPU kernel is to be
/// added later; see the CPU kernel for the reference semantics.
impl super::AdamKernel<f32> for Cuda {
    /// Applies one Adam step to `param` in place using `grad`.
    ///
    /// * `t` - step counter; presumably the 1-indexed timestep used for bias
    ///   correction — TODO confirm against the `cpu_kernel` implementation.
    /// * `cfg` - Adam hyperparameters (learning rate, betas, epsilon, ...).
    /// * `param` - parameter storage, updated in place.
    /// * `moment1` / `moment2` - first/second moment accumulators, updated in place.
    /// * `grad` - gradient for this parameter, consumed by value.
    fn update<S: Shape>(
        t: i32,
        cfg: &super::AdamConfig<f32>,
        param: &mut Self::Storage<S, f32>,
        moment1: &mut Self::Storage<S, f32>,
        moment2: &mut Self::Storage<S, f32>,
        grad: Self::Storage<S, f32>,
    ) {
        // CUDA kernel not yet implemented; panics if the `cuda` feature path
        // is exercised at runtime.
        todo!()
    }
}
3 changes: 3 additions & 0 deletions src/optim/adam/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
mod cuda_kernel;

use std::marker::PhantomData;

use crate::{
Expand Down
14 changes: 14 additions & 0 deletions src/optim/rmsprop/cuda_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
use crate::tensor::Cuda;

/// Skeleton CUDA implementation of the RMSprop optimizer update.
///
/// Placeholder only: calling [`update`](super::RMSpropKernel::update) on the
/// `Cuda` device panics via `todo!()` until the GPU kernel is written. The
/// CPU kernel defines the reference semantics.
impl super::RMSpropKernel<f32> for Cuda {
    /// Applies one RMSprop step to `param` in place using `grad`.
    ///
    /// * `cfg` - RMSprop hyperparameters (learning rate, alpha, momentum, ...).
    /// * `param` - parameter storage, updated in place.
    /// * `momentum` - momentum buffer, updated in place.
    /// * `square_avg` - running average of squared gradients, updated in place.
    /// * `grad_avg` - running gradient average (centered variant), updated in place.
    /// * `grad` - gradient for this parameter, consumed by value.
    fn update<S: crate::shapes::Shape>(
        cfg: &super::RMSpropConfig<f32>,
        param: &mut Self::Storage<S, f32>,
        momentum: &mut Self::Storage<S, f32>,
        square_avg: &mut Self::Storage<S, f32>,
        grad_avg: &mut Self::Storage<S, f32>,
        grad: Self::Storage<S, f32>,
    ) {
        // CUDA kernel not yet implemented; panics if invoked.
        todo!()
    }
}
3 changes: 3 additions & 0 deletions src/optim/rmsprop/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
mod cuda_kernel;

use std::marker::PhantomData;

use crate::{
Expand Down
12 changes: 12 additions & 0 deletions src/optim/sgd/cuda_kernel.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
use crate::{shapes::*, tensor::Cuda};

/// Skeleton CUDA implementation of the SGD optimizer update, generic over the
/// dtype `E` (unlike the Adam/RMSprop stubs, which are `f32`-only).
///
/// Placeholder only: calling [`update`](super::SgdKernel::update) on the
/// `Cuda` device panics via `todo!()` until the GPU kernel is written.
impl<E: Dtype> super::SgdKernel<E> for Cuda {
    /// Applies one SGD step to `param` in place using `grad`.
    ///
    /// * `cfg` - SGD hyperparameters (learning rate, momentum, weight decay, ...).
    /// * `param` - parameter storage, updated in place.
    /// * `velocity` - momentum/velocity buffer, updated in place.
    /// * `grad` - gradient for this parameter, consumed by value.
    fn update<S: Shape>(
        cfg: &super::SgdConfig<E>,
        param: &mut Self::Storage<S, E>,
        velocity: &mut Self::Storage<S, E>,
        grad: Self::Storage<S, E>,
    ) {
        // CUDA kernel not yet implemented; panics if invoked.
        todo!()
    }
}
3 changes: 3 additions & 0 deletions src/optim/sgd/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
mod cpu_kernel;

#[cfg(feature = "cuda")]
mod cuda_kernel;

use std::marker::PhantomData;

use crate::gradients::Gradients;
Expand Down
2 changes: 1 addition & 1 deletion src/shapes/shape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use super::{axes::*, ReduceShapeTo};

/// Represents a unit type, but no arithmetic.
pub trait Unit:
'static + Copy + Clone + Default + std::fmt::Debug + PartialOrd + Send + Sync
'static + Copy + Clone + Default + std::fmt::Debug + PartialOrd + Send + Sync + std::marker::Unpin
{
}
impl Unit for f32 {}
Expand Down
Loading

0 comments on commit cb2e687

Please sign in to comment.