From 513470e08285b73277e73e8e4304e73b39110acd Mon Sep 17 00:00:00 2001 From: Corey Lowman Date: Thu, 26 Jan 2023 10:46:10 -0500 Subject: [PATCH] Removing Device generic from Gradients & optimizers (#402) * Removing Device generic from Gradients & optimizers * Formatting --- examples/04-gradients.rs | 2 +- examples/06-mnist.rs | 2 +- src/gradients.rs | 95 ++++++++++++---------------- src/lib.rs | 4 +- src/nn/add_into.rs | 2 +- src/nn/impl_module_for_tuples.rs | 2 +- src/nn/layer_norm.rs | 2 +- src/nn/linear.rs | 2 +- src/nn/mod.rs | 4 +- src/nn/repeated.rs | 2 +- src/nn/split_into.rs | 2 +- src/optim/adam/mod.rs | 20 +++--- src/optim/mod.rs | 2 +- src/optim/optimizer.rs | 44 +++++++------ src/optim/rmsprop/mod.rs | 22 +++---- src/optim/sgd/mod.rs | 18 +++--- src/tensor/storage_traits.rs | 12 ++-- src/tensor_ops/utilities/backward.rs | 12 ++-- 18 files changed, 121 insertions(+), 128 deletions(-) diff --git a/examples/04-gradients.rs b/examples/04-gradients.rs index 8a30830e0..ad3536b19 100644 --- a/examples/04-gradients.rs +++ b/examples/04-gradients.rs @@ -31,7 +31,7 @@ fn main() { // finally you can use .backward() to extract the gradients! // NOTE: that this method is only available on tensors that **own** // the tape! - let gradients: Gradients = e.backward(); + let gradients: Gradients = e.backward(); // now you can extract gradients for specific tensors // by querying with them diff --git a/examples/06-mnist.rs b/examples/06-mnist.rs index 81a94d28a..a924550ca 100644 --- a/examples/06-mnist.rs +++ b/examples/06-mnist.rs @@ -99,7 +99,7 @@ fn main() { // initialize model and optimizer let mut model: Mlp = dev.build_module(); - let mut opt: Adam = Default::default(); + let mut opt: Adam = Default::default(); // initialize dataset let dataset = MnistDataset::train(&mnist_path); diff --git a/src/gradients.rs b/src/gradients.rs index 63d33611d..36b8efc71 100644 --- a/src/gradients.rs +++ b/src/gradients.rs @@ -1,11 +1,9 @@ //! Implementations of [GradientTape] and generic Nd array containers via [Gradients]. #![allow(clippy::type_complexity)] -use core::marker::PhantomData; use std::collections::HashMap; use std::{boxed::Box, vec::Vec}; -use crate::shapes::{HasDtype, HasShape}; use crate::tensor::storage_traits::{AllocGrad, DeviceStorage}; use crate::unique_id::{HasUniqueId, UniqueId}; @@ -24,28 +22,24 @@ use crate::unique_id::{HasUniqueId, UniqueId}; /// important part of key's implementing [HasShape], and [HasDtype] is that the associated type /// of that trait is used to downcast the box to the expected value. #[derive(Debug, Default)] -pub struct Gradients { +pub struct Gradients { gradient_by_id: HashMap>, - device: PhantomData<*const D>, } -impl Gradients { +impl Gradients { /// Retrieves mutable gradient for `t`, allocating one if it isn't present. - pub(crate) fn get_or_alloc_mut( - &mut self, - t: &T, - ) -> Result<&mut D::Storage, D::Err> + pub(crate) fn get_or_alloc_mut(&mut self, t: &T) -> Result<&mut T::Gradient, T::Err> where - T: HasUniqueId + AllocGrad, + T: HasUniqueId + AllocGrad, { self.try_alloc_for(t)?; Ok(self.get_mut(t)) } /// Inserts a gradient for `t` - pub(crate) fn try_alloc_for(&mut self, t: &T) -> Result<(), D::Err> + pub(crate) fn try_alloc_for(&mut self, t: &T) -> Result<(), T::Err> where - T: HasUniqueId + AllocGrad, + T: HasUniqueId + AllocGrad, { if !self.gradient_by_id.contains_key(t.id()) { let grad = t.try_alloc_grad()?; @@ -57,10 +51,10 @@ impl Gradients { /// Removes and returns the data associated with `t.id()`. 
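[Editor's note] The hunk above is the heart of the change: `Gradients` can drop its `D: DeviceStorage` parameter because the stored values are kept type-erased (the `downcast()` calls visible below imply a `Box<dyn Any>`-style map keyed by each tensor's unique id) and are only downcast back to a concrete storage type at the access site. Below is a minimal standalone sketch of that mechanism, not dfdx's actual types: the `u64` id and `Vec<f32>` storage are simplified stand-ins.

```rust
// Sketch: device-agnostic gradient storage via type erasure.
use std::any::Any;
use std::collections::HashMap;

type UniqueId = u64;

#[derive(Default)]
struct Gradients {
    // Each gradient is boxed as `dyn Any`, so the map needs no device/dtype generics.
    gradient_by_id: HashMap<UniqueId, Box<dyn Any>>,
}

impl Gradients {
    // Store a gradient of any concrete ('static) storage type.
    fn insert<G: 'static>(&mut self, id: UniqueId, grad: G) {
        self.gradient_by_id.insert(id, Box::new(grad));
    }

    // Retrieve it again; the caller names the expected concrete type and the
    // `Box<dyn Any>` is downcast back to it.
    fn get<G: 'static>(&self, id: UniqueId) -> &G {
        self.gradient_by_id
            .get(&id)
            .unwrap()
            .downcast_ref::<G>()
            .unwrap()
    }
}

fn main() {
    let mut grads = Gradients::default();
    // A "cpu" gradient represented as a plain Vec<f32> for illustration.
    grads.insert(0, vec![0.0f32; 4]);
    let g: &Vec<f32> = grads.get(0);
    assert_eq!(g.len(), 4);
}
```

The trade-off is that a wrong downcast panics at runtime instead of failing to compile, which is why the patch keeps the "unrecoverable bug" wording on `get`/`get_mut`/`remove`.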
/// /// **Panics** if data associated with `t` is not found. This indicates an unrecoverable bug. - pub(crate) fn remove( - &mut self, - t: &T, - ) -> Option> { + pub(crate) fn remove(&mut self, t: &T) -> Option + where + T: HasUniqueId + AllocGrad, + { self.gradient_by_id .remove_entry(t.id()) .map(|e| *e.1.downcast().unwrap()) @@ -69,9 +63,9 @@ impl Gradients { /// Returns a mutable reference to the data associated with `t`. /// /// **Panics** if data associated with `t` is not found. This indicates an unrecoverable bug. - pub(crate) fn get_mut(&mut self, t: &T) -> &mut D::Storage + pub(crate) fn get_mut(&mut self, t: &T) -> &mut T::Gradient where - T: HasUniqueId + HasDtype + HasShape, + T: HasUniqueId + AllocGrad, { self.gradient_by_id .get_mut(t.id()) @@ -86,10 +80,10 @@ impl Gradients { /// /// If no data is associated with `t` yet, this will panic due to an unwrap() /// on a .get() to the underlying hashmap. - pub fn get( - &self, - t: &T, - ) -> &D::Storage { + pub fn get(&self, t: &T) -> &T::Gradient + where + T: HasUniqueId + AllocGrad, + { self.gradient_by_id .get(t.id()) .unwrap() @@ -102,17 +96,10 @@ impl Gradients { /// `l` is the gradient to update, and `r` is the gradient to backprop. /// /// **Panics** if `l` and `r` have the same id. - pub(crate) fn mut_and_ref( - &mut self, - l: &L, - r: &R, - ) -> ( - &mut D::Storage, - &D::Storage, - ) + pub(crate) fn mut_and_ref(&mut self, l: &L, r: &R) -> (&mut L::Gradient, &R::Gradient) where - L: HasUniqueId + HasShape + HasDtype, - R: HasUniqueId + HasShape + HasDtype, + L: HasUniqueId + AllocGrad, + R: HasUniqueId + AllocGrad, { assert_ne!(l.id(), r.id()); let l_ptr = self.get_mut(l) as *mut _; @@ -128,15 +115,11 @@ impl Gradients { l1: &L1, l2: &L2, r: &R, - ) -> ( - &mut D::Storage, - &mut D::Storage, - &D::Storage, - ) + ) -> (&mut L1::Gradient, &mut L2::Gradient, &R::Gradient) where - L1: HasUniqueId + HasShape + HasDtype, - L2: HasUniqueId + HasShape + HasDtype, - R: HasUniqueId + HasShape + HasDtype, + L1: HasUniqueId + AllocGrad, + L2: HasUniqueId + AllocGrad, + R: HasUniqueId + AllocGrad, { assert_ne!(l1.id(), l2.id()); assert_ne!(l1.id(), r.id()); @@ -183,8 +166,8 @@ impl Gradients { /// This would not be possible if these chain rule operations were inside of GradientTape! #[allow(clippy::type_complexity)] pub struct GradientTape { - operations: Vec) -> Result<(), D::Err>>>, - gradients: Gradients, + operations: Vec Result<(), D::Err>>>, + gradients: Gradients, } impl Default for GradientTape { @@ -212,7 +195,7 @@ impl GradientTape { /// * `operation` - A FnOnce that acts on [Gradients]. /// /// See src/tensor_ops for implementation examples. - pub(crate) fn add_backward_op) -> Result<(), D::Err>>( + pub(crate) fn add_backward_op Result<(), D::Err>>( &mut self, operation: F, ) { @@ -222,7 +205,7 @@ impl GradientTape { /// Compute the [Gradients]! This just runs all the operations on a new [Gradients] struct. /// /// Note that this method takes ownership of self, so it can't be called twice! - pub(crate) fn execute(mut self) -> Result, D::Err> { + pub(crate) fn execute(mut self) -> Result { for operation in self.operations.drain(..).rev() { (operation)(&mut self.gradients)?; } @@ -251,34 +234,40 @@ pub struct NoneTape; pub trait Tape: Default + Merge + Merge { /// Whether this object currently owns the [GradientTape]. This is known at compile time. 
const OWNS_TAPE: bool; - fn add_backward_op) -> Result<(), D::Err>>( + fn add_backward_op Result<(), D::Err>>( &mut self, operation: F, ); - fn try_alloc_grad>(&mut self, t: &T) -> Result<(), D::Err>; + fn try_alloc_grad>( + &mut self, + t: &T, + ) -> Result<(), D::Err>; } impl Tape for OwnedTape { const OWNS_TAPE: bool = true; - fn add_backward_op) -> Result<(), D::Err>>( + fn add_backward_op Result<(), D::Err>>( &mut self, operation: F, ) { self.0.add_backward_op(operation) } - fn try_alloc_grad>(&mut self, t: &T) -> Result<(), D::Err> { + fn try_alloc_grad>( + &mut self, + t: &T, + ) -> Result<(), D::Err> { self.0.gradients.try_alloc_for(t) } } impl Tape for NoneTape { const OWNS_TAPE: bool = false; - fn add_backward_op) -> Result<(), D::Err>>( - &mut self, - _: F, - ) { + fn add_backward_op Result<(), D::Err>>(&mut self, _: F) { } - fn try_alloc_grad>(&mut self, _: &T) -> Result<(), D::Err> { + fn try_alloc_grad>( + &mut self, + _: &T, + ) -> Result<(), D::Err> { Ok(()) } } diff --git a/src/lib.rs b/src/lib.rs index c4192fd1a..544243aa1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,7 +74,7 @@ //! let loss = cross_entropy_with_logits_loss(y, y_true); //! //! // call `backward()` to compute gradients. The tensor *must* have `OwnedTape`! -//! let gradients: Gradients = loss.backward(); +//! let gradients: Gradients = loss.backward(); //! ``` //! 7. Use an optimizer from [crate::optim] to optimize your network! //! ```rust @@ -84,7 +84,7 @@ //! # let y_true = dev.sample_normal::>().softmax(); //! # let y = model.forward(dev.zeros::>().trace()); //! # let loss = cross_entropy_with_logits_loss(y, y_true); -//! # let gradients: Gradients = loss.backward(); +//! # let gradients: Gradients = loss.backward(); //! // Use stochastic gradient descent (Sgd), with a learning rate of 1e-2, and 0.9 momentum. //! let mut opt = Sgd::new(SgdConfig { //! 
lr: 1e-2, diff --git a/src/nn/add_into.rs b/src/nn/add_into.rs index c5f7092ce..788ea018e 100644 --- a/src/nn/add_into.rs +++ b/src/nn/add_into.rs @@ -206,7 +206,7 @@ mod tests { fn test_missing_gradients() { let dev: TestDevice = Default::default(); let mut model: AddInto<(Linear<5, 3, _>, Linear<5, 3, _>)> = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused = Default::default(); diff --git a/src/nn/impl_module_for_tuples.rs b/src/nn/impl_module_for_tuples.rs index 1f9d1fb38..f10a66061 100644 --- a/src/nn/impl_module_for_tuples.rs +++ b/src/nn/impl_module_for_tuples.rs @@ -239,7 +239,7 @@ mod tests { fn test_tuple_missing_gradients() { let dev: TestDevice = Default::default(); let mut model: (Linear<5, 3, _>, Linear<5, 3, _>, Linear<5, 3, _>) = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused: UnusedTensors = Default::default(); diff --git a/src/nn/layer_norm.rs b/src/nn/layer_norm.rs index 96e00bfd2..842cb99a3 100644 --- a/src/nn/layer_norm.rs +++ b/src/nn/layer_norm.rs @@ -167,7 +167,7 @@ mod tests { let dev: TestDevice = Default::default(); let mut model: LayerNorm1D<5, _> = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused = Default::default(); diff --git a/src/nn/linear.rs b/src/nn/linear.rs index d3b33295f..66aa8bbb0 100644 --- a/src/nn/linear.rs +++ b/src/nn/linear.rs @@ -250,7 +250,7 @@ mod tests { let dev: TestDevice = Default::default(); let mut model: Linear<5, 3, _> = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused = Default::default(); diff --git a/src/nn/mod.rs b/src/nn/mod.rs index 58335aa1c..c18e45c63 100644 --- a/src/nn/mod.rs +++ b/src/nn/mod.rs @@ -133,9 +133,9 @@ mod tests { use crate::{gradients::Gradients, optim::ParamUpdater, shapes::Dtype, tensor::DeviceStorage}; #[derive(Default)] - pub struct SimpleUpdater(pub Gradients); + pub struct SimpleUpdater(pub Gradients); - impl ParamUpdater for SimpleUpdater { + impl ParamUpdater for SimpleUpdater { fn update_param( &mut self, p: &mut crate::tensor::Tensor, diff --git a/src/nn/repeated.rs b/src/nn/repeated.rs index fd22aac15..c7d94552b 100644 --- a/src/nn/repeated.rs +++ b/src/nn/repeated.rs @@ -121,7 +121,7 @@ mod tests { let dev: TestDevice = Default::default(); let mut model: Repeated, 3> = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused = Default::default(); diff --git a/src/nn/split_into.rs b/src/nn/split_into.rs index 4097b38ec..05a81583d 100644 --- a/src/nn/split_into.rs +++ b/src/nn/split_into.rs @@ -228,7 +228,7 @@ mod tests { fn test_missing_gradients() { let dev: TestDevice = Default::default(); let mut model: SplitInto<(Linear<5, 3, _>, Linear<5, 3, _>)> = dev.build_module(); - let mut g: SimpleUpdater<_> = Default::default(); + let mut g: SimpleUpdater = Default::default(); // no gradients present let mut unused = Default::default(); diff --git a/src/optim/adam/mod.rs b/src/optim/adam/mod.rs index 8d93cc9be..4148a8a4e 100644 --- a/src/optim/adam/mod.rs +++ b/src/optim/adam/mod.rs @@ -8,7 +8,7 @@ use std::marker::PhantomData; use crate::{ gradients::Gradients, 
shapes::{Dtype, Shape}, - tensor::{Cpu, DeviceStorage}, + tensor::DeviceStorage, }; use super::{GradientUpdate, Optimizer, OptimizerUpdateError, ParamUpdater, WeightDecay}; @@ -77,19 +77,19 @@ impl Default for AdamConfig { /// /// See module level documentation at [crate::optim] for examples of how to actually use an optimizer. #[derive(Debug)] -pub struct Adam { +pub struct Adam { /// Hyperparameter configuration pub cfg: AdamConfig, t: i32, - gradients: Gradients, - moment1: Gradients, - moment2: Gradients, + gradients: Gradients, + moment1: Gradients, + moment2: Gradients, marker: PhantomData<*const M>, } -impl Default for Adam +impl Default for Adam where AdamConfig: Default, { @@ -99,7 +99,7 @@ where } } -impl Adam { +impl Adam { /// Constructs using hyperparameters from `cfg`. pub fn new(cfg: AdamConfig) -> Self { Self { @@ -125,7 +125,7 @@ pub(super) trait AdamKernel: DeviceStorage { ) -> Result<(), Self::Err>; } -impl, E: Dtype> ParamUpdater for Adam { +impl, E: Dtype> ParamUpdater for Adam { fn update_param( &mut self, p: &mut crate::tensor::Tensor, @@ -145,14 +145,14 @@ impl, E: Dtype> ParamUpdater for Adam< } } -impl> Optimizer for Adam +impl, D: AdamKernel, E: Dtype> Optimizer for Adam where Self: ParamUpdater, { fn update( &mut self, module: &mut M, - gradients: Gradients, + gradients: Gradients, ) -> Result<(), OptimizerUpdateError> { self.t = self.t.checked_add(1).unwrap(); self.gradients = gradients; diff --git a/src/optim/mod.rs b/src/optim/mod.rs index 199ca890b..f8787a779 100644 --- a/src/optim/mod.rs +++ b/src/optim/mod.rs @@ -23,7 +23,7 @@ //! # let loss = losses::mse_loss(y, dev.zeros()); //! // -- snip loss computation -- //! -//! let gradients: Gradients = loss.backward(); +//! let gradients: Gradients = loss.backward(); //! opt.update(&mut model, gradients); //! ``` diff --git a/src/optim/optimizer.rs b/src/optim/optimizer.rs index 2c31f19a2..6d8fe4ef9 100644 --- a/src/optim/optimizer.rs +++ b/src/optim/optimizer.rs @@ -1,7 +1,7 @@ use crate::{ gradients::Gradients, shapes::{Dtype, Shape}, - tensor::{DeviceStorage, Tensor}, + tensor::{DeviceStorage, HasErr, Tensor}, unique_id::{HasUniqueId, UniqueId}, }; @@ -79,7 +79,7 @@ pub(super) fn momentum_to_cuda(wd: Option>) -> (Momentum /// /// 3. Optimizer itself is generic over M, not the update method. This means a single optimizer object /// can only work on objects of type `M`. This also requires you to specify the model up front for the optimizer. -pub trait Optimizer, D: DeviceStorage, E: Dtype> { +pub trait Optimizer { /// Updates all of `module`'s parameters using `gradients`. /// /// Requires a `&mut self` because the optimizer may change some internally @@ -87,10 +87,30 @@ pub trait Optimizer, D: DeviceStorage, E: Dtype> { fn update( &mut self, module: &mut M, - gradients: Gradients, + gradients: Gradients, ) -> Result<(), OptimizerUpdateError>; } +/// Represents something that can be updated with a [ParamUpdater]. +pub trait GradientUpdate { + /// Updates self given the [ParamUpdater]. + fn update>( + &mut self, + updater: &mut U, + unused: &mut UnusedTensors, + ) -> Result<(), D::Err>; +} + +impl GradientUpdate for Tensor { + fn update>( + &mut self, + updater: &mut U, + unused: &mut UnusedTensors, + ) -> Result<(), ::Err> { + updater.update_param(self, unused) + } +} + /// Represents something that can update a tensor. /// /// See [crate::optim::Sgd] and [crate::optim::Adam] for examples on implementing this. 
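[Editor's note] The `GradientUpdate` / `ParamUpdater` pair that the optimizer hunks lean on is a small visitor pattern: anything implementing `GradientUpdate` knows how to walk its parameters and hand each one to a `ParamUpdater`, with `Tensor` as the leaf case. A rough sketch with hypothetical, simplified types (the real traits also thread `UnusedTensors` and a `Result`):

```rust
// Sketch: optimizer-as-visitor over a module's parameters.
struct Tensor(Vec<f32>);

trait ParamUpdater {
    fn update_param(&mut self, p: &mut Tensor);
}

trait GradientUpdate {
    fn update<U: ParamUpdater>(&mut self, updater: &mut U);
}

// A leaf parameter hands itself to the updater...
impl GradientUpdate for Tensor {
    fn update<U: ParamUpdater>(&mut self, updater: &mut U) {
        updater.update_param(self);
    }
}

// ...while a composite module recurses into its children.
struct Linear {
    weight: Tensor,
    bias: Tensor,
}

impl GradientUpdate for Linear {
    fn update<U: ParamUpdater>(&mut self, updater: &mut U) {
        self.weight.update(updater);
        self.bias.update(updater);
    }
}

// Toy "optimizer": subtract a fixed step from every element.
struct FixedStep(f32);

impl ParamUpdater for FixedStep {
    fn update_param(&mut self, p: &mut Tensor) {
        for x in p.0.iter_mut() {
            *x -= self.0;
        }
    }
}

fn main() {
    let mut layer = Linear {
        weight: Tensor(vec![1.0; 6]),
        bias: Tensor(vec![1.0; 3]),
    };
    let mut opt = FixedStep(0.1);
    layer.update(&mut opt);
    assert!((layer.bias.0[0] - 0.9).abs() < 1e-6);
}
```

This split is why `Optimizer` can shed its `D: DeviceStorage` parameter: the device shows up only on the tensors being visited, not on the visitor's public interface.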
@@ -124,24 +144,6 @@ impl UnusedTensors { } } -/// Represents something that can be updated with a [ParamUpdater]. -pub trait GradientUpdate: Sized { - /// Updates self given the [ParamUpdater]. - fn update(&mut self, updater: &mut U, unused: &mut UnusedTensors) -> Result<(), D::Err> - where - U: ParamUpdater; -} - -impl GradientUpdate for Tensor { - fn update>( - &mut self, - opt: &mut U, - unused: &mut UnusedTensors, - ) -> Result<(), D::Err> { - opt.update_param(self, unused) - } -} - /// An error indicating that a parameter was not used in gradient /// computation, and was therefore not present in [Gradients] /// while a [GradientUpdate] was trying to update it. diff --git a/src/optim/rmsprop/mod.rs b/src/optim/rmsprop/mod.rs index a989fa866..4cc1f3f90 100644 --- a/src/optim/rmsprop/mod.rs +++ b/src/optim/rmsprop/mod.rs @@ -8,7 +8,7 @@ use std::marker::PhantomData; use crate::{ gradients::Gradients, shapes::{Dtype, Shape}, - tensor::{Cpu, DeviceStorage, OneFillStorage, Tensor}, + tensor::{DeviceStorage, OneFillStorage, Tensor}, }; use super::{ @@ -87,20 +87,20 @@ impl Default for RMSpropConfig { /// /// See module level documentation at [crate::optim] for examples of how to actually use an optimizer. #[derive(Debug)] -pub struct RMSprop { +pub struct RMSprop { /// Hyperparameter configuration pub cfg: RMSpropConfig, step: usize, - momentums: Gradients, - square_avg: Gradients, - grad_avg: Gradients, - gradients: Gradients, + momentums: Gradients, + square_avg: Gradients, + grad_avg: Gradients, + gradients: Gradients, marker: PhantomData<*const M>, } -impl Default for RMSprop +impl Default for RMSprop where RMSpropConfig: Default, { @@ -110,7 +110,7 @@ where } } -impl RMSprop { +impl RMSprop { /// Constructs using hyperparameters from `cfg`. pub fn new(cfg: RMSpropConfig) -> Self { Self { @@ -137,7 +137,7 @@ pub(super) trait RMSpropKernel: DeviceStorage { ) -> Result<(), Self::Err>; } -impl + OneFillStorage> ParamUpdater for RMSprop { +impl + OneFillStorage> ParamUpdater for RMSprop { fn update_param( &mut self, p: &mut Tensor, @@ -162,14 +162,14 @@ impl + OneFillStorage> ParamUpdater for RM } } -impl> Optimizer for RMSprop +impl, D: RMSpropKernel, E: Dtype> Optimizer for RMSprop where Self: ParamUpdater, { fn update( &mut self, module: &mut M, - gradients: Gradients, + gradients: Gradients, ) -> Result<(), OptimizerUpdateError> { self.gradients = gradients; let mut unused = Default::default(); diff --git a/src/optim/sgd/mod.rs b/src/optim/sgd/mod.rs index b68d063b7..d5613f3f7 100644 --- a/src/optim/sgd/mod.rs +++ b/src/optim/sgd/mod.rs @@ -7,7 +7,7 @@ use std::marker::PhantomData; use crate::gradients::Gradients; use crate::shapes::{Dtype, Shape}; -use crate::tensor::{Cpu, DeviceStorage, Tensor}; +use crate::tensor::{DeviceStorage, Tensor}; use super::optimizer::*; @@ -115,17 +115,17 @@ impl Default for SgdConfig { /// /// See module level documentation at [crate::optim] for examples of how to actually use an optimizer. 
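[Editor's note] After this refactor the optimizer structs (`Sgd`, `Adam`, `RMSprop`) are generic over the model `M` (plus dtype) rather than a device, because all per-parameter state now lives in device-agnostic `Gradients` maps. A simplified, hypothetical sketch of that shape, using a plain map for velocity and the classic momentum update that the SGD kernel applies per element:

```rust
// Sketch: an SGD-like optimizer generic over the model type only.
use std::collections::HashMap;
use std::marker::PhantomData;

#[derive(Debug, Clone, Copy)]
struct SgdConfig {
    lr: f32,
    momentum: f32,
}

impl Default for SgdConfig {
    fn default() -> Self {
        Self { lr: 1e-2, momentum: 0.9 }
    }
}

struct Sgd<M> {
    cfg: SgdConfig,
    // Stand-in for `Gradients`: velocity per parameter id, type-erased in the real code.
    velocity: HashMap<u64, Vec<f32>>,
    marker: PhantomData<*const M>,
}

impl<M> Sgd<M> {
    fn new(cfg: SgdConfig) -> Self {
        Self { cfg, velocity: HashMap::new(), marker: PhantomData }
    }

    // Momentum update for one parameter: v = momentum * v + g; p -= lr * v
    fn step(&mut self, id: u64, param: &mut [f32], grad: &[f32]) {
        let v = self.velocity.entry(id).or_insert_with(|| vec![0.0; param.len()]);
        for ((p, g), v) in param.iter_mut().zip(grad).zip(v.iter_mut()) {
            *v = self.cfg.momentum * *v + *g;
            *p -= self.cfg.lr * *v;
        }
    }
}

struct MyModel;

fn main() {
    let mut opt: Sgd<MyModel> = Sgd::new(SgdConfig::default());
    let mut w = vec![1.0f32; 3];
    opt.step(0, &mut w, &[1.0, 1.0, 1.0]);
    assert!((w[0] - 0.99).abs() < 1e-6);
}
```

The `PhantomData<*const M>` mirrors the patch: it ties the optimizer to one model type without requiring `M` to be `Send`/`Sync` or otherwise constrained.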
#[derive(Debug)] -pub struct Sgd { +pub struct Sgd { /// Hyperparameter configuration pub cfg: SgdConfig, - velocity: Gradients, - gradients: Gradients, + velocity: Gradients, + gradients: Gradients, marker: PhantomData<*const M>, } -impl Default for Sgd +impl Default for Sgd where SgdConfig: Default, { @@ -135,7 +135,7 @@ where } } -impl Sgd { +impl Sgd { /// Constructs using hyperparameters from `cfg` pub fn new(cfg: SgdConfig) -> Self { Self { @@ -157,7 +157,7 @@ pub(super) trait SgdKernel: DeviceStorage { ) -> Result<(), Self::Err>; } -impl, E: Dtype> ParamUpdater for Sgd { +impl, E: Dtype> ParamUpdater for Sgd { fn update_param( &mut self, p: &mut Tensor, @@ -175,14 +175,14 @@ impl, E: Dtype> ParamUpdater for Sgd { } } -impl> Optimizer for Sgd +impl, D: SgdKernel, E: Dtype> Optimizer for Sgd where Self: ParamUpdater, { fn update( &mut self, module: &mut M, - gradients: Gradients, + gradients: Gradients, ) -> Result<(), OptimizerUpdateError> { self.gradients = gradients; let mut unused = Default::default(); diff --git a/src/tensor/storage_traits.rs b/src/tensor/storage_traits.rs index 5857c2121..b5ab11e25 100644 --- a/src/tensor/storage_traits.rs +++ b/src/tensor/storage_traits.rs @@ -2,7 +2,7 @@ use rand::distributions::Distribution; use rand_distr::{Standard, StandardNormal}; use crate::{ - shapes::{ConstShape, Dtype, HasDtype, HasShape, HasUnitType, Shape, Unit}, + shapes::{ConstShape, Dtype, HasShape, HasUnitType, Shape, Unit}, unique_id::unique_id, }; @@ -44,12 +44,14 @@ pub trait DeviceStorage: 'static + Default + Clone + HasErr { } /// Internal trait - Represents something that can allocate its own gradient. -pub trait AllocGrad: HasShape + HasDtype { - fn try_alloc_grad(&self) -> Result, D::Err>; +pub trait AllocGrad: HasErr { + type Gradient: 'static; + fn try_alloc_grad(&self) -> Result; } -impl AllocGrad for Tensor { - fn try_alloc_grad(&self) -> Result, D::Err> { +impl AllocGrad for Tensor { + type Gradient = D::Storage; + fn try_alloc_grad(&self) -> Result { self.device.try_alloc_grad(&self.storage) } } diff --git a/src/tensor_ops/utilities/backward.rs b/src/tensor_ops/utilities/backward.rs index c99a1690c..bcc6343ab 100644 --- a/src/tensor_ops/utilities/backward.rs +++ b/src/tensor_ops/utilities/backward.rs @@ -1,21 +1,21 @@ use crate::gradients::{Gradients, OwnedTape, Tape}; use crate::shapes::{Dtype, Rank0}; -use crate::tensor::{DeviceStorage, OneFillStorage, SplitTape, Tensor}; +use crate::tensor::{HasErr, OneFillStorage, SplitTape, Tensor}; /// Runs backprop algorithm with all operations contained in the tape that `t` has. /// /// This function takes ownership of `self` and returns [Gradients]. -pub trait Backward: Sized { +pub trait Backward: HasErr { /// Runs backprop - fn backward(self) -> Gradients { + fn backward(self) -> Gradients { self.try_backward().unwrap() } /// Fallible version of [Backward::backward] - fn try_backward(self) -> Result, D::Err>; + fn try_backward(self) -> Result; } -impl> Backward for Tensor> { - fn try_backward(self) -> Result, D::Err> { +impl> Backward for Tensor> { + fn try_backward(self) -> Result { let (t, mut tape) = self.split_tape(); tape.add_backward_op(move |grads| t.device.try_fill_with_ones(grads.get_mut(&t))); tape.0.execute()
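[Editor's note] The final two hunks carry the piece that makes all of the above hold together: `AllocGrad` gains an associated `Gradient` type (the tensor's `D::Storage<S, E>`), so callers of `Gradients::get`/`get_mut` know exactly which concrete type to downcast to, and `Backward::backward()` can return a plain `Gradients` after seeding the output gradient with ones and executing the tape. A minimal sketch of the associated-type trick, with hypothetical names and a `Vec<f32>` gradient standing in for real device storage:

```rust
// Sketch: each tensor-like value names its own concrete gradient type,
// and the device-free store downcasts to it at the call site.
use std::any::Any;
use std::collections::HashMap;

trait AllocGrad {
    type Gradient: 'static;
    fn id(&self) -> u64;
    fn alloc_grad(&self) -> Self::Gradient;
}

#[derive(Default)]
struct Gradients {
    by_id: HashMap<u64, Box<dyn Any>>,
}

impl Gradients {
    fn get_or_alloc_mut<T: AllocGrad>(&mut self, t: &T) -> &mut T::Gradient {
        self.by_id
            .entry(t.id())
            .or_insert_with(|| Box::new(t.alloc_grad()))
            .downcast_mut::<T::Gradient>()
            .unwrap()
    }
}

// A "cpu tensor" whose gradient is a zeroed Vec<f32> of the same length.
struct CpuTensor {
    id: u64,
    data: Vec<f32>,
}

impl AllocGrad for CpuTensor {
    type Gradient = Vec<f32>;
    fn id(&self) -> u64 {
        self.id
    }
    fn alloc_grad(&self) -> Vec<f32> {
        vec![0.0; self.data.len()]
    }
}

fn main() {
    let t = CpuTensor { id: 7, data: vec![1.0, 2.0, 3.0] };
    let mut grads = Gradients::default();
    grads.get_or_alloc_mut(&t)[0] = 1.0; // seed, like `try_fill_with_ones` on the output
    assert_eq!(grads.get_or_alloc_mut(&t), &vec![1.0, 0.0, 0.0]);
}
```

Because the downcast target comes from `T::Gradient` rather than from a `D` parameter on the store, `Gradients`, `backward()`, and the optimizers can all be written without mentioning the device at the type level.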