Adds TensorCollection #469

Merged · 21 commits · Feb 22, 2023
Changes from 12 commits
4 changes: 3 additions & 1 deletion examples/02-ops.rs
@@ -2,7 +2,7 @@

use dfdx::{
shapes::{Rank0, Rank1, Rank2},
tensor::{AsArray, Cpu, SampleTensor, Tensor, ToDevice},
tensor::{AsArray, Cpu, SampleTensor, Tensor},
tensor_ops::{MeanTo, TryMatMul},
};

@@ -63,6 +63,8 @@ fn main() {
// these operations are equal across devices
#[cfg(feature = "cuda")]
{
use dfdx::tensor::ToDevice;

let cpu = Cpu::default();

let a: Tensor<Rank1<3>, f32, _> = dev.sample_normal();
58 changes: 8 additions & 50 deletions src/nn/add_into.rs
@@ -1,6 +1,6 @@
use crate::{optim::*, shapes::Dtype, tensor_ops::Device};
use crate::{shapes::Dtype, tensor::*};

use super::{BuildModule, BuildOnDevice, Module, ModuleMut, ResetParams, ToDevice};
use super::{visitors::*, BuildModule, BuildOnDevice, Module, ModuleMut, ToDevice};

/// Add inputs together into a single tensor. `T` should be a tuple
//// where every element of the tuple has the same output type
@@ -23,28 +23,19 @@ use super::{BuildModule, BuildOnDevice, Module, ModuleMut, ResetParams, ToDevice
#[derive(Debug, Default, Clone)]
pub struct AddInto<T>(pub T);

impl<T: GradientUpdate<D, E>, D: Device<E>, E: Dtype> GradientUpdate<D, E> for AddInto<T> {
fn update<U>(&mut self, updater: &mut U, unused: &mut UnusedTensors) -> Result<(), <D>::Err>
where
U: ParamUpdater<D, E>,
{
self.0.update(updater, unused)
}
}

impl<T: BuildOnDevice<D, E>, D: Device<E>, E: Dtype> BuildOnDevice<D, E> for AddInto<T> {
impl<T: BuildOnDevice<D, E>, D: DeviceStorage, E: Dtype> BuildOnDevice<D, E> for AddInto<T> {
type Built = AddInto<T::Built>;
}

impl<T: BuildModule<D, E>, D: Device<E>, E: Dtype> BuildModule<D, E> for AddInto<T> {
impl<T: BuildModule<D, E>, D: DeviceStorage, E: Dtype> BuildModule<D, E> for AddInto<T> {
fn try_build(device: &D) -> Result<Self, <D>::Err> {
Ok(Self(BuildModule::try_build(device)?))
}
}

impl<T: ResetParams<D, E>, D: Device<E>, E: Dtype> ResetParams<D, E> for AddInto<T> {
fn try_reset_params(&mut self) -> Result<(), <D>::Err> {
self.0.try_reset_params()
impl<E: Dtype, D: DeviceStorage, T: TensorCollection<E, D>> TensorCollection<E, D> for AddInto<T> {
fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
visitor.visit_module(|s| &s.0, |s| &mut s.0, "0")
}
}

@@ -102,11 +93,9 @@ mod tests {
use super::*;
use crate::{
gradients::OwnedTape,
nn::{builders::*, tests::SimpleUpdater, DeviceBuildExt},
nn::{builders::*, DeviceBuildExt},
shapes::*,
tensor::*,
tests::{TestDevice, TestDtype},
unique_id::HasUniqueId,
};

type TestAddIntoCpu = AddInto<(Linear<2, 5>, Linear<3, 5>)>;
@@ -221,37 +210,6 @@ mod tests {
));
}

#[test]
fn test_missing_gradients() {
let dev: TestDevice = Default::default();
type Model = AddInto<(Linear<5, 3>, Linear<5, 3>)>;
let mut model = dev.build_module::<Model, TestDtype>();
let mut g: SimpleUpdater = Default::default();

// no gradients present
let mut unused = Default::default();
model.update(&mut g, &mut unused).unwrap();
assert_eq!(
&unused.ids,
&[
*model.0 .0.weight.id(),
*model.0 .0.bias.id(),
*model.0 .1.weight.id(),
*model.0 .1.bias.id()
]
);

// weight gradient is present
g.0.try_alloc_for(&model.0 .0.weight).unwrap();
g.0.try_alloc_for(&model.0 .0.bias).unwrap();
g.0.try_alloc_for(&model.0 .1.weight).unwrap();
g.0.try_alloc_for(&model.0 .1.bias).unwrap();

let mut unused = Default::default();
model.update(&mut g, &mut unused).unwrap();
assert!(unused.is_empty());
}

#[test]
fn longer_network() {
let dev: TestDevice = Default::default();
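The core of this change: the separate GradientUpdate and ResetParams impls are folded into a single TensorCollection impl, whose iter_tensors drives a visitor over the module's fields. As a minimal sketch of the same pattern, here is how a hypothetical single-field wrapper (Residual is illustrative, not part of this PR) would describe itself, assuming the visitors module is in scope as in the `use super::{visitors::*, ...}` line above:

pub struct Residual<T>(pub T);

impl<E: Dtype, D: DeviceStorage, T: TensorCollection<E, D>> TensorCollection<E, D> for Residual<T> {
    fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
        // Recurse into the wrapped module; "0" is the field name the visitor sees.
        visitor.visit_module(|s| &s.0, |s| &mut s.0, "0")
    }
}

Optimizer updates and parameter resets then reuse this one traversal instead of requiring a separate trait impl for each.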
63 changes: 42 additions & 21 deletions src/nn/batchnorm2d.rs
@@ -1,6 +1,6 @@
use crate::{gradients::*, optim::*, shapes::*, tensor::*, tensor_ops::*};
use crate::{gradients::*, shapes::*, tensor::*, tensor_ops::*};

use super::{BuildModule, BuildOnDevice, Module, ModuleMut, ResetParams, ToDevice};
use super::{visitors::*, BuildModule, BuildOnDevice, Module, ModuleMut, ToDevice};

pub mod builder {
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
@@ -183,13 +183,32 @@ impl<const C: usize, E: Dtype, D: Device<E>> BuildModule<D, E> for BatchNorm2D<C
}
}

impl<const C: usize, E: Dtype, D: Device<E>> ResetParams<D, E> for BatchNorm2D<C, E, D> {
fn try_reset_params(&mut self) -> Result<(), D::Err> {
self.scale.try_fill_with_ones()?;
self.bias.try_fill_with_zeros()?;
self.running_mean.try_fill_with_zeros()?;
self.running_var.try_fill_with_ones()?;
Ok(())
impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for BatchNorm2D<C, E, D> {
fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
visitor.visit_tensor(
|s| &s.scale,
|s| &mut s.scale,
"scale",
TensorOptions::reset_to_ones(),
)?;
visitor.visit_tensor(
|s| &s.bias,
|s| &mut s.bias,
"bias",
TensorOptions::reset_to_zeros(),
)?;
visitor.visit_tensor(
|s| &s.running_mean,
|s| &mut s.running_mean,
"running_mean",
TensorOptions::detached(|t| t.try_fill_with_zeros()),
)?;
visitor.visit_tensor(
|s| &s.running_var,
|s| &mut s.running_var,
"running_var",
TensorOptions::detached(|t| t.try_fill_with_ones()),
)
}
}

@@ -209,21 +228,10 @@ impl<const C: usize, E: Dtype, D1: Device<E>, D2: Device<E>> ToDevice<D2>
}
}

impl<const C: usize, E: Dtype, D: Device<E>> GradientUpdate<D, E> for BatchNorm2D<C, E, D> {
fn update<U>(&mut self, updater: &mut U, unused: &mut UnusedTensors) -> Result<(), <D>::Err>
where
U: ParamUpdater<D, E>,
{
self.scale.update(updater, unused)?;
self.bias.update(updater, unused)?;
Ok(())
}
}

#[cfg(test)]
mod tests {
use super::builder::BatchNorm2D;
use crate::{nn::*, shapes::*, tensor::*, tensor_ops::*, tests::*};
use crate::{nn::*, optim::*, shapes::*, tensor::*, tensor_ops::*, tests::*};

#[test]
fn test_batchnorm2d_3d_forward_mut() {
@@ -344,4 +352,17 @@ mod tests {
],
);
}

#[test]
fn test_batchnorm2d_update() {
let dev: TestDevice = Default::default();

let x1: Tensor<Rank3<3, 4, 5>, TestDtype, _> = dev.sample_normal();
let mut bn = dev.build_module::<BatchNorm2D<3>, TestDtype>();
let y = bn.forward_mut(x1.trace());
let g = y.square().mean().backward();

let mut opt = Sgd::new(&bn, Default::default());
opt.update(&mut bn, g).expect("");
}
}
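The BatchNorm2D change also shows how per-tensor behavior is expressed through TensorOptions: scale and bias are trainable and reset to ones/zeros, while the running statistics are registered as detached, so optimizers skip them but they are still re-initialized through the provided closure. A minimal sketch of the same split for a hypothetical layer (RunningScale and its fields are illustrative):

struct RunningScale<const C: usize, E: Dtype, D: DeviceStorage> {
    scale: Tensor<Rank1<C>, E, D>,
    running_mean: Tensor<Rank1<C>, E, D>,
}

impl<const C: usize, E: Dtype, D: Device<E>> TensorCollection<E, D> for RunningScale<C, E, D> {
    fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
        // Trainable parameter: updated by optimizers, re-initialized to ones on reset.
        visitor.visit_tensor(
            |s| &s.scale,
            |s| &mut s.scale,
            "scale",
            TensorOptions::reset_to_ones(),
        )?;
        // Detached buffer: ignored by optimizers, reset via the closure instead.
        visitor.visit_tensor(
            |s| &s.running_mean,
            |s| &mut s.running_mean,
            "running_mean",
            TensorOptions::detached(|t| t.try_fill_with_zeros()),
        )
    }
}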
52 changes: 24 additions & 28 deletions src/nn/conv.rs
@@ -1,9 +1,9 @@
use num_traits::Float;
use rand_distr::uniform::SampleUniform;

use crate::{gradients::Tape, optim::*, shapes::*, tensor::*, tensor_ops::*};
use crate::{gradients::Tape, shapes::*, tensor::*, tensor_ops::*};

use super::{BuildModule, BuildOnDevice, Module, ModuleMut, ResetParams, ToDevice};
use super::{visitors::*, BuildModule, BuildOnDevice, Module, ModuleMut, ToDevice};

pub mod builder {
#[derive(Debug)]
@@ -54,18 +54,30 @@ pub struct Conv2D<
}

impl<const I: usize, const O: usize, const K: usize, const S: usize, const P: usize, E, D>
GradientUpdate<D, E> for Conv2D<I, O, K, S, P, E, D>
TensorCollection<E, D> for Conv2D<I, O, K, S, P, E, D>
where
E: Dtype,
E: Dtype + Float + SampleUniform,
D: Device<E>,
{
fn update<U>(&mut self, updater: &mut U, unused: &mut UnusedTensors) -> Result<(), <D>::Err>
where
U: ParamUpdater<D, E>,
{
self.weight.update(updater, unused)?;
self.bias.update(updater, unused)?;
Ok(())
fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
visitor.visit_tensor(
|s| &s.weight,
|s| &mut s.weight,
"weight",
TensorOptions::reset_with(|t| {
let b = E::ONE / E::from_usize(I * K * K).unwrap().sqrt();
t.try_fill_with_distr(rand_distr::Uniform::new(-b, b))
}),
)?;
visitor.visit_tensor(
|s| &s.bias,
|s| &mut s.bias,
"bias",
TensorOptions::reset_with(|t| {
let b = E::ONE / E::from_usize(I * K * K).unwrap().sqrt();
t.try_fill_with_distr(rand_distr::Uniform::new(-b, b))
}),
)
}
}

@@ -85,23 +97,6 @@ where
}
}

impl<const I: usize, const O: usize, const K: usize, const S: usize, const P: usize, E, D>
ResetParams<D, E> for Conv2D<I, O, K, S, P, E, D>
where
E: Dtype + Float + SampleUniform,
D: Device<E>,
{
fn try_reset_params(&mut self) -> Result<(), <D>::Err> {
let k = E::from_usize(I * K * K).unwrap();
let bound = E::ONE / k.sqrt();
self.weight
.try_fill_with_distr(rand_distr::Uniform::new(-bound, bound))?;
self.bias
.try_fill_with_distr(rand_distr::Uniform::new(-bound, bound))?;
Ok(())
}
}

impl<const I: usize, const O: usize, const K: usize, const S: usize, const P: usize, E, D1, D2>
ToDevice<D2> for Conv2D<I, O, K, S, P, E, D1>
where
@@ -175,6 +170,7 @@ impl<'a, B: Dim, const C: usize, H: Dim, W: Dim, E: Dtype, D: Device<E>, T: Tape
mod tests {
use crate::{
nn::DeviceBuildExt,
optim::*,
tensor::{AsArray, SampleTensor, ZerosTensor},
tests::*,
};
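For Conv2D, the uniform initialization moves into TensorOptions::reset_with closures, which is why the Float + SampleUniform bound now sits on the TensorCollection impl itself. A sketch of the same idea for a hypothetical bias-only module (ScaledBias is illustrative; the bound computation mirrors the Conv2D code above, with Float and SampleUniform imported as at the top of this file):

struct ScaledBias<const N: usize, E: Dtype, D: DeviceStorage> {
    bias: Tensor<Rank1<N>, E, D>,
}

impl<const N: usize, E, D> TensorCollection<E, D> for ScaledBias<N, E, D>
where
    E: Dtype + Float + SampleUniform,
    D: Device<E>,
{
    fn iter_tensors<V: TensorVisitor<Self, E, D>>(visitor: &mut V) -> Result<(), V::Err> {
        visitor.visit_tensor(
            |s| &s.bias,
            |s| &mut s.bias,
            "bias",
            // Custom reset: uniform in [-1/sqrt(N), 1/sqrt(N)], as Conv2D does for I*K*K.
            TensorOptions::reset_with(|t| {
                let b = E::ONE / E::from_usize(N).unwrap().sqrt();
                t.try_fill_with_distr(rand_distr::Uniform::new(-b, b))
            }),
        )
    }
}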