Feat/autodiff/checkpoint ops #1358

Merged: 64 commits into main from feat/autodiff/checkpoint_ops, Feb 26, 2024
Commits (64)
a4159d2
made it to ownership stage
louisfd Jan 29, 2024
478023f
working test!
louisfd Jan 30, 2024
1878ed9
refactor tests
louisfd Jan 31, 2024
2b1e832
tests for computed
louisfd Jan 31, 2024
d078587
retroforward works well (without n_required)
louisfd Feb 1, 2024
7daef33
refactor inner states
louisfd Feb 1, 2024
9f6f218
refactor into files
louisfd Feb 1, 2024
a91bb2b
refactor into files
louisfd Feb 1, 2024
ee3acd8
subtract overflow
louisfd Feb 1, 2024
cf195be
subtract overflow
louisfd Feb 1, 2024
6734590
tests pass
louisfd Feb 2, 2024
e40a035
cleanup and doc
louisfd Feb 2, 2024
85f2a79
merge main
louisfd Feb 2, 2024
b72cc20
code review
louisfd Feb 5, 2024
ab79012
remove retro forward
louisfd Feb 5, 2024
dfb7c97
some refactoring
louisfd Feb 5, 2024
368ea3f
switch to primitives
louisfd Feb 6, 2024
73a5f77
some cleanup
louisfd Feb 6, 2024
5d4e04d
clippy
louisfd Feb 6, 2024
7cade3c
wip integrate to ops
louisfd Feb 6, 2024
e321dab
compiles
louisfd Feb 7, 2024
99fda7b
some use cases work
louisfd Feb 7, 2024
4fc4648
notes
louisfd Feb 7, 2024
f9a2e74
fix bugs
louisfd Feb 8, 2024
d1b5c08
dirty - post pair prog
louisfd Feb 8, 2024
c1fae22
passes tests
louisfd Feb 8, 2024
2a8a18a
wip
louisfd Feb 10, 2024
1e520fb
wip
louisfd Feb 14, 2024
8174b4a
it works
louisfd Feb 15, 2024
94a8373
automatic tests
louisfd Feb 15, 2024
cb03b98
refactor and doc
louisfd Feb 16, 2024
612919b
wip bugfixes
louisfd Feb 16, 2024
0ea7ee2
confident everything works
louisfd Feb 16, 2024
b445187
fmt
louisfd Feb 16, 2024
75ee760
clippy
louisfd Feb 16, 2024
12cd9a3
refactor memory bound api
louisfd Feb 19, 2024
9925645
Merge branch 'main' into feat/autodiff/checkpoint
louisfd Feb 20, 2024
a219b34
ops
louisfd Feb 20, 2024
c61a51a
WIP fix bugs
nathanielsimard Feb 20, 2024
9ad240c
C
nathanielsimard Feb 20, 2024
8e2ad7c
checkpointer builder
louisfd Feb 21, 2024
bb0ce0e
finish builder refactor
louisfd Feb 21, 2024
47b44de
Merge branch 'feat/autodiff/checkpoint_ops' of github.com:tracel-ai/b…
louisfd Feb 21, 2024
dc5859e
wip retro forwards
louisfd Feb 21, 2024
e9aacdd
wip tensor memory bound
louisfd Feb 21, 2024
47efc4a
tensor ops compute or memory bound
louisfd Feb 21, 2024
1506764
all ops bound identified
louisfd Feb 21, 2024
88984ac
Merge branch 'main' into feat/autodiff/checkpoint_ops
louisfd Feb 21, 2024
6244787
merge main
louisfd Feb 21, 2024
fb38576
fix untracked bug
louisfd Feb 21, 2024
dfe82de
add checkpointing everywhere
louisfd Feb 22, 2024
085c28f
fmt
louisfd Feb 22, 2024
d56f6df
clippy
louisfd Feb 22, 2024
9580ba1
remove print
louisfd Feb 22, 2024
9edc8b3
Merge branch 'main' into feat/autodiff/checkpoint_ops
nathanielsimard Feb 22, 2024
bcfc649
Add autodiff
nathanielsimard Feb 22, 2024
1c9a2e7
Merge branch 'main' into feat/autodiff/checkpoint_ops
louisfd Feb 23, 2024
9a05b84
configurable checkpointing
louisfd Feb 23, 2024
0aeaa3f
alias ops for autodiff tests
louisfd Feb 23, 2024
eec4b12
tests in backends
louisfd Feb 23, 2024
4b69f01
clippy
louisfd Feb 23, 2024
5f0afe3
Merge branch 'main' into feat/autodiff/checkpoint_ops
louisfd Feb 23, 2024
851c4d6
Merge branch 'main' into feat/autodiff/checkpoint_ops
louisfd Feb 26, 2024
79eab9d
code review
louisfd Feb 26, 2024
Files changed
32 changes: 32 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion backend-comparison/Cargo.toml
@@ -10,7 +10,7 @@ repository = "https://github.com/tracel-ai/burn/tree/main/backend-comparison"
version.workspace = true

[features]
default = ["burn/std"]
default = ["burn/std", "burn/autodiff"]
candle-cpu = ["burn/candle"]
candle-cuda = ["burn/candle", "burn/cuda"]
candle-metal = ["burn/candle", "burn/metal"]
84 changes: 61 additions & 23 deletions backend-comparison/benches/custom_gelu.rs
@@ -1,4 +1,5 @@
use backend_comparison::persistence::save;
use burn::backend::Autodiff;
use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_common::benchmark::{run_benchmark, Benchmark};
use core::f64::consts::SQRT_2;
@@ -18,13 +19,18 @@ struct CustomGeluBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
device: B::Device,
kind: GeluKind,
autodiff: bool,
}

impl<B: Backend, const D: usize> Benchmark for CustomGeluBenchmark<B, D> {
type Args = Tensor<B, D>;

fn name(&self) -> String {
"gelu".into()
match self.autodiff {
true => "gelu_autodiff",
false => "gelu",
}
.into()
}

fn options(&self) -> Option<String> {
@@ -35,11 +41,26 @@ impl<B: Backend, const D: usize> Benchmark for CustomGeluBenchmark<B, D> {
vec![self.shape.dims.into()]
}

fn execute(&self, args: Self::Args) {
match self.kind {
GeluKind::Reference => burn::tensor::activation::gelu(args),
GeluKind::WithReferenceErf => gelu_custom(args, Tensor::erf),
GeluKind::WithCustomErf => gelu_custom(args, erf_custom),
fn execute(&self, tensor: Self::Args) {
match self.autodiff {
true => {
let tensor: Tensor<Autodiff<B>, D> = Tensor::from_inner(tensor).require_grad();
let output = match self.kind {
GeluKind::Reference => burn::tensor::activation::gelu(tensor.clone()),
GeluKind::WithReferenceErf => gelu_custom(tensor.clone(), Tensor::erf),
GeluKind::WithCustomErf => gelu_custom(tensor.clone(), erf_custom),
};
let mut gradients = output.sum().backward();
let _tmp = tensor.grad_remove(&mut gradients).unwrap();
}

false => {
match self.kind {
GeluKind::Reference => burn::tensor::activation::gelu(tensor),
GeluKind::WithReferenceErf => gelu_custom(tensor, Tensor::erf),
GeluKind::WithCustomErf => gelu_custom(tensor, erf_custom),
};
}
};
}

@@ -52,7 +73,7 @@ impl<B: Backend, const D: usize> Benchmark for CustomGeluBenchmark<B, D> {
}

fn num_samples(&self) -> usize {
50
10
}
}

@@ -97,22 +118,39 @@ fn bench<B: Backend>(device: &B::Device) {
const D: usize = 3;
let shape: Shape<D> = [32, 512, 2048].into();

let reference_gelu =
CustomGeluBenchmark::<B, D>::new(shape.clone(), device.clone(), GeluKind::Reference);
let reference_erf_gelu =
CustomGeluBenchmark::<B, D>::new(shape.clone(), device.clone(), GeluKind::WithReferenceErf);
let custom_erf_gelu =
CustomGeluBenchmark::<B, D>::new(shape, device.clone(), GeluKind::WithCustomErf);

save::<B>(
vec![
run_benchmark(reference_gelu),
run_benchmark(reference_erf_gelu),
run_benchmark(custom_erf_gelu),
],
device,
)
.unwrap();
let run = |autodiff: bool| {
let reference_gelu = CustomGeluBenchmark::<B, D>::new(
shape.clone(),
device.clone(),
GeluKind::Reference,
autodiff,
);
let reference_erf_gelu = CustomGeluBenchmark::<B, D>::new(
shape.clone(),
device.clone(),
GeluKind::WithReferenceErf,
autodiff,
);
let custom_erf_gelu = CustomGeluBenchmark::<B, D>::new(
shape.clone(),
device.clone(),
GeluKind::WithCustomErf,
autodiff,
);

save::<B>(
vec![
run_benchmark(reference_gelu),
run_benchmark(reference_erf_gelu),
run_benchmark(custom_erf_gelu),
],
device,
)
.unwrap();
};

run(false);
run(true);
}

fn main() {
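For readers following along, the autodiff branch added to `execute` above boils down to the standard Burn gradient workflow. The sketch below reproduces that path in isolation; the `NdArray` base backend (which assumes the `ndarray` feature), the tensor shape, and the `main` wrapper are illustrative choices, not part of this PR.

```rust
use burn::backend::{Autodiff, NdArray};
use burn::tensor::{activation, Distribution, Tensor};

fn main() {
    // Decorate any base backend with autodiff; here NdArray keeps the sketch CPU-only.
    type B = Autodiff<NdArray>;
    let device = Default::default();

    // Random input marked as requiring gradients, mirroring the benchmark's `require_grad()` call.
    let input: Tensor<B, 3> =
        Tensor::random([8, 64, 256], Distribution::Default, &device).require_grad();

    // Forward pass through the reference gelu, then backward from a scalar loss.
    let output = activation::gelu(input.clone());
    let mut gradients = output.sum().backward();

    // Retrieve (and drop from the gradient store) the gradient of the input,
    // exactly as the benchmark does with `grad_remove`.
    let grad = input.grad_remove(&mut gradients).unwrap();
    println!("input gradient shape: {:?}", grad.shape());
}
```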
77 changes: 46 additions & 31 deletions burn-book/src/advanced/backend-extension/custom-wgpu-kernel.md
@@ -31,10 +31,11 @@ pub trait Backend: burn::tensor::backend::Backend {
pub trait AutodiffBackend: Backend + burn::tensor::backend::AutodiffBackend {}
```

In our project, we can use these traits instead of the `burn::tensor::backend::{Backend, AutodiffBackend}`
traits provided by Burn. Burn's user APIs typically make use of the `Tensor` struct rather than
dealing directly with primitive tensor types. Therefore, we can encapsulate our newly defined
backend traits with functions that expose new operations while maintaining a consistent API.
In our project, we can use these traits instead of the
`burn::tensor::backend::{Backend, AutodiffBackend}` traits provided by Burn. Burn's user APIs
typically make use of the `Tensor` struct rather than dealing directly with primitive tensor types.
Therefore, we can encapsulate our newly defined backend traits with functions that expose new
operations while maintaining a consistent API.

```rust, ignore
/// We define our custom implementation using the added function on our custom backend.
@@ -193,9 +194,9 @@ impl<E: FloatElement> DynamicKernel for FusedMatmulAddRelu<E> {
}
```

Subsequently, we'll go into implementing our custom backend trait for the WGPU backend.
Note that we won't go into supporting the `fusion` feature flag in this tutorial, so
we implement the trait for the raw `WgpuBackend` type.
Subsequently, we'll go into implementing our custom backend trait for the WGPU backend. Note that we
won't go into supporting the `fusion` feature flag in this tutorial, so we implement the trait for
the raw `WgpuBackend` type.

```rust, ignore
/// Implement our custom backend trait for the existing backend `WgpuBackend`.
@@ -296,7 +297,7 @@ operations.
// Note that we could implement the backend trait only for the Wgpu backend instead of any backend that
// also implements our own API. This would allow us to call any function only implemented for Wgpu
// and potentially call a custom kernel crafted only for this task.
impl<B: Backend> Backend for Autodiff<B> {
impl<B: Backend, C: CheckpointStrategy> Backend for Autodiff<B, C> {
fn fused_matmul_add_relu<const D: usize>(
lhs: FloatTensor<Self, D>,
rhs: FloatTensor<Self, D>,
@@ -309,30 +310,32 @@ impl<B: Backend> Backend for Autodiff<B> {
// Implement the backward trait for the given backend B, the node gradient being of rank D
// with three other gradients to calculate (lhs, rhs, and bias).
impl<B: Backend, const D: usize> Backward<B, D, 3> for FusedMatmulAddReluBackward<D> {
// The state that must be built during the forward pass to compute the backward pass.
// Our state that we must build during the forward pass to compute the backward pass.
//
// Note that we could improve the performance further by only keeping the state of
// tensors that are tracked, improving memory management, but for simplicity, we avoid
// that part.
type State = (
FloatTensor<B, D>,
FloatTensor<B, D>,
FloatTensor<B, D>,
Shape<D>,
);

fn backward(self, ops: Ops<Self::State, 3>, grads: &mut Gradients) {
type State = (NodeID, NodeID, FloatTensor<B, D>, Shape<D>);

fn backward(
self,
ops: Ops<Self::State, 3>,
grads: &mut Gradients,
checkpointer: &mut Checkpointer,
) {
// Get the nodes of each variable.
let [node_lhs, node_rhs, node_bias] = ops.parents;
// Fetch the gradient for the current node.
let grad = grads.consume::<B, D>(&ops.node);

// Set the state.
let (lhs, rhs, output, shape_bias) = ops.state;
// Set our state.
let (lhs_state, rhs_state, output, shape_bias) = ops.state;
let lhs = checkpointer.retrieve_node_output(lhs_state);
let rhs = checkpointer.retrieve_node_output(rhs_state);

// Fetch shapes of the tensors to support broadcasting.
let shape_lhs = B::shape(&lhs);
let shape_rhs = B::shape(&rhs);
// Fetch shapes of our tensor to support broadcasting.
let shape_lhs = B::float_shape(&lhs);
let shape_rhs = B::float_shape(&rhs);

// Compute the gradient of the output using the already existing `relu_backward`
// function in the basic Burn backend trait.
@@ -341,13 +344,13 @@ impl<B: Backend> Backend for Autodiff<B> {
// Compute the lhs gradient, which is the derivative of matmul with support for
// broadcasting.
let grad_lhs = broadcast_shape::<B, D>(
B::matmul(grad_output.clone(), B::transpose(rhs)),
B::float_matmul(grad_output.clone(), B::float_transpose(rhs)),
&shape_lhs,
);
// Compute the rhs gradient, which is the derivative of matmul with support for
// broadcasting.
let grad_rhs = broadcast_shape::<B, D>(
B::matmul(B::transpose(lhs), grad_output.clone()),
B::float_matmul(B::float_transpose(lhs), grad_output.clone()),
&shape_rhs,
);
// The add derivative is only 1, so we just need to support broadcasting to
@@ -372,23 +375,35 @@ impl<B: Backend> Backend for Autodiff<B> {
//
// Each node can be fetched with `ops.parents` in the same order as defined here.
match FusedMatmulAddReluBackward
.prepare(
[lhs.node, rhs.node, bias.node],
[lhs.graph, rhs.graph, bias.graph],
.prepare::<C>(
[lhs.node.clone(), rhs.node.clone(), bias.node.clone()],
[lhs.graph.clone(), rhs.graph.clone(), bias.graph.clone()],
)
// Marks the operation as compute bound, meaning it will save its
// state instead of recomputing itself during checkpointing
.compute_bound()
.stateful()
{
OpsKind::Tracked(prep) => {
OpsKind::Tracked(mut prep) => {
// When at least one node is tracked, we should register our backward step.
// We compute the output and the state before finishing the preparation.
let bias_shape = B::shape(&bias.primitive);

// The state consists of what will be needed for this operation's backward pass.
// Since we need the parents' outputs, we must checkpoint their ids to retrieve their node
// output at the beginning of the backward pass. We can also save utility data such as the bias shape.
// If we also need this operation's output, we can either save it in the state or recompute it
// during the backward pass. Here we choose to save it in the state because it's a compute bound operation.
let lhs_state = prep.checkpoint(&lhs);
let rhs_state = prep.checkpoint(&rhs);
let bias_shape = B::float_shape(&bias.primitive);

let output = B::fused_matmul_add_relu(
lhs.primitive.clone(),
rhs.primitive.clone(),
bias.primitive,
);

let state = (lhs.primitive, rhs.primitive, output.clone(), bias_shape);
let state = (lhs_state, rhs_state, output.clone(), bias_shape);

prep.finish(state, output)
}
OpsKind::UnTracked(prep) => {
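As a sanity check on the pattern this chapter documents, the sketch below runs a gradient check through a stand-in implementation. The stand-in simply decomposes matmul + add + relu with stock tensor ops so the snippet compiles on its own; in the actual guide, the call would go through the custom backend trait and the fused WGPU kernel. The `NdArray` backend, the shapes, and the function name are assumptions made for this example only.

```rust
use burn::backend::{Autodiff, NdArray};
use burn::tensor::{activation, backend::Backend, Distribution, Tensor};

/// Stand-in for the guide's fused `matmul + add + relu` operation.
fn matmul_add_relu_reference<B: Backend>(
    lhs: Tensor<B, 3>,
    rhs: Tensor<B, 3>,
    bias: Tensor<B, 3>,
) -> Tensor<B, 3> {
    activation::relu(lhs.matmul(rhs) + bias)
}

fn main() {
    type B = Autodiff<NdArray>;
    let device = Default::default();

    let lhs: Tensor<B, 3> =
        Tensor::random([2, 8, 16], Distribution::Default, &device).require_grad();
    let rhs: Tensor<B, 3> =
        Tensor::random([2, 16, 4], Distribution::Default, &device).require_grad();
    let bias: Tensor<B, 3> =
        Tensor::random([2, 8, 4], Distribution::Default, &device).require_grad();

    // Backward from a scalar loss; each tracked input receives its own gradient,
    // whether the backend runs the fused kernel or this decomposed fallback.
    let grads = matmul_add_relu_reference(lhs.clone(), rhs.clone(), bias.clone())
        .sum()
        .backward();

    assert!(lhs.grad(&grads).is_some());
    assert!(rhs.grad(&grads).is_some());
    assert!(bias.grad(&grads).is_some());
}
```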
2 changes: 0 additions & 2 deletions crates/burn-autodiff/Cargo.toml
@@ -26,5 +26,3 @@ spin = { workspace = true }
burn-tensor = { path = "../burn-tensor", version = "0.13.0", default-features = false, features = [
"export_tests",
] }


14 changes: 10 additions & 4 deletions crates/burn-autodiff/src/backend.rs
@@ -1,4 +1,9 @@
use crate::{grads::Gradients, graph::backward::backward, tensor::AutodiffTensor};
use crate::{
checkpoint::strategy::{CheckpointStrategy, NoCheckpointing},
grads::Gradients,
graph::backward::backward,
tensor::AutodiffTensor,
};
use burn_tensor::backend::{AutodiffBackend, Backend};
use core::marker::PhantomData;

@@ -7,11 +12,12 @@ use core::marker::PhantomData;
/// This works as a backend decorator, extending the functionality of any backend with
/// backpropagation.
#[derive(Clone, Copy, Debug, Default)]
pub struct Autodiff<B> {
pub struct Autodiff<B, C = NoCheckpointing> {
_b: PhantomData<B>,
_checkpoint_strategy: PhantomData<C>,
}

impl<B: Backend> Backend for Autodiff<B> {
impl<B: Backend, C: CheckpointStrategy> Backend for Autodiff<B, C> {
type Device = B::Device;

type FullPrecisionElem = B::FullPrecisionElem;
@@ -42,7 +48,7 @@ impl<B: Backend> Backend for Autodiff<B> {
}
}

impl<B: Backend> AutodiffBackend for Autodiff<B> {
impl<B: Backend, C: CheckpointStrategy> AutodiffBackend for Autodiff<B, C> {
type InnerBackend = B;
type Gradients = Gradients;

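The defaulted `C = NoCheckpointing` parameter is what keeps this change backward compatible: existing `Autodiff<B>` aliases compile unchanged, while a second type argument opts into a different checkpoint strategy. A minimal sketch of that usage follows; the `NdArray` base backend is illustrative, and `BalancedCheckpointing` (and its re-export path) is an assumption about what the strategy module exposes, not something shown in this diff.

```rust
use burn::backend::{Autodiff, NdArray};

// Existing code keeps compiling: the strategy defaults to `NoCheckpointing`,
// which stores the state needed by the backward pass instead of recomputing it.
type Eager = Autodiff<NdArray>;

// Opting into checkpointing would only be a second type argument, letting
// memory-bound operations recompute their outputs during the backward pass
// instead of keeping them alive. The strategy name below is assumed:
// type Checkpointed = Autodiff<NdArray, BalancedCheckpointing>;

fn main() {
    // Nothing to run here; the point is that the alias names a valid backend.
    let _device: <Eager as burn::tensor::backend::Backend>::Device = Default::default();
}
```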