diff --git a/examples/01-tensor.rs b/examples/01-tensor.rs
new file mode 100644
index 000000000..73e0944bd
--- /dev/null
+++ b/examples/01-tensor.rs
@@ -0,0 +1,29 @@
+//! Intro to dfdx::tensor
+
+use rand::thread_rng;
+
+use dfdx::tensor::{tensor, HasArrayData, Tensor1D, Tensor2D, Tensor3D, TensorCreator};
+
+fn main() {
+    // easily create tensors using the `tensor` function
+    let _: Tensor1D<5> = tensor([1.0, 2.0, 3.0, 4.0, 5.0]);
+
+    // you can also use [TensorCreator::new]
+    let _: Tensor1D<5> = TensorCreator::new([1.0, 2.0, 3.0, 4.0, 5.0]);
+
+    // [TensorCreator] has other helpful methods such as all zeros and all ones
+    let _: Tensor2D<2, 3> = TensorCreator::zeros();
+    let _: Tensor2D<2, 3> = TensorCreator::ones();
+
+    // we can also create random tensors
+    let mut rng = thread_rng();
+    let a: Tensor3D<2, 3, 4> = TensorCreator::randn(&mut rng);
+
+    // use `.data()` to access the underlying array
+    let a_data: &[[[f32; 4]; 3]; 2] = a.data();
+    println!("a={:?}", a_data);
+
+    // you can clone() a tensor (or duplicate()):
+    let a_copy = a.clone();
+    assert_eq!(a_copy.data(), a.data());
+}
diff --git a/examples/02-ops.rs b/examples/02-ops.rs
new file mode 100644
index 000000000..0f0e4b6df
--- /dev/null
+++ b/examples/02-ops.rs
@@ -0,0 +1,32 @@
+//! Intro to dfdx::tensor_ops
+
+use rand::prelude::*;
+
+use dfdx::tensor::{HasArrayData, Tensor0D, Tensor2D, TensorCreator};
+use dfdx::tensor_ops::add;
+
+fn main() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    let a: Tensor2D<2, 3> = TensorCreator::randn(&mut rng);
+    dbg!(a.data());
+
+    let b: Tensor2D<2, 3> = TensorCreator::randn(&mut rng);
+    dbg!(b.data());
+
+    // we can do binary operations like add two tensors together
+    let c = add(a, &b);
+    dbg!(c.data());
+
+    // or unary operations like apply the `relu` function to each element
+    let d = c.relu();
+    dbg!(d.data());
+
+    // we can add/sub/mul/div scalar values to tensors
+    let e = d + 0.5;
+    dbg!(e.data());
+
+    // or reduce tensors to smaller sizes
+    let f: Tensor0D = e.mean();
+    dbg!(f.data());
+}
diff --git a/examples/03-nn.rs b/examples/03-nn.rs
new file mode 100644
index 000000000..286ab6f28
--- /dev/null
+++ b/examples/03-nn.rs
@@ -0,0 +1,29 @@
+//! Intro to dfdx::nn
+
+use rand::prelude::*;
+
+use dfdx::nn::{Linear, Module, ReLU, ResetParams};
+use dfdx::tensor::{Tensor1D, Tensor2D, TensorCreator};
+
+fn main() {
+    // nn exposes many different neural network types, like the Linear layer!
+    let mut m: Linear<4, 2> = Default::default();
+
+    // at first they are initialized to zeros, but you can randomize them too
+    let mut rng = StdRng::seed_from_u64(0);
+    m.reset_params(&mut rng);
+
+    // they act on tensors using the forward method
+    let x: Tensor1D<4> = TensorCreator::zeros();
+    let _: Tensor1D<2> = m.forward(x);
+
+    // most of them can also act on many different shapes of tensors
+    let x: Tensor2D<10, 4> = TensorCreator::zeros();
+    let _: Tensor2D<10, 2> = m.forward(x);
+
+    // you can also combine multiple modules with tuples
+    let mlp: (Linear<4, 2>, ReLU, Linear<2, 1>) = Default::default();
+
+    let x: Tensor1D<4> = TensorCreator::zeros();
+    let _: Tensor1D<1> = mlp.forward(x);
+}
diff --git a/examples/04-gradients.rs b/examples/04-gradients.rs
new file mode 100644
index 000000000..2737feffe
--- /dev/null
+++ b/examples/04-gradients.rs
@@ -0,0 +1,35 @@
+//! Intro to dfdx::gradients and tapes
+
+use rand::prelude::*;
+
+use dfdx::gradients::{Gradients, NoneTape, OwnedTape};
+use dfdx::tensor::{Tensor0D, Tensor2D, TensorCreator};
+use dfdx::tensor_ops::matmul;
+
+fn main() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    // tensors are first created with no tapes on them - the NoneTape!
+    let weight: Tensor2D<4, 2, NoneTape> = TensorCreator::randn(&mut rng);
+    let a: Tensor2D<3, 4, NoneTape> = TensorCreator::randn(&mut rng);
+
+    // the first step to tracing is to call .trace()
+    // this sticks a gradient tape into the input tensor!
+    let b: Tensor2D<3, 4, OwnedTape> = a.trace();
+
+    // the tape will automatically move around as you perform ops
+    let c: Tensor2D<3, 2, OwnedTape> = matmul(b, &weight);
+    let d: Tensor2D<3, 2, OwnedTape> = c.sin();
+    let e: Tensor0D<OwnedTape> = d.mean();
+
+    // finally you can use .backward() to extract the gradients!
+    let gradients: Gradients = e.backward();
+
+    // now you can extract gradients for specific tensors
+    // by querying with them
+    let weight_grad: &[[f32; 2]; 4] = gradients.ref_gradient(&weight);
+    dbg!(weight_grad);
+
+    let a_grad: &[[f32; 4]; 3] = gradients.ref_gradient(&a);
+    dbg!(a_grad);
+}
diff --git a/examples/05-optim.rs b/examples/05-optim.rs
new file mode 100644
index 000000000..625a578d5
--- /dev/null
+++ b/examples/05-optim.rs
@@ -0,0 +1,60 @@
+//! Intro to dfdx::optim
+
+use rand::prelude::*;
+
+use dfdx::gradients::{Gradients, OwnedTape};
+use dfdx::losses::mse_loss;
+use dfdx::nn::{Linear, Module, ReLU, ResetParams, Tanh};
+use dfdx::optim::{Momentum, Optimizer, Sgd, SgdConfig};
+use dfdx::tensor::{HasArrayData, Tensor2D, TensorCreator};
+
+// first let's declare our neural network to optimize
+type Mlp = (
+    (Linear<5, 32>, ReLU),
+    (Linear<32, 32>, ReLU),
+    (Linear<32, 2>, Tanh),
+);
+
+fn main() {
+    let mut rng = StdRng::seed_from_u64(0);
+
+    // The first step to optimizing is to initialize the optimizer.
+    // Here we construct a stochastic gradient descent optimizer
+    // for our Mlp.
+    let mut sgd: Sgd = Sgd::new(SgdConfig {
+        lr: 1e-1,
+        momentum: Some(Momentum::Nesterov(0.9)),
+    });
+
+    // let's initialize our model and some dummy data
+    let mut mlp: Mlp = Default::default();
+    mlp.reset_params(&mut rng);
+    let x: Tensor2D<3, 5> = TensorCreator::randn(&mut rng);
+    let y: Tensor2D<3, 2> = TensorCreator::randn(&mut rng);
+
+    // first we pass our gradient tracing input through the network
+    let prediction: Tensor2D<3, 2, OwnedTape> = mlp.forward(x.trace());
+
+    // next compute the loss against the target dummy data
+    let loss = mse_loss(prediction, &y);
+    dbg!(loss.data());
+
+    // extract the gradients
+    let gradients: Gradients = loss.backward();
+
+    // the final step is to use our optimizer to update our model
+    // given the gradients we've calculated.
+    // This will modify our model!
+    sgd.update(&mut mlp, gradients)
+        .expect("Oops, there were some unused params");
+
+    // let's do this a couple times to make sure the loss decreases!
+    for i in 0..5 {
+        let prediction = mlp.forward(x.trace());
+        let loss = mse_loss(prediction, &y);
+        println!("Loss after update {i}: {:?}", loss.data());
+        let gradients: Gradients = loss.backward();
+        sgd.update(&mut mlp, gradients)
+            .expect("Oops, there were some unused params");
+    }
+}
diff --git a/examples/mnist_classifier.rs b/examples/06-mnist.rs
similarity index 89%
rename from examples/mnist_classifier.rs
rename to examples/06-mnist.rs
index f26f94158..c5a3527d4 100644
--- a/examples/mnist_classifier.rs
+++ b/examples/06-mnist.rs
@@ -1,3 +1,7 @@
+//! This example ties all the previous ones together
+//! to build a neural network that learns to recognize
+//! the MNIST digits.
+
 use dfdx::prelude::*;
 use indicatif::ProgressBar;
 use mnist::*;
@@ -39,6 +43,7 @@ impl MnistDataset {
     }
 }
 
+// our network structure
 type Mlp = (
     (Linear<784, 512>, ReLU),
     (Linear<512, 128>, ReLU),
@@ -46,9 +51,13 @@ type Mlp = (
     Linear<32, 10>,
 );
 
+// training batch size
 const BATCH_SIZE: usize = 32;
 
 fn main() {
+    // ftz substantially improves performance
+    dfdx::flush_denormals_to_zero();
+
     let mnist_path = std::env::args()
         .nth(1)
         .unwrap_or_else(|| "./datasets/MNIST/raw".to_string());
@@ -58,10 +67,12 @@
 
     let mut rng = StdRng::seed_from_u64(0);
 
+    // initialize model and optimizer
    let mut model: Mlp = Default::default();
    model.reset_params(&mut rng);
    let mut opt: Adam = Default::default();
 
+    // initialize dataset
    let dataset = MnistDataset::train(&mnist_path);
    println!("Found {:?} training images", dataset.len());
 
@@ -94,6 +105,7 @@
         );
     }
 
+    // save our model to a .npz file
     model
         .save("mnist-classifier.npz")
         .expect("failed to save model");
diff --git a/examples/custom.rs b/examples/07-custom-module.rs
similarity index 59%
rename from examples/custom.rs
rename to examples/07-custom-module.rs
index df8a6eb8d..aafead888 100644
--- a/examples/custom.rs
+++ b/examples/07-custom-module.rs
@@ -1,5 +1,10 @@
-use dfdx::prelude::*;
-use rand::prelude::{SeedableRng, StdRng};
+//! Demonstrates how to build a custom [nn::Module] without using tuples
+
+use rand::prelude::*;
+
+use dfdx::gradients::{CanUpdateWithGradients, GradientProvider, OwnedTape, Tape, UnusedTensors};
+use dfdx::nn::{Linear, Module, ReLU, ResetParams};
+use dfdx::tensor::{Tensor1D, Tensor2D, TensorCreator};
 
 /// Custom model struct
 /// This case is trivial and should be done with a tuple of linears and relus,
@@ -11,6 +16,7 @@ struct Mlp {
     relu: ReLU,
 }
 
+// ResetParams lets you randomize a model's parameters
 impl<const IN: usize, const INNER: usize, const OUT: usize> ResetParams for Mlp<IN, INNER, OUT> {
     fn reset_params<R: Rng>(&mut self, rng: &mut R) {
         self.l1.reset_params(rng);
@@ -19,6 +25,7 @@ impl ResetParams for Mlp<
     }
 }
 
+// CanUpdateWithGradients lets you update a model's parameters using gradients
 impl<const IN: usize, const INNER: usize, const OUT: usize> CanUpdateWithGradients
     for Mlp<IN, INNER, OUT>
 {
@@ -29,25 +36,29 @@ impl CanUpdateWithGradien
     }
 }
 
-// Impl module for single forward pass
+// impl Module for single item
 impl Module>
     for Mlp
 {
     type Output = Tensor1D;
 
-    fn forward(&self, input: Tensor1D) -> Self::Output {
-        self.l2.forward(self.relu.forward(self.l1.forward(input)))
+    fn forward(&self, x: Tensor1D) -> Self::Output {
+        let x = self.l1.forward(x);
+        let x = self.relu.forward(x);
+        self.l2.forward(x)
     }
 }
 
-// Impl module for batch forward pass
-impl
-    Module> for Mlp
+// impl Module for batch of items
+impl
+    Module> for Mlp
 {
-    type Output = Tensor2D;
+    type Output = Tensor2D;
 
-    fn forward(&self, input: Tensor2D) -> Self::Output {
-        self.l2.forward(self.relu.forward(self.l1.forward(input)))
+    fn forward(&self, x: Tensor2D) -> Self::Output {
+        let x = self.l1.forward(x);
+        let x = self.relu.forward(x);
+        self.l2.forward(x)
     }
 }
 
@@ -63,9 +74,9 @@ fn main() {
 
     // Forward pass with a single sample
     let sample: Tensor1D<10> = Tensor1D::randn(&mut rng);
-    let _y = model.forward(sample);
+    let _: Tensor1D<10> = model.forward(sample);
 
     // Forward pass with a batch of samples
     let batch: Tensor2D = Tensor2D::randn(&mut rng);
-    let _y = model.forward(batch);
+    let _: Tensor2D = model.forward(batch.trace());
 }
diff --git a/examples/08-tensor-broadcast-reduce.rs b/examples/08-tensor-broadcast-reduce.rs
new file mode 100644
index 000000000..02f3505fe
--- /dev/null
+++ b/examples/08-tensor-broadcast-reduce.rs
@@ -0,0 +1,38 @@
+//! Demonstrates broadcasting tensors to different sizes, and axis reductions
+//! with BroadcastTo and ReduceTo
+
+use dfdx::arrays::Axis;
+use dfdx::tensor::{tensor, HasArrayData, Tensor1D, Tensor2D, Tensor4D};
+use dfdx::tensor_ops::BroadcastTo;
+
+fn main() {
+    let a: Tensor1D<3> = tensor([1.0, 2.0, 3.0]);
+
+    // to broadcast, use `BroadcastTo::broadcast()` and specify
+    // the output type. the axes that are broadcast are inferred for you!
+    let b: Tensor2D<5, 3> = a.broadcast();
+    assert_eq!(b.data(), &[[1.0, 2.0, 3.0]; 5]);
+
+    // we can really broadcast any axes on either side
+    // here a (5,3) tensor is broadcast to (7,5,3,2).
+    // so 7 is added in front, and 2 is added last
+    let c: Tensor4D<7, 5, 3, 2> = b.broadcast();
+    assert_eq!(c.data(), &[[[[1.0; 2], [2.0; 2], [3.0; 2]]; 5]; 7]);
+
+    // the opposite of broadcast is reducing
+    // we've already introduced one reduction which is mean
+    let d: Tensor2D<5, 3> = c.mean();
+    assert_eq!(d.data(), &[[1.0, 2.0, 3.0]; 5]);
+
+    // generally you can just specify the output type
+    // and the reduction & broadcast will work.
+    // sometimes it's ambiguous though
+    let e: Tensor1D<1> = tensor([1.0]);
+
+    // here rust doesn't know if the new axis is the first or second
+    // so we have to explicitly tell it
+    let f: Tensor2D<1, 1> = BroadcastTo::<_, Axis<1>>::broadcast(e);
+
+    // reductions have the same problem when it's ambiguous
+    let _: Tensor1D<1> = f.mean::<_, Axis<0>>();
+}
diff --git a/examples/09-tensor-permute.rs b/examples/09-tensor-permute.rs
new file mode 100644
index 000000000..07a5c646d
--- /dev/null
+++ b/examples/09-tensor-permute.rs
@@ -0,0 +1,23 @@
+//! Demonstrates how to re-order (permute/transpose) the axes of a tensor
+
+use dfdx::arrays::Axes3;
+use dfdx::tensor::{Tensor3D, TensorCreator};
+use dfdx::tensor_ops::PermuteTo;
+
+fn main() {
+    let a: Tensor3D<3, 5, 7> = TensorCreator::zeros();
+
+    // permuting is as easy as just expressing the desired type
+    let b: Tensor3D<7, 5, 3> = a.permute();
+
+    // we can do any of the expected combinations!
+    let _: Tensor3D<5, 7, 3> = b.permute();
+
+    // just like broadcast/reduce there are times when
+    // inference is impossible because of ambiguities
+    let c: Tensor3D<1, 1, 1> = TensorCreator::zeros();
+
+    // when axes have the same sizes you'll have to indicate
+    // the axes explicitly to get around this
+    let _: Tensor3D<1, 1, 1> = PermuteTo::<_, Axes3<1, 0, 2>>::permute(c);
+}
diff --git a/examples/10-tensor-index.rs b/examples/10-tensor-index.rs
new file mode 100644
index 000000000..527e7ea97
--- /dev/null
+++ b/examples/10-tensor-index.rs
@@ -0,0 +1,29 @@
+//! Demonstrates how to select sub tensors (index) from tensors
+
+use dfdx::tensor::{tensor, HasArrayData, Tensor2D, Tensor3D};
+use dfdx::tensor_ops::Select1;
+
+fn main() {
+    let a: Tensor3D<3, 2, 3> = tensor([
+        [[0.00, 0.01, 0.02], [0.10, 0.11, 0.12]],
+        [[1.00, 1.01, 1.02], [1.10, 1.11, 1.12]],
+        [[2.00, 2.01, 2.02], [2.10, 2.11, 2.12]],
+    ]);
+
+    // the easiest thing to do is to select a single element from axis 0
+    let b: Tensor2D<2, 3> = a.clone().select(&0);
+    assert_eq!(b.data(), &a.data()[0]);
+
+    // but we can also select multiple elements from axis 0!
+    let _: Tensor3D<6, 2, 3> = a.clone().select(&[0, 0, 1, 1, 2, 2]);
+
+    // a 1d array of indices in this case can also mean
+    // select from the second axis. this is determined by two things:
+    // 1. we have 3 usize's in our indices, and 3 is the size of the first dimension
+    // 2. the output type has lost the middle axis, which means the usizes are reducing that axis
+    let _: Tensor2D<3, 3> = a.clone().select(&[0, 1, 0]);
+
+    // of course we can also select multiple values from the first axis.
+    // in this case we just specify multiple indices instead of a single one
+    let _: Tensor3D<3, 4, 3> = a.select(&[[0, 0, 0, 0], [0, 1, 0, 1], [1, 0, 1, 0]]);
+}
diff --git a/examples/conv_net.rs b/examples/11-conv-net.rs
similarity index 89%
rename from examples/conv_net.rs
rename to examples/11-conv-net.rs
index 19f2eedd3..650df799b 100644
--- a/examples/conv_net.rs
+++ b/examples/11-conv-net.rs
@@ -1,3 +1,6 @@
+//! Demonstrates how to build a neural network with convolution
+//! layers on nightly rust.
+
 #![cfg_attr(feature = "nightly", feature(generic_const_exprs))]
 
 #[cfg(not(feature = "nightly"))]
diff --git a/examples/12-multi-headed.rs b/examples/12-multi-headed.rs
new file mode 100644
index 000000000..a799c45df
--- /dev/null
+++ b/examples/12-multi-headed.rs
@@ -0,0 +1,16 @@
+//! Demonstrates how to build a neural network that has multiple
+//! outputs using `SplitInto`.
+
+use dfdx::nn::{Linear, Module, SplitInto};
+use dfdx::tensor::{tensor, Tensor1D};
+
+fn main() {
+    // SplitInto accepts a tuple of modules. Each one of the items in the
+    // tuple must accept the same type of input.
+    // Note that here, both of the linears have the same size input (1)
+    let m: SplitInto<(Linear<1, 3>, Linear<1, 5>)> = Default::default();
+
+    // when we forward data through, we get a tuple back!
+    let x = tensor([1.0]);
+    let _: (Tensor1D<3>, Tensor1D<5>) = m.forward(x);
+}
diff --git a/examples/classification.rs b/examples/classification.rs
deleted file mode 100644
index 5a6d42c6d..000000000
--- a/examples/classification.rs
+++ /dev/null
@@ -1,52 +0,0 @@
-use dfdx::prelude::*;
-use rand::{rngs::StdRng, SeedableRng};
-use std::time::Instant;
-
-type Mlp = (
-    (Linear<10, 32>, ReLU),
-    (Linear<32, 32>, ReLU),
-    Linear<32, 2>,
-);
-
-fn main() {
-    let mut rng = StdRng::seed_from_u64(0);
-
-    // initialize target data
-    let x: Tensor2D<64, 10> = Tensor2D::randn(&mut rng);
-    let y: Tensor2D<64, 2> = Tensor2D::randn(&mut rng).softmax::<Axis<1>>();
-
-    // initialize model - all weights are 0s
-    let mut mlp: Mlp = Default::default();
-
-    // randomize model weights
-    mlp.reset_params(&mut rng);
-
-    // initialize our optimizer
-    let mut sgd = Sgd::new(SgdConfig {
-        lr: 1e-1,
-        momentum: Some(Momentum::Nesterov(0.9)),
-    });
-
-    // run through training data
-    for _i_epoch in 0..15 {
-        let start = Instant::now();
-
-        // forward through model, computing gradients
-        let pred = mlp.forward(x.trace());
-
-        // compute loss
-        let loss = cross_entropy_with_logits_loss(pred, &y);
-        let loss_v /*: f32 */ = *loss.data();
-
-        // run backprop
-        let gradients = loss.backward();
-
-        // update weights with optimizer
-        sgd.update(&mut mlp, gradients).expect("Unused params");
-
-        println!("cross entropy={:#.3} in {:?}", loss_v, start.elapsed());
-    }
-
-    mlp.save("classification.npz")
-        .expect("failed to save model");
-}
diff --git a/examples/multi_head.rs b/examples/multi_head.rs
deleted file mode 100644
index 0cb90daa2..000000000
--- a/examples/multi_head.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-use dfdx::prelude::*;
-use rand::{rngs::StdRng, SeedableRng};
-use std::time::Instant;
-
-type MultiHeadedMLP = (
-    (Linear<10, 32>, ReLU),
-    (Linear<32, 32>, ReLU),
-    SplitInto<((Linear<32, 2>, Tanh), (Linear<32, 1>, Tanh))>,
-);
-
-fn main() {
-    let mut rng = StdRng::seed_from_u64(0);
-
-    // initialize target data
-    let x: Tensor2D<64, 10> = Tensor2D::randn(&mut rng);
-    let y1: Tensor2D<64, 2> = Tensor2D::randn(&mut rng);
-    let y2: Tensor2D<64, 1> = Tensor2D::randn(&mut rng);
-
-    // initialize optimizer & model
-    let mut mlp: MultiHeadedMLP = Default::default();
-    mlp.reset_params(&mut rng);
-    let mut sgd: Sgd = Default::default();
-
-    // run through training data
-    for _i_epoch in 0..15 {
-        let start = Instant::now();
-
-        let x = x.trace();
-        let (pred1, pred2) = mlp.forward(x);
-
-        // NOTE: we also have to move the tape around when computing losses
-        let (loss2, tape) = mse_loss(pred2, &y2).split_tape();
-        let loss1 = mse_loss(pred1.put_tape(tape), &y1);
-
-        let losses = [*loss1.data(), *loss2.data()];
-        let loss = loss1 + &loss2;
-        let gradients = loss.backward();
-        sgd.update(&mut mlp, gradients).expect("Unused params");
-
-        println!("losses={:.3?} in {:?}", losses, start.elapsed());
-    }
-}
diff --git a/examples/npy_serialize.rs b/examples/numpy-save-load.rs
similarity index 91%
rename from examples/npy_serialize.rs
rename to examples/numpy-save-load.rs
index 409cfbfb9..3db82f6cd 100644
--- a/examples/npy_serialize.rs
+++ b/examples/numpy-save-load.rs
@@ -1,3 +1,5 @@
+//! Demonstrates how to use dfdx::numpy to save and load arrays
+
 use dfdx::numpy as np;
 
 fn main() {
diff --git a/examples/regression.rs b/examples/regression.rs
deleted file mode 100644
index b9394e09a..000000000
--- a/examples/regression.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-use dfdx::prelude::*;
-use rand::{rngs::StdRng, SeedableRng};
-use std::time::Instant;
-
-// our simple 2 layer feedforward network with ReLU activations
-type Mlp = (
-    (Linear<10, 32>, ReLU),
-    (Linear<32, 32>, ReLU),
-    (Linear<32, 2>, Tanh),
-);
-
-fn main() {
-    let mut rng = StdRng::seed_from_u64(0);
-
-    // initialize target data
-    let x: Tensor2D<64, 10> = Tensor2D::randn(&mut rng);
-    let y: Tensor2D<64, 2> = Tensor2D::randn(&mut rng);
-
-    // initiliaze model - all weights are 0s
-    let mut mlp: Mlp = Default::default();
-
-    // randomize model weights
-    mlp.reset_params(&mut rng);
-
-    let mut sgd = Sgd::new(SgdConfig {
-        lr: 1e-1,
-        momentum: Some(Momentum::Nesterov(0.9)),
-    });
-
-    // run through training data
-    for _i_epoch in 0..15 {
-        let start = Instant::now();
-
-        // forward through model, computing gradients
-        let pred = mlp.forward(x.trace());
-
-        // compute loss
-        let loss = mse_loss(pred, &y);
-        let loss_v /*: f32 */ = *loss.data();
-
-        // run backprop
-        let gradients = loss.backward();
-
-        // update weights with optimizer
-        sgd.update(&mut mlp, gradients).expect("Unused params");
-
-        println!("mse={:#.3} in {:?}", loss_v, start.elapsed());
-    }
-
-    mlp.save("regression.npz").expect("failed to save mlp");
-}
diff --git a/examples/dqn.rs b/examples/rl-dqn.rs
similarity index 100%
rename from examples/dqn.rs
rename to examples/rl-dqn.rs
diff --git a/examples/ppo.rs b/examples/rl-ppo.rs
similarity index 100%
rename from examples/ppo.rs
rename to examples/rl-ppo.rs
diff --git a/examples/tensors.rs b/examples/tensors.rs
deleted file mode 100644
index a5eb62297..000000000
--- a/examples/tensors.rs
+++ /dev/null
@@ -1,23 +0,0 @@
-#![allow(clippy::needless_range_loop)]
-use dfdx::prelude::*;
-
-fn main() {
-    let a: Tensor2D<2, 3> = TensorCreator::zeros();
-
-    // since add() expects tensors with the same size, we dont need a type for this
-    let b = TensorCreator::ones();
-    let c = add(a, &b);
-
-    // tensors just store raw rust arrays, use `.data()` to access this.
-    assert_eq!(c.data(), &[[1.0; 3]; 2]);
-
-    // since we pass in an array, rust will figure out that we mean Tensor1D<5> since its an [f32; 5]
-    let mut d = Tensor1D::new([1.0, 2.0, 3.0, 4.0, 5.0]);
-
-    // use `.mut_data()` to access underlying mutable array. type is provided for readability
-    let raw_data: &mut [f32; 5] = d.mut_data();
-    for i in 0..5 {
-        raw_data[i] *= 2.0;
-    }
-    assert_eq!(d.data(), &[2.0, 4.0, 6.0, 8.0, 10.0]);
-}