diff --git a/.gitignore b/.gitignore
index 026b26f36..7ac4b427d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 # will have compiled files and executables
 target
 
+Cargo.lock
+
 # These are backup files generated by rustfmt
 **/*.rs.bk
diff --git a/Cargo.toml b/Cargo.toml
index 4f5255f13..94da1a41d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 members = [
 		"greenglas", "coaster", "coaster-nn", "coaster-blas", "juice", "rust-blas",
 		"rcudnn/cudnn", "rcudnn/cudnn-sys", "rcublas/cublas", "rcublas/cublas-sys",
-		"juice-examples/juice-utils", "juice-examples/mackey-glass-rnn-regression",
+#		"juice-examples/juice-utils", "juice-examples/mackey-glass-rnn-regression",
 		"juice-examples/mnist-image-multiclass-classification"]
 
 exclude = [ "./rcudnn", "./rcublas", "./juice-examples"]
diff --git a/juice-examples/mnist-image-multiclass-classification/src/main.rs b/juice-examples/mnist-image-multiclass-classification/src/main.rs
index d306d318c..68feb2108 100644
--- a/juice-examples/mnist-image-multiclass-classification/src/main.rs
+++ b/juice-examples/mnist-image-multiclass-classification/src/main.rs
@@ -10,8 +10,7 @@ use co::frameworks::cuda::get_cuda_backend;
 #[cfg(not(feature = "cuda"))]
 use co::frameworks::native::get_native_backend;
 use co::prelude::*;
-use juice::layer::*;
-use juice::layers::*;
+use juice::net::*;
 use juice::solver::*;
 use juice::util::*;
 use juice_utils::{download_datasets, unzip_datasets};
@@ -131,7 +130,7 @@ fn main() {
         );
     }
 }
-
+/*
 #[cfg(all(feature = "cuda"))]
 fn add_conv_net(
     mut net_cfg: SequentialConfig,
@@ -164,7 +163,7 @@ fn add_conv_net(
         "linear1",
         LinearConfig { output_size: 500 },
     ));
-    net_cfg.add_layer(LayerConfig::new("sigmoid", LayerType::Sigmoid));
+    net_cfg.add_layer(LayerConfig::new("sigmoid", LayerConfig::Sigmoid));
     net_cfg.add_layer(LayerConfig::new(
         "linear2",
         LinearConfig { output_size: 10 },
@@ -192,26 +191,25 @@ fn add_mlp(
 ) -> SequentialConfig {
     net_cfg.add_layer(LayerConfig::new(
         "reshape",
-        LayerType::Reshape(ReshapeConfig::of_shape(&[batch_size, pixel_count])),
+        LayerConfig::Reshape(ReshapeConfig::of_shape(&[batch_size, pixel_count])),
     ));
     net_cfg.add_layer(LayerConfig::new(
         "linear1",
-        LayerType::Linear(LinearConfig { output_size: 1568 }),
+        LayerConfig::Linear(LinearConfig { output_size: 1568 }),
     ));
-    net_cfg.add_layer(LayerConfig::new("sigmoid", LayerType::Sigmoid));
+    net_cfg.add_layer(LayerConfig::new("sigmoid", LayerConfig::Sigmoid));
     net_cfg.add_layer(LayerConfig::new(
         "linear2",
-        LayerType::Linear(LinearConfig { output_size: 10 }),
+        LayerConfig::Linear(LinearConfig { output_size: 10 }),
     ));
     net_cfg
 }
-
-fn add_linear_net(mut net_cfg: SequentialConfig) -> SequentialConfig {
-    net_cfg.add_layer(LayerConfig::new(
+*/
+fn add_linear_net(net_cfg: SequentialConfig) -> SequentialConfig{
+    net_cfg.with_layer(
         "linear",
-        LayerType::Linear(LinearConfig { output_size: 10 }),
-    ));
-    net_cfg
+        LayerConfig::Linear(LinearConfig { output_size: 10 }),
+    )
 }
 
 fn run_mnist(
@@ -250,25 +248,19 @@ fn run_mnist(
     let momentum = momentum.unwrap_or(0f32);
 
     let mut net_cfg = SequentialConfig::default();
-    net_cfg.add_input("data", &[batch_size, pixel_dim, pixel_dim]);
-    net_cfg.force_backward = true;
 
-    net_cfg = match &*model_name.unwrap_or("none".to_owned()) {
-        "conv" => add_conv_net(net_cfg, batch_size, pixel_dim),
-        "mlp" => add_mlp(net_cfg, batch_size, pixel_count),
-        "linear" => add_linear_net(net_cfg),
+    match &*model_name.unwrap_or("none".to_owned()) {
+        // "conv" => add_conv_net(net_cfg, batch_size, pixel_dim),
+        // "mlp" => add_mlp(net_cfg, batch_size, pixel_count),
+        "linear" => net_cfg = add_linear_net(net_cfg),
         _ => panic!("Unknown model. Try one of [linear, mlp, conv]"),
     };
 
-    net_cfg.add_layer(LayerConfig::new("log_softmax", LayerType::LogSoftmax));
+    net_cfg = net_cfg.with_layer("log_softmax", LayerConfig::LogSoftmax);
 
-    let mut classifier_cfg = SequentialConfig::default();
-    classifier_cfg.add_input("network_out", &[batch_size, 10]);
-    classifier_cfg.add_input("label", &[batch_size, 1]);
     // set up nll loss
-    let nll_layer_cfg = NegativeLogLikelihoodConfig { num_classes: 10 };
-    let nll_cfg = LayerConfig::new("nll", LayerType::NegativeLogLikelihood(nll_layer_cfg));
-    classifier_cfg.add_layer(nll_cfg);
+    let classifier_cfg =
+        LayerConfig::NegativeLogLikelihood(NegativeLogLikelihoodConfig { num_classes: 10 });
 
     // set up backends
     #[cfg(all(feature = "cuda"))]
@@ -283,9 +275,14 @@ fn run_mnist(
         momentum,
         ..SolverConfig::default()
     };
-    solver_cfg.network = LayerConfig::new("network", net_cfg);
-    solver_cfg.objective = LayerConfig::new("classifier", classifier_cfg);
-    let mut solver = Solver::from_config(backend.clone(), backend.clone(), &solver_cfg);
+    solver_cfg.network = LayerConfig::Sequential(net_cfg);
+    solver_cfg.objective = classifier_cfg;
+    let mut solver = Solver::from_config(
+        backend.clone(),
+        &solver_cfg,
+        &[vec![pixel_dim, pixel_dim]],
+        &vec![1],
+    );
 
     // set up confusion matrix
     let mut classification_evaluator = ::juice::solver::ConfusionMatrix::new(10);
@@ -311,9 +308,8 @@ fn run_mnist(
             targets.push(label_val as usize);
         }
         // train the network!
-        let infered_out = solver.train_minibatch(inp_lock.clone(), label_lock.clone());
+        let mut infered = solver.train_minibatch(inp_lock.clone(), label_lock.clone());
 
-        let mut infered = infered_out.write().unwrap();
         let predictions = classification_evaluator.get_predictions(&mut infered);
 
         classification_evaluator.add_samples(&predictions, &targets);
diff --git a/juice/build.rs b/juice/build.rs
index 2961df4dc..b0a92569b 100644
--- a/juice/build.rs
+++ b/juice/build.rs
@@ -4,4 +4,10 @@ fn main() {
         .file("capnp/juice.capnp")
         .run()
         .expect("capnpc schema compiler command must succeed");
+
+    capnpc::CompilerCommand::new()
+        .src_prefix("capnp")
+        .file("capnp/net.capnp")
+        .run()
+        .expect("capnpc schema compiler command must succeed");
 }
diff --git a/juice/capnp/net.capnp b/juice/capnp/net.capnp
new file mode 100644
index 000000000..edeb87bff
--- /dev/null
+++ b/juice/capnp/net.capnp
@@ -0,0 +1,34 @@
+# Types required for serialization of a Network.
+
+@0xd42882f7f90ce348;
+
+struct SequentialConfig {
+}
+
+struct LinearConfig {
+    outputSize @0 :UInt64;
+}
+
+struct NegativeLogLikelihoodConfig {
+}
+
+struct LayerConfig {
+    layerType :union {
+        sequential @0 :SequentialConfig; 
+        linear @1 :LinearConfig;
+        logSoftmax @2 :Void;
+        relu @3 :Void;
+        sigmoid @4 :Void;
+        meanSquaredError @5 :Void;
+        negativeLogLikelihood @6 :NegativeLogLikelihoodConfig;
+    }
+}
+
+struct TensorShape {
+    shape @0 :List(UInt64);
+}
+
+struct Network {
+    config @0 :LayerConfig;
+    inputs @1 :List(TensorShape);
+}
\ No newline at end of file
diff --git a/juice/examples/benchmarks.rs b/juice/examples/benchmarks.rs
index 79a0828b3..e49a1166e 100644
--- a/juice/examples/benchmarks.rs
+++ b/juice/examples/benchmarks.rs
@@ -1,3 +1,5 @@
+/*
+
 #[macro_use]
 extern crate timeit;
 
@@ -583,3 +585,7 @@ fn bench_vgg_a() {
         }
     }
 }
+*/
+
+
+fn main() {}
\ No newline at end of file
diff --git a/juice/src/lib.rs b/juice/src/lib.rs
index 30fe80b55..c4634f5a8 100644
--- a/juice/src/lib.rs
+++ b/juice/src/lib.rs
@@ -117,10 +117,12 @@ extern crate coaster_blas as coblas;
 extern crate coaster_nn as conn;
 extern crate num;
 extern crate rand;
-pub mod layer;
-pub mod layers;
+//pub mod layer;
+//pub mod layers;
+pub mod net;
 pub mod solver;
 pub mod solvers;
+pub mod train;
 pub mod weight;
 
 mod capnp_util;
@@ -130,3 +132,6 @@ pub mod util;
 mod juice_capnp {
     include!(concat!(env!("OUT_DIR"), "/juice_capnp.rs"));
 }
+mod net_capnp {
+    include!(concat!(env!("OUT_DIR"), "/net_capnp.rs"));
+}
diff --git a/juice/src/net/activation/mod.rs b/juice/src/net/activation/mod.rs
new file mode 100644
index 000000000..56ed5a88e
--- /dev/null
+++ b/juice/src/net/activation/mod.rs
@@ -0,0 +1,5 @@
+mod relu;
+mod sigmoid;
+
+pub use relu::*;
+pub use sigmoid::*;
\ No newline at end of file
diff --git a/juice/src/net/activation/relu.rs b/juice/src/net/activation/relu.rs
new file mode 100644
index 000000000..bc41de0be
--- /dev/null
+++ b/juice/src/net/activation/relu.rs
@@ -0,0 +1,53 @@
+use crate::co::IBackend;
+use crate::conn;
+use crate::net::{Context, Descriptor, Layer};
+
+#[derive(Debug, Clone)]
+pub struct Relu {
+    descriptor: Descriptor,
+}
+
+impl Relu {
+    pub fn new(mut descriptor: Descriptor) -> Self {
+        assert_eq!(descriptor.inputs().len(), 1); // Should only be one input.
+
+        descriptor.add_output(descriptor.input(0).unit_shape().clone());
+
+        Relu {
+            descriptor: descriptor,
+        }
+    }
+}
+
+impl<B: IBackend + conn::Relu<f32>> Layer<B> for Relu {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.acquire_data(self.descriptor.output(0));
+        backend
+            .relu(&input.borrow(), &mut output.borrow_mut())
+            .unwrap();
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.get_data(self.descriptor.output(0));
+        let output_gradient = context.get_data_gradient(self.descriptor.output(0));
+        let input_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+        backend
+            .relu_grad(
+                &output.borrow(),
+                &output_gradient.borrow(),
+                &input.borrow(),
+                &mut input_gradient.borrow_mut(),
+            )
+            .unwrap();
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/activation/sigmoid.rs b/juice/src/net/activation/sigmoid.rs
new file mode 100644
index 000000000..a6e5fb8e8
--- /dev/null
+++ b/juice/src/net/activation/sigmoid.rs
@@ -0,0 +1,53 @@
+use crate::co::IBackend;
+use crate::conn;
+use crate::net::{Context, Descriptor, Layer};
+
+#[derive(Debug, Clone)]
+pub struct Sigmoid {
+    descriptor: Descriptor,
+}
+
+impl Sigmoid {
+    pub fn new(mut descriptor: Descriptor) -> Self {
+        assert_eq!(descriptor.inputs().len(), 1); // Should only be one input.
+
+        descriptor.add_output(descriptor.input(0).unit_shape().clone());
+
+        Sigmoid {
+            descriptor: descriptor,
+        }
+    }
+}
+
+impl<B: IBackend + conn::Sigmoid<f32>> Layer<B> for Sigmoid {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.acquire_data(self.descriptor.output(0));
+        backend
+            .sigmoid(&input.borrow(), &mut output.borrow_mut())
+            .unwrap();
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.get_data(self.descriptor.output(0));
+        let output_gradient = context.get_data_gradient(self.descriptor.output(0));
+        let input_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+        backend
+            .sigmoid_grad(
+                &output.borrow(),
+                &output_gradient.borrow(),
+                &input.borrow(),
+                &mut input_gradient.borrow_mut(),
+            )
+            .unwrap();
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/common/linear.rs b/juice/src/net/common/linear.rs
new file mode 100644
index 000000000..f2b30a628
--- /dev/null
+++ b/juice/src/net/common/linear.rs
@@ -0,0 +1,150 @@
+use std::cell::RefCell;
+use std::rc::Rc;
+
+use crate::co::{IBackend, ITensorDesc, SharedTensor};
+use crate::coblas::transpose::Transpose;
+use crate::net::{Context, Descriptor, Layer, LearnableParams};
+use crate::util::{native_scalar, LayerOps};
+use crate::weight::FillerType;
+
+#[derive(Clone, Debug, Default)]
+pub struct LinearConfig {
+    pub output_size: usize,
+}
+
+#[derive(Debug)]
+pub struct Linear {
+    descriptor: Descriptor,
+
+    // Weight (A) and bias (b) for the linear operation y = Ax + b.
+    weight: Rc<RefCell<LearnableParams>>,
+    bias: Rc<RefCell<LearnableParams>>,
+
+    // Constants saved for efficiency.
+    one: SharedTensor<f32>,
+    zero: SharedTensor<f32>,
+}
+
+impl LinearConfig {
+    pub fn new(output_size: usize) -> Self {
+        LinearConfig {
+            output_size: output_size,
+        }
+    }
+}
+
+impl Linear {
+    pub fn new(mut descriptor: Descriptor, config: &LinearConfig) -> Self {
+        assert_eq!(descriptor.inputs().len(), 1); // Should be only one input.
+        let input_size = descriptor.input(0).unit_shape().size();
+
+        descriptor.add_output(vec![config.output_size]);
+
+        // Create weight matrix.
+        let mut weight = SharedTensor::<f32>::new(&[config.output_size, input_size]);
+        FillerType::fill_glorot(&mut weight, input_size, config.output_size);
+
+        // Create bias. Bias is typically intialized with a constant, and a suitable initialisation
+        // is stated in https://cs231n.github.io/neural-networks-2/#init for non-LSTM types.
+        let mut bias = SharedTensor::<f32>::new(&[1, config.output_size]);
+        let seed = rand::random::<f32>();
+        let bias_init_value = seed * (2.0 / seed).sqrt();
+        FillerType::fill_constant(&mut bias, bias_init_value);
+
+        let weight_param = descriptor.create_params("weights", weight, 1.0);
+        let bias_param = descriptor.create_params("bias", bias, 1.0);
+
+        Linear {
+            descriptor: descriptor,
+            weight: weight_param,
+            bias: bias_param,
+            one: native_scalar(1f32),
+            zero: native_scalar(0f32),
+        }
+    }
+}
+
+impl<B: IBackend + LayerOps<f32>> Layer<B> for Linear {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.acquire_data(self.descriptor.output(0));
+
+        let mut ones_tensor = SharedTensor::<f32>::new(&[context.batch_size(), 1]);
+        FillerType::fill_constant(&mut ones_tensor, 1f32);
+
+        backend
+            .gemm(
+                &self.one,
+                Transpose::NoTrans,
+                &ones_tensor,
+                Transpose::NoTrans,
+                &self.bias.borrow().data,
+                &self.zero,
+                &mut output.borrow_mut(),
+            )
+            .unwrap();
+
+        backend
+            .gemm(
+                &self.one,
+                Transpose::NoTrans,
+                &input.borrow(),
+                Transpose::Trans,
+                &self.weight.borrow().data,
+                &self.one,
+                &mut output.borrow_mut(),
+            )
+            .unwrap();
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output_gradient = context.get_data_gradient(self.descriptor.output(0));
+
+        let input_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+        let weights_gradient = context.acquire_params_gradient(self.descriptor.param(0));
+        let bias_gradient = context.acquire_params_gradient(self.descriptor.param(1));
+
+        // Network error gradient with respect to input data.
+        // dE/dx = dE/dy * df/dx = dE/dy * w.
+        backend
+            .gemm(
+                &self.one,
+                Transpose::NoTrans,
+                &output_gradient.borrow(),
+                Transpose::NoTrans,
+                &self.weight.borrow().data,
+                &self.zero,
+                &mut input_gradient.borrow_mut(),
+            )
+            .unwrap();
+
+        // Network error gradient with respect to weights.
+        // dE/dw = dE/dy * df/dw = dE/dy * x.
+        backend
+            .gemm(
+                &self.one,
+                Transpose::Trans,
+                &output_gradient.borrow(),
+                Transpose::NoTrans,
+                &input.borrow(),
+                &self.zero,
+                &mut weights_gradient.borrow_mut(),
+            )
+            .unwrap();
+
+        // Network error gradient with respect to bias.
+        // dE/dw = dE/dy * df/db = dE/dy * [1] = dE/dy.
+        backend
+            .copy(&output_gradient.borrow(), &mut bias_gradient.borrow_mut())
+            .unwrap();
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/common/log_softmax.rs b/juice/src/net/common/log_softmax.rs
new file mode 100644
index 000000000..a4d18192a
--- /dev/null
+++ b/juice/src/net/common/log_softmax.rs
@@ -0,0 +1,51 @@
+use crate::co::IBackend;
+use crate::conn;
+use crate::net::{Context, Descriptor, Layer};
+
+#[derive(Debug)]
+pub struct LogSoftmax {
+    descriptor: Descriptor,
+}
+
+impl LogSoftmax {
+    pub fn new(mut descriptor: Descriptor) -> Self {
+        assert_eq!(descriptor.inputs().len(), 1); // Should only be one input.
+
+        descriptor.add_output(descriptor.input(0).unit_shape().clone());
+
+        LogSoftmax {
+            descriptor: descriptor,
+        }
+    }
+}
+
+impl<B: IBackend + conn::LogSoftmax<f32>> Layer<B> for LogSoftmax {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        let input = context.get_data(self.descriptor.input(0));
+        let output = context.acquire_data(self.descriptor.output(0));
+        backend
+            .log_softmax(&input.borrow(), &mut output.borrow_mut())
+            .unwrap();
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let output = context.get_data(self.descriptor.output(0));
+        let output_gradient = context.get_data_gradient(self.descriptor.output(0));
+        let input_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+        backend
+            .log_softmax_grad(
+                &output.borrow(),
+                &output_gradient.borrow(),
+                &mut input_gradient.borrow_mut(),
+            )
+            .unwrap();
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/common/mod.rs b/juice/src/net/common/mod.rs
new file mode 100644
index 000000000..023edee7b
--- /dev/null
+++ b/juice/src/net/common/mod.rs
@@ -0,0 +1,5 @@
+mod linear;
+mod log_softmax;
+
+pub use linear::*;
+pub use log_softmax::*;
\ No newline at end of file
diff --git a/juice/src/net/config.rs b/juice/src/net/config.rs
new file mode 100644
index 000000000..f66beccac
--- /dev/null
+++ b/juice/src/net/config.rs
@@ -0,0 +1,23 @@
+// Reexport layer configs.
+pub use crate::net::{
+    common::LinearConfig, container::SequentialConfig, container::FanoutConfig,
+    loss::NegativeLogLikelihoodConfig,
+};
+
+#[derive(Debug, Clone)]
+pub enum LayerConfig {
+    Fanout(FanoutConfig),
+    Sequential(SequentialConfig),
+    Linear(LinearConfig),
+    LogSoftmax,
+    Relu,
+    Sigmoid,
+    MeanSquaredError,
+    NegativeLogLikelihood(NegativeLogLikelihoodConfig),
+}
+
+impl Default for LayerConfig {
+    fn default() -> LayerConfig {
+        LayerConfig::Sequential(SequentialConfig::default())
+    }
+}
diff --git a/juice/src/net/container/fanout.rs b/juice/src/net/container/fanout.rs
new file mode 100644
index 000000000..dd8f7e463
--- /dev/null
+++ b/juice/src/net/container/fanout.rs
@@ -0,0 +1,131 @@
+use std::fmt::{Debug, Formatter};
+
+use crate::co::IBackend;
+use crate::net::{layer_from_config, Context, Descriptor, Inout, Layer, LayerConfig};
+use crate::util::LayerOps;
+
+#[derive(Debug, Clone)]
+pub struct FanoutConfig {
+    trunk: Box<LayerConfig>,
+    branches: Vec<LayerConfig>,
+}
+
+pub struct Fanout<B: IBackend> {
+    descriptor: Descriptor,
+    trunk: Box<dyn Layer<B>>,
+    branches: Vec<Box<dyn Layer<B>>>,
+}
+
+impl FanoutConfig {
+    pub fn new(trunk: LayerConfig) -> Self {
+        FanoutConfig {
+            trunk: Box::new(trunk),
+            branches: Vec::new(),
+        }
+    }
+
+    pub fn with_branch(mut self, branch: LayerConfig) -> Self {
+        self.branches.push(branch);
+        self
+    }
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Fanout<B> {
+    pub fn new(mut descriptor: Descriptor, config: &FanoutConfig) -> Self {
+        // Create trunk.
+        let trunk_inputs: Vec<Inout> = descriptor.inputs().iter().cloned().collect();
+        let trunk = layer_from_config(
+            descriptor.sub("trunk", trunk_inputs),
+            &config.trunk,
+        );
+
+        // Create branches. Each branch will have same inputs which are trunk's outputs.
+        let branch_inputs: Vec<Inout> = trunk.descriptor().outputs().iter().cloned().collect();
+        let mut branches = Vec::new();
+        for (i, branch_config) in config.branches.iter().enumerate() {
+            let branch = layer_from_config(
+                descriptor.sub(&format!("branch_{}", i), branch_inputs.clone()),
+                branch_config,
+            );
+
+            // Expose branch outputs as this layer outputs.
+            for output in branch.descriptor().outputs() {
+                descriptor.add_output_copy(output);
+            }
+
+            // Expose branch learnable params as this layer params.
+            for params in branch.descriptor().params() {
+                descriptor.add_params_copy(params);
+            }
+
+            branches.push(branch);
+        }
+
+        Fanout {
+            descriptor: descriptor,
+            trunk: trunk,
+            branches: branches,
+        }
+    }
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Layer<B> for Fanout<B> {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        self.trunk.compute_output(backend, context);
+        for branch in self.branches.iter() {
+            branch.compute_output(backend, context);
+        }
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        // Backpropagation process is only supported on a single branch.
+        // Determine which branch is used (by finding output gradients on the context)
+        // and compute gradients only along that branch.
+
+        let mut branch_index = None;
+        for i in 0..self.branches.len() {
+            let branch_desc = self.branches[i].descriptor();
+            for j in 0..branch_desc.outputs().len() {
+                let has_output_gradient = context.has_data_gradient(&branch_desc.outputs()[j]);
+                match (branch_index, has_output_gradient) {
+                    // Unpopulated output gradient without a selected branch. Nothing to do.
+                    (None, false) => (),
+                    // This is the first populated output gradient we saw.
+                    // Mark this branch as the one we'll be using for backprop.
+                    (None, true) => {
+                        assert_eq!(j, 0, "Branch {} has partial output gradients; should either have all or none", i);
+                        branch_index = Some(i);
+                    },
+                    // We have a selected branch and this is an unpopulated output gradient.
+                    (Some(i2), false) => {
+                        assert_ne!(i, i2, "Branch {} has partial output gradients (missing {})", i, j);
+                    },
+                    // We have a selected branch and this is a populated output gradient.
+                    (Some(i2), true) => {
+                        assert_eq!(i, i2, 
+                        "Seen output gradients on branches {} and {}; backprop only on one branch is supported", i, i2);
+                    },
+                }
+            }
+        }
+
+        assert_ne!(branch_index, None, "No output gradiens on any branch");
+
+        self.branches[branch_index.unwrap()].compute_gradients(backend, context);
+        self.trunk.compute_gradients(backend, context);
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
+
+impl<B: IBackend> Debug for Fanout<B> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Fanout")
+    }
+}
\ No newline at end of file
diff --git a/juice/src/net/container/mod.rs b/juice/src/net/container/mod.rs
new file mode 100644
index 000000000..9dd37ec58
--- /dev/null
+++ b/juice/src/net/container/mod.rs
@@ -0,0 +1,5 @@
+mod fanout;
+mod sequential;
+
+pub use fanout::*;
+pub use sequential::*;
\ No newline at end of file
diff --git a/juice/src/net/container/sequential.rs b/juice/src/net/container/sequential.rs
new file mode 100644
index 000000000..f9445e480
--- /dev/null
+++ b/juice/src/net/container/sequential.rs
@@ -0,0 +1,238 @@
+//! A container layer the composes nested layers in a sequence.
+//!
+//! In the most simple case, all nested layers are executed one by one, with outputs of one layer
+//! becoming inputs of the next one. Inputs of the Sequential layers are inputs to the first
+//! nested layer, and outputs of the last nested layer are the outpus of the Sequential layer:
+//!
+//! ```
+//! let mut cfg = SequentialConfig::default();
+//! cfg.add_layer("linear", LayerConfig::Linear(LinearConfig{ output_size: 10}));
+//! cfg.add_layer("softmax", LayerConfig::Softmax);
+//! ```
+//!
+//! Sequential layer also supports complex flow graphs (as long as they are acyclic)
+//! by allowing nested layers inputs and outputs to be mapped to named "buffers":
+//!
+//! ```
+//! let mut cfg = SequentialConfig::default();
+//! cfg.add_input("in");
+//! cfg
+//!   .add_layer("linear1", LayerConfig::Linear(LinearConfig{ output_size: 10}))
+//!   .map_input("in")
+//!   .map_output("linear1_out");
+//! cfg
+//!   .add_layer("linear2", LayerConfig::Linear(LinearConfig{ output_size: 10}))
+//!   .map_input("in")
+//!   .map_output("linear2_out");
+//! cfg
+//!   .add_layer(/*some 2-input 1-output layer*/)
+//!   .map_input("linear1_out")
+//!   .map_input("linear2_out")
+//!   .map_output("out");
+//! cfg.add_output("out");
+//! ```
+//!
+//! Note that currently it is requires that layer i can only use outputs from
+//! previous layers 0..i-1.
+use std::collections::HashMap;
+use std::fmt::{Debug, Formatter};
+
+use crate::co::IBackend;
+use crate::net::{layer_from_config, Context, Descriptor, Layer, LayerConfig};
+use crate::util::LayerOps;
+
+#[derive(Debug, Clone, Default)]
+pub struct SequentialChildConfig {
+    name: String,
+    config: LayerConfig,
+    inputs: Vec<String>,
+    outputs: Vec<String>,
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct SequentialConfig {
+    // List of named inputs. If not given, only a single input is assumed.
+    inputs: Vec<String>,
+
+    // Sequence of layers.
+    layers: Vec<SequentialChildConfig>,
+
+    // List of named outputs. If not given, Sequential layer outputs will match the outputs
+    // of the last layer.
+    outputs: Vec<String>,
+}
+
+pub struct Sequential<B: IBackend> {
+    // Outward-facing inputs, outputs and params.
+    descriptor: Descriptor,
+    // Nested layers.
+    children: Vec<Box<dyn Layer<B>>>,
+}
+
+impl SequentialChildConfig {
+    pub fn map_input(mut self, name: &str) -> Self {
+        self.inputs.push(name.to_string());
+        self
+    }
+    pub fn map_output(mut self, name: &str) -> Self {
+        self.outputs.push(name.to_string());
+        self
+    }
+}
+
+impl SequentialConfig {
+    pub fn new() -> Self {
+        SequentialConfig::default()
+    }
+
+    pub fn with_layer(mut self, name: &str, child_config: LayerConfig) -> Self {
+        self.layers.push(SequentialChildConfig {
+            name: name.to_string(),
+            config: child_config,
+            inputs: Vec::new(),
+            outputs: Vec::new(),
+        });
+        self
+    }
+
+    pub fn with_input(mut self, name: &str) -> Self {
+        self.inputs.push(name.to_string());
+        self
+    }
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Sequential<B> {
+    pub fn new(mut descriptor: Descriptor, config: &SequentialConfig) -> Self {
+        // Create internal layers one by one and connect them.
+        // For the purpose of connecting layers, all inputs and outputs have names,
+        // which are either explicitly given in the config, or have implicit form of
+        // io_{i}_{j} for layer inputs and io_{i+1}_{j} for outputs, where i is the layer index
+        // (0-based) and j is the input/output index within the layer.
+        // io_0_{j} are inputs to the Sequential layer itself.
+
+        // All layer output Inouts known so far, keyed by their internal names.
+        // This is initialized with shapes of the Sequential inputs below.
+        let mut internal_outputs = HashMap::new();
+
+        // Output names from the previous layer. These will be assumed to be the inputs of the next
+        // layer unless it has explicit input names in the config.
+        // Initialized with Sequenial input names below.
+        let mut prev_layer_output_names = Vec::new();
+
+        // Create an array of Sequential inputs where:
+        // * internal names are either explicitly set in config or implicitly set to "io_0_{j}",
+        // * shapes and data paths are taken from the descriptor.
+        for (j, input) in descriptor.inputs().iter().enumerate() {
+            let internal_name = if config.inputs.len() > j {
+                config.inputs[j].clone()
+            } else {
+                format!("io_0_{}", j)
+            };
+            prev_layer_output_names.push(internal_name.clone());
+            internal_outputs.insert(internal_name, input.clone());
+        }
+
+        // Create children layers.
+        let mut children = Vec::new();
+        for (i, child_config) in config.layers.iter().enumerate() {
+            // Inputs for a child are either given explicitly in the config, or replicate the
+            // outputs of the previous child.
+            let child_input_names = if !child_config.inputs.is_empty() {
+                child_config.inputs.clone()
+            } else {
+                prev_layer_output_names
+            };
+
+            let child_inputs = child_input_names
+                .iter()
+                .map(|name| {
+                    internal_outputs
+                        .get(name)
+                        .unwrap_or_else(|| panic!("Unknown input/output name {}", name))
+                        .clone()
+                })
+                .collect();
+
+            let mut child_layer = layer_from_config(
+                descriptor.sub(&child_config.name, child_inputs),
+                &child_config.config,
+            );
+
+            // Create data buffer paths for child outpus and save the outputs for subsequent layers.
+            let child_descriptor = child_layer.descriptor_mut();
+            // Config cannot have more outputs that layer actually has.
+            assert!(child_config.outputs.len() <= child_descriptor.outputs().len());
+            prev_layer_output_names = Vec::with_capacity(child_descriptor.outputs().len());
+            for (j, output) in child_descriptor.outputs_mut().iter_mut().enumerate() {
+                let output_name = if j < child_config.outputs.len() {
+                    child_config.outputs[j].clone()
+                } else {
+                    format!("io_{}_{}", i + 1, j)
+                };
+
+                // Assign data buffer path.
+                output.set_path(&format!("{}.{}", descriptor.path(), &output_name));
+
+                prev_layer_output_names.push(output_name.clone());
+                internal_outputs.insert(output_name, output.clone());
+            }
+
+            // Copy layer learnable params links into Sequential descriptor.
+            for params in child_layer.descriptor().params() {
+                descriptor.add_params_copy(params);
+            }
+
+            children.push(child_layer);
+        }
+
+        // If outputs are given explicitly, use them. Otherwise take outputs of the last layer.
+        if !config.outputs.is_empty() {
+            for output_name in config.outputs.iter() {
+                descriptor.add_output(
+                    internal_outputs
+                        .get(output_name)
+                        .unwrap_or_else(|| panic!("Can't find output {}", output_name))
+                        .unit_shape()
+                        .clone(),
+                );
+            }
+        } else {
+            for output in children.last().unwrap().descriptor().outputs() {
+                descriptor.add_output_copy(output)
+            }
+        }
+
+        Sequential {
+            descriptor: descriptor,
+            children: children,
+        }
+    }
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Layer<B> for Sequential<B> {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        for child in self.children.iter() {
+            child.compute_output(backend, context);
+        }
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        for child in self.children.iter().rev() {
+            child.compute_gradients(backend, context);
+        }
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
+
+impl<B: IBackend> Debug for Sequential<B> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Sequential")
+    }
+}
\ No newline at end of file
diff --git a/juice/src/net/context.rs b/juice/src/net/context.rs
new file mode 100644
index 000000000..80c6d88d6
--- /dev/null
+++ b/juice/src/net/context.rs
@@ -0,0 +1,180 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::iter;
+use std::rc::Rc;
+
+use crate::co::{SharedTensor, TensorDesc};
+use co::frameworks::native::get_native_backend;
+use crate::net::{Inout, LearnableParamsLink};
+
+/// Context stores data for a single invocation of a network (forward and optionally backward),
+/// which includes data passed between layers (at Junctions), loss function gradients with respect
+/// to this data (again, at Junctions) and also loss function gradients with respect to the
+/// learnable parameters.
+#[derive(Debug)]
+pub struct Context {
+    batch_size: usize,
+    data: HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+    data_gradient: HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+    param_gradient: HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+}
+
+impl Context {
+    pub fn new(batch_size: usize) -> Self {
+        Context {
+            batch_size: batch_size,
+            data: HashMap::new(),
+            data_gradient: HashMap::new(),
+            param_gradient: HashMap::new(),
+        }
+    }
+
+    pub fn batch_size(&self) -> usize {
+        self.batch_size
+    }
+
+    pub fn has_data_gradient(&self, inout: &Inout) -> bool {
+        Self::has_inout_buffer(&self.data_gradient, inout)
+    }
+
+    // Get data buffer for the given inout spec found inside the current scope.
+    // Panics if this data buffer doesn't exist.
+    pub fn get_data(&mut self, inout: &Inout) -> Rc<RefCell<SharedTensor<f32>>> {
+        Self::get_inout_buffer(&self.data, inout, "data")
+    }
+
+    // Get data gradient buffer for the given inout spec found inside the current scope.
+    // Panics if this data buffer doesn't exist.
+    pub fn get_data_gradient(&mut self, inout: &Inout) -> Rc<RefCell<SharedTensor<f32>>> {
+        Self::get_inout_buffer(&self.data_gradient, inout, "data gradient")
+    }
+
+    // Same as `get_data()` but will create the buffer on the fly if it doesn't exist.
+    pub fn acquire_data(&mut self, inout: &Inout) -> Rc<RefCell<SharedTensor<f32>>> {
+        Self::acquire_inout_buffer(&mut self.data, inout, self.batch_size, "data")
+    }
+
+    // Same as `get_data_gradient()` but will create the buffer on the fly if it doesn't exist.
+    pub fn acquire_data_gradient(&mut self, inout: &Inout) -> Rc<RefCell<SharedTensor<f32>>> {
+        Self::acquire_inout_buffer(
+            &mut self.data_gradient,
+            inout,
+            self.batch_size,
+            "data gradient",
+        )
+    }
+
+    // Takes the tensor out of the context. Panics if no such tensor.
+    pub fn take_data(&mut self, inout: &Inout) -> SharedTensor<f32> {
+        Self::take_inout_buffer(&mut self.data, inout, "data")
+    }
+
+    // Get params gradient buffer for the given learnable params link.
+    // Panics if this data buffer doesn't exist.
+    pub fn get_params_gradient(
+        &mut self,
+        params: &LearnableParamsLink,
+    ) -> Rc<RefCell<SharedTensor<f32>>> {
+        let key = Rc::as_ptr(&params) as usize;
+        match self.param_gradient.get(&key) {
+            Some(data) => data.clone(),
+            None => panic!("No params gradient for {:?}", params.borrow()),
+        }
+    }
+
+    // Same as `get_params_gradient()` but will create the buffer on the fly if it doesn't exist.
+    // Buffer is zero-filled on creation.
+    pub fn acquire_params_gradient(
+        &mut self,
+        params: &LearnableParamsLink,
+    ) -> Rc<RefCell<SharedTensor<f32>>> {
+        let key = Rc::as_ptr(&params) as usize;
+        match self.param_gradient.get(&key) {
+            Some(data) => return data.clone(),
+            None => (),
+        }
+
+        // println!("Creating params gradient for {:?}", params.borrow());
+
+        let backend = get_native_backend();
+        let shape = params.borrow().data.desc().clone();
+        let mut buffer = SharedTensor::<f32>::new(&shape);
+        // Initialize with zeroes.
+        buffer
+            .write_only(backend.device())
+            .unwrap()
+            .as_mut_slice()
+            .fill(0.0);
+        let buffer_rc = Rc::new(RefCell::new(buffer));
+        self.param_gradient.insert(key, buffer_rc.clone());
+        buffer_rc
+    }
+
+    fn has_inout_buffer(
+        storage: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+        inout: &Inout,
+    ) -> bool {
+        let key = Rc::as_ptr(&inout.junction) as usize;
+        storage.get(&key).is_some()
+    }
+
+    fn get_inout_buffer(
+        storage: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+        inout: &Inout,
+        purpose: &str,
+    ) -> Rc<RefCell<SharedTensor<f32>>> {
+        let key = Rc::as_ptr(&inout.junction) as usize;
+        match storage.get(&key) {
+            Some(data) => data.clone(),
+            None => panic!("No {} for {:?}", purpose, inout),
+        }
+    }
+
+    fn acquire_inout_buffer(
+        storage: &mut HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+        inout: &Inout,
+        batch_size: usize,
+        purpose: &str,
+    ) -> Rc<RefCell<SharedTensor<f32>>> {
+        let key = Rc::as_ptr(&inout.junction) as usize;
+        match storage.get(&key) {
+            Some(data) => return data.clone(),
+            None => (),
+        };
+
+        // REMOVE
+        let backend = get_native_backend();
+
+        let shape: TensorDesc = iter::once(batch_size)
+            .chain(inout.junction.unit_shape.iter().map(|i| *i))
+            .collect();
+        // println!(
+        //     "Creating {} for {:?} with shape {:?}",
+        //     purpose, inout, shape
+        // );
+        let mut buffer = SharedTensor::<f32>::new(&shape);
+
+        // REMOVE
+        buffer
+            .write_only(backend.device())
+            .unwrap()
+            .as_mut_slice()
+            .fill(0.0);
+
+        let buffer_rc = Rc::new(RefCell::new(buffer));
+        storage.insert(key, buffer_rc.clone());
+        buffer_rc
+    }
+
+    fn take_inout_buffer(
+        storage: &mut HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+        inout: &Inout,
+        purpose: &str,
+    ) -> SharedTensor<f32> {
+        let key = Rc::as_ptr(&inout.junction) as usize;
+        match storage.remove(&key) {
+            Some(data) => Rc::try_unwrap(data).unwrap().into_inner(),
+            None => panic!("No {} for {:?}", purpose, inout),
+        }
+    }
+}
diff --git a/juice/src/net/descriptor.rs b/juice/src/net/descriptor.rs
new file mode 100644
index 000000000..e3c027c29
--- /dev/null
+++ b/juice/src/net/descriptor.rs
@@ -0,0 +1,184 @@
+use std::cell::RefCell;
+
+use std::rc::Rc;
+
+use crate::co::{SharedTensor, TensorDesc};
+
+/// A junction is a point connecting a layer output to another layer input (or several of them).
+/// Junction describes the shape of the data which flows through it (except for the batch size
+/// part of it, which is a property of specific invocation and is captured in a `Context`).
+/// Each junction will have an associated buffer in a `Context` (two of them, actually: one
+/// for data and another for gradient).
+#[derive(Debug)]
+pub struct Junction {
+    /// Shape of one data unit in a batch.
+    pub unit_shape: TensorDesc,
+    /// Human-readable path of the junction, mostly for logging and debugging.
+    /// Can change after creation to support layer waterfall construction/connection model.
+    pub path: RefCell<String>,
+}
+
+/// A struct representing either an input or output of a layer.
+/// Inouts that share same junction are "connected" in the sense that they use the same buffer
+/// (written to by output Inout and read from by input Inout).
+#[derive(Debug, Clone)]
+pub struct Inout {
+    // Junction used to read/write data.
+    pub junction: Rc<Junction>,
+}
+
+/// A single blob of params that can be learned during network training.
+/// Params are "owned" by the layers, but layers must expose them via LearnableParamsLink in
+/// the Descriptor. Container layers must expose all nested layers params.
+#[derive(Debug)]
+pub struct LearnableParams {
+    pub data: SharedTensor<f32>,
+    pub learning_rate: f32,
+    /// Human-readable path which includes the owner layer path, mostly for logging and debuggin.
+    pub path: String,
+}
+
+/// A pointer to an instance of LearnableParams. Among other things is used to find associated
+/// gradient buffer in the `Context`.
+pub type LearnableParamsLink = Rc::<RefCell::<LearnableParams>>;
+
+/// Descriptor of a layer, containing information about layer name,
+/// inputs, outputs and params. Inputs and outputs are created using a waterfall model:
+/// 1. Parent logic creates a Descriptor with a chosen name and inputs (with shapes).
+/// 2. Descriptor is passed to the layer constructor. Layer determines the number and shapes of
+///    the outputs and adds them to the descriptor. Params are also initialized and added if
+///    layer uses them.
+/// 3. After layer is created, parent logic assigns human-readable paths to the outputs Junctions
+///    and connects them appropriately.
+#[derive(Debug, Clone)]
+pub struct Descriptor {
+    path: String,
+    inputs: Vec<Inout>,
+    outputs: Vec<Inout>,
+
+    // All learnable params of the layer. For container layers, this must contain all params
+    // of the nested layers.
+    params: Vec<LearnableParamsLink>,
+}
+
+impl Inout {
+    // Create an Inout without buffer path.
+    pub fn new(unit_shape: TensorDesc) -> Self {
+        Inout {
+            junction: Rc::new(Junction {
+                unit_shape: unit_shape,
+                path: RefCell::new("".to_owned()),
+            }),
+        }
+    }
+
+    // Create an Inout with buffer path.
+    pub fn new_with_path(unit_shape: TensorDesc, path: &str) -> Self {
+        Inout {
+            junction: Rc::new(Junction {
+                unit_shape: unit_shape,
+                path: RefCell::new(path.to_owned()),
+            }),
+        }
+    }
+
+    pub fn set_path(&mut self, path: &str) {
+        self.junction.path.replace(path.to_owned());
+    }
+
+    pub fn unit_shape(&self) -> &TensorDesc {
+        &self.junction.unit_shape
+    }
+}
+
+impl Descriptor {
+    // Create a top-level Descriptor.
+    pub fn top(name: &str, inputs: Vec<Inout>) -> Self {
+        Descriptor {
+            path: name.to_owned(),
+            inputs: inputs,
+            outputs: Vec::new(),
+            params: Vec::new(),
+        }
+    }
+
+    // Create a Descriptor which is nested under this one.
+    // In practice, "nested" only means the new descriptor path is constructed as
+    // "<parent_path>.<name>".
+    pub fn sub(&self, name: &str, inputs: Vec<Inout>) -> Self {
+        Descriptor {
+            path: format!("{}.{}", self.path, name),
+            inputs: inputs,
+            outputs: Vec::new(),
+            params: Vec::new(),
+        }
+    }
+
+    pub fn path(&self) -> &str {
+        &self.path
+    }
+
+    pub fn inputs(&self) -> &[Inout] {
+        &self.inputs
+    }
+
+    pub fn input(&self, index: usize) -> &Inout {
+        &self.inputs[index]
+    }
+
+    pub fn outputs(&self) -> &[Inout] {
+        &self.outputs
+    }
+
+    pub fn outputs_mut(&mut self) -> &mut [Inout] {
+        &mut self.outputs
+    }
+
+    pub fn output(&self, index: usize) -> &Inout {
+        &self.outputs[index]
+    }
+
+    pub fn output_mut(&mut self, index: usize) -> &mut Inout {
+        &mut self.outputs[index]
+    }
+
+    pub fn params(&self) -> &[LearnableParamsLink] {
+        &self.params
+    }
+    pub fn params_mut(&mut self) -> &mut [LearnableParamsLink] {
+        &mut self.params
+    }
+
+    pub fn param(&self, index: usize) -> &LearnableParamsLink {
+        &self.params[index]
+    }
+
+    pub fn add_output(&mut self, unit_shape: TensorDesc) -> &mut Inout {
+        self.outputs.push(Inout::new(unit_shape));
+        self.outputs.last_mut().unwrap()
+    }
+
+    pub fn add_output_copy(&mut self, inout: &Inout) {
+        self.outputs.push(inout.clone())
+    }
+
+    pub fn create_params(
+        &mut self,
+        name: &str,
+        data: SharedTensor<f32>,
+        learning_rate: f32,
+    ) -> LearnableParamsLink {
+        let params = LearnableParams {
+            data: data,
+            learning_rate: learning_rate,
+            path: format!("{}.{}", self.path, name),
+        };
+        let params_rc = Rc::new(RefCell::new(params));
+        self.params.push(params_rc.clone());
+        params_rc
+    }
+
+    pub fn add_params_copy(&mut self, params: &LearnableParamsLink) {
+        self.params.push(params.clone());
+    }
+}
diff --git a/juice/src/net/layer.rs b/juice/src/net/layer.rs
new file mode 100644
index 000000000..eca9e8b90
--- /dev/null
+++ b/juice/src/net/layer.rs
@@ -0,0 +1,66 @@
+use std::fmt::Debug;
+
+use crate::co::{IBackend};
+use crate::net::{Context, Descriptor, LayerConfig};
+use crate::net::activation::*;
+use crate::net::common::*;
+use crate::net::container::*;
+use crate::net::loss::*;
+use crate::util::LayerOps;
+
+/// A generalized layer in a network, performing certain function on inputs producing outputs.
+/// Layers be can combined in an acyclic graph forming an ML network that can compute output from
+/// inputs and can be "trained" using the backpropagation process.
+///
+/// Note that a Layer is a more general concept than conventional ML layers and includes:
+/// * conventional "layers" like convolutional, fully-connected, dropout, etc;
+/// * activation functions like ReLU, softmax, etc;
+/// * groups of sublayers.
+///
+/// Layer can have arbitrary number of inputs, outputs and weights, which are all described in the
+/// `Descriptor`. Inputs and outputs declare the shapes of the 'units' of data, which then can
+/// be batched according to `Context` settings. The actual shapes of then inputs and outputs are
+/// always of the form [N, {unit_shape}] where N is the batch size. 
+/// 
+/// Number and unit shapes of the inputs are defined by the upstream logic. Number and unit shapes
+/// of the outputs are determined by the layer depending on input unit shapes and layer settings.
+/// When creating a layer, parent logic passes a partially filled `Descriptor`, containing inputs
+/// information. Layer then must fill the outputs of the `Descriptor`.
+///
+/// It is assumed that weight shapes do not depend on batch size N (as weights are created once and
+/// cannot change shape during learning).
+pub trait Layer<B: IBackend>: Debug {
+    // Computes output given the input(s).
+    fn compute_output(&self, backend: &B, context: &mut Context);
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context);
+
+    fn descriptor(&self) -> &Descriptor;
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor;
+}
+
+/// Creates layer from a config.
+/// Takes a partially filled Descriptor, which should have a valid path and fully populated inputs
+/// (including data_path).
+/// This is an internal function, typically users should be using net_from_config() instead.
+/// TODO: Make it private (for now required for Solver).
+pub fn layer_from_config<B: IBackend + LayerOps<f32> + 'static>(
+    descriptor: Descriptor,
+    config: &LayerConfig,
+) -> Box<dyn Layer<B>> {
+    match config {
+        LayerConfig::Sequential(sequential_config) => {
+            Box::new(Sequential::new(descriptor, sequential_config))
+        }
+        LayerConfig::Fanout(fanout_config) => Box::new(Fanout::new(descriptor, fanout_config)),
+        LayerConfig::Linear(linear_config) => Box::new(Linear::new(descriptor, linear_config)),
+        LayerConfig::LogSoftmax => Box::new(LogSoftmax::new(descriptor)),
+        LayerConfig::Relu => Box::new(Relu::new(descriptor)),
+        LayerConfig::Sigmoid => Box::new(Sigmoid::new(descriptor)),
+        LayerConfig::NegativeLogLikelihood(nll_config) => {
+            Box::new(NegativeLogLikelihood::new(descriptor, nll_config))
+        }
+        LayerConfig::MeanSquaredError => Box::new(MeanSquaredError::new(descriptor)),
+    }
+}
\ No newline at end of file
diff --git a/juice/src/net/loss/mean_squared_error.rs b/juice/src/net/loss/mean_squared_error.rs
new file mode 100644
index 000000000..291de1c56
--- /dev/null
+++ b/juice/src/net/loss/mean_squared_error.rs
@@ -0,0 +1,88 @@
+use crate::co::{IBackend, ITensorDesc};
+use crate::coblas::plugin::*;
+use crate::net::{Context, Descriptor, Layer};
+use crate::util::{native_backend, Axpby};
+
+#[derive(Debug)]
+// Layer implementing the Mean Squared Error loss function.
+// This implementation supports sparse labels (marked as NaN values in label tensor).
+// Gradient for absent label values will be 0.0.
+pub struct MeanSquaredError {
+    descriptor: Descriptor,
+}
+
+impl MeanSquaredError {
+    pub fn new(descriptor: Descriptor) -> Self {
+        assert_eq!(
+            descriptor.inputs().len(),
+            2,
+            "MeanSquaredError must take 2 inputs: values and labels"
+        );
+        assert_eq!(
+            descriptor.inputs()[0].unit_shape().size(),
+            descriptor.inputs()[1].unit_shape().size(),
+            "Labels must be of the same size"
+        );
+        // Loss layers don't have outputs.
+
+        MeanSquaredError {
+            descriptor: descriptor,
+        }
+    }
+}
+
+impl<B: IBackend + Axpby<f32> + Copy<f32>> Layer<B> for MeanSquaredError {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        // No output computation since loss layer doesn't have outputs.
+        // It's main purpose is to start the backpropagation process by
+        // computing the loss gradient with respect to net final output
+        // in compute_gradients().
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let predictions = context.get_data(self.descriptor.input(0));
+        let labels = context.get_data(self.descriptor.input(1));
+        let input_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+
+        let native = native_backend();
+
+        let predictions_ref = predictions.borrow();
+        let predictions_data = predictions_ref
+            .read(native.device())
+            .unwrap()
+            .as_slice::<f32>();
+
+        let labels_ref = labels.borrow();
+        let labels_data = labels_ref.read(native.device()).unwrap().as_slice::<f32>();
+
+        let mut input_gradient_ref = input_gradient.borrow_mut();
+        let input_gradient_data = input_gradient_ref
+            .write_only(native.device())
+            .unwrap()
+            .as_mut_slice::<f32>();
+
+        for i in 0..predictions_data.len() {
+            input_gradient_data[i] = match labels_data[i].is_nan() {
+                true => 0.0,
+                false => 2.0 * (predictions_data[i] - labels_data[i]),
+            };
+        }
+
+        // // Gradient is calculated as 2 * (predictions - labels).
+        // backend.copy(&labels.borrow(), &mut input_gradient.borrow_mut());
+        // backend.axpby(
+        //     &native_scalar(2.0),
+        //     &predictions.borrow(),
+        //     &native_scalar(-2.0),
+        //     &mut input_gradient.borrow_mut(),
+        // );
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/loss/mod.rs b/juice/src/net/loss/mod.rs
new file mode 100644
index 000000000..3379ed33e
--- /dev/null
+++ b/juice/src/net/loss/mod.rs
@@ -0,0 +1,5 @@
+mod mean_squared_error;
+mod negative_log_likelihood;
+
+pub use mean_squared_error::*;
+pub use negative_log_likelihood::*;
\ No newline at end of file
diff --git a/juice/src/net/loss/negative_log_likelihood.rs b/juice/src/net/loss/negative_log_likelihood.rs
new file mode 100644
index 000000000..4d668b453
--- /dev/null
+++ b/juice/src/net/loss/negative_log_likelihood.rs
@@ -0,0 +1,78 @@
+use crate::co::{IBackend, ITensorDesc};
+use crate::net::{Context, Descriptor, Layer};
+use crate::util::native_backend;
+
+#[derive(Clone, Debug, Default)]
+pub struct NegativeLogLikelihoodConfig {
+    /// How many different classes can be classified.
+    pub num_classes: usize,
+}
+
+#[derive(Debug)]
+pub struct NegativeLogLikelihood {
+    descriptor: Descriptor,
+    num_classes: usize,
+}
+
+impl NegativeLogLikelihood {
+    pub fn new(descriptor: Descriptor, config: &NegativeLogLikelihoodConfig) -> Self {
+        assert_eq!(
+            descriptor.inputs().len(),
+            2,
+            "NegativeLogLikelihood must take 2 inputs: probabilities and labels"
+        );
+        assert_eq!(
+            descriptor.inputs()[1].unit_shape().size(),
+            1,
+            "Labels must be of [1] shape"
+        );
+        
+        // Note that loss layers don't have outputs, since the result of loss computation is always
+        // a single number which can't then be piped to other layers which expect data to have
+        // shape [batch_size, ...]
+
+        NegativeLogLikelihood {
+            descriptor: descriptor,
+            num_classes: config.num_classes,
+        }
+    }
+}
+
+impl<B: IBackend> Layer<B> for NegativeLogLikelihood {
+    fn compute_output(&self, backend: &B, context: &mut Context) {
+        // No output computation since loss layer doesn't have outputs.
+        // It's main purpose is to start the backpropagation process by
+        // computing the loss gradient with respect to net final output
+        // in compute_gradients().
+    }
+
+    fn compute_gradients(&self, backend: &B, context: &mut Context) {
+        let probabilities_gradient = context.acquire_data_gradient(self.descriptor.input(0));
+        let labels = context.get_data(self.descriptor.input(1));
+
+        let native = native_backend();
+        let labels_data = labels.borrow();
+        let native_labels = labels_data.read(native.device()).unwrap().as_slice::<f32>();
+        let mut writable_gradient = vec![0f32; probabilities_gradient.borrow().desc().size()];
+
+        for (batch_n, &label_value) in native_labels.iter().enumerate() {
+            let index = (self.num_classes * batch_n) + label_value as usize;
+            writable_gradient[index] = -1f32;
+        }
+        crate::util::write_to_memory(
+            probabilities_gradient
+                .borrow_mut()
+                .write_only(native.device())
+                .unwrap(),
+            &writable_gradient,
+        );
+    }
+
+    fn descriptor(&self) -> &Descriptor {
+        &self.descriptor
+    }
+
+    fn descriptor_mut(&mut self) -> &mut Descriptor {
+        &mut self.descriptor
+    }
+}
diff --git a/juice/src/net/mod.rs b/juice/src/net/mod.rs
new file mode 100644
index 000000000..faa3ef09a
--- /dev/null
+++ b/juice/src/net/mod.rs
@@ -0,0 +1,34 @@
+//! Representation of a neural network.
+//!
+//! Network consists of 2 parts:
+//! 1. Static layer configuration and connections between them. Each layer takes data as input,
+//!    performs certain operation which produced the output. Layers can be combined into
+//!    hierarchical structures via special container layers. Configuration of each layer, as
+//!    well as connections between them is static, i.e. it cannot change in runtime. The shape
+//!    of data 'units' in inputs/outputs is also fixed, but the batch size (batch comprises
+//!    several data 'units') is not. Layer inputs/outputs as well as their connections is
+//!    captured in a `Descriptor`, which also contains information about learnable params.
+//! 2. Dynamic state representing (partial) data flow through the network. When doing the forward
+//!    pass through the net (converting inputs to outputs), all the intermediate state (data
+//!    buffers passed between layers) is saved in a `Context`, which allows reuse of this data
+//!    when doing the backpropagation step (intermediate data for backpropagation, for example,
+//!    gradients, is also saved in the `Context`). Context also defines the batch size for this
+//!    particular instance of exercising the network, which allows to use different batch sizes
+//!    for different use cases (for example simultaneous learning with batches and using the net
+//!    for producing outputs by setting batch size to 1).
+
+mod activation;
+mod common;
+mod config;
+mod container;
+mod context;
+mod descriptor;
+mod layer;
+mod loss;
+mod network;
+
+pub use config::*;
+pub use context::*;
+pub use descriptor::*;
+pub use layer::*;
+pub use network::*;
diff --git a/juice/src/net/network.rs b/juice/src/net/network.rs
new file mode 100644
index 000000000..c2a3ccedb
--- /dev/null
+++ b/juice/src/net/network.rs
@@ -0,0 +1,159 @@
+use std::fs;
+use std::io;
+use std::path;
+
+use crate::net_capnp::network as capnp_network;
+
+use crate::capnp_util::*;
+use crate::co::frameworks::native::get_native_backend;
+use crate::co::{IBackend, ITensorDesc, SharedTensor, TensorDesc};
+use crate::coblas::plugin::Copy;
+use crate::net::layer::Layer;
+use crate::net::{layer_from_config, Context, Descriptor, Inout, LayerConfig};
+use crate::util::LayerOps;
+
+// A trainable network.
+pub struct Network<B: IBackend + LayerOps<f32>> {
+    config: LayerConfig,
+    top: Box<dyn Layer<B>>,
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Network<B> {
+    /// Creates network from a config.
+    pub fn from_config(config: LayerConfig, input_shapes: &[TensorDesc]) -> Network<B> {
+        let inputs = input_shapes
+            .iter()
+            .enumerate()
+            .map(|(i, shape)| Inout::new_with_path(shape.clone(), &format!("net_in_{}", i)))
+            .collect();
+        let descriptor = Descriptor::top("net", inputs);
+        let top = layer_from_config(descriptor, &config);
+
+        Network {
+            config: config,
+            top: top,
+        }
+    }
+
+    /// Top level layer (typically a container layer).
+    pub fn top(&self) -> &dyn Layer<B> {
+        self.top.as_ref()
+    }
+
+    /// Mutable top level layer (typically a container layer).
+    pub fn top_mut(&mut self) -> &mut dyn Layer<B> {
+        self.top.as_mut()
+    }
+
+    /// Do a forward pass on the provided inputs and return the network output.
+    /// This function assumes the network has exactly one input and exactly one output
+    /// (will panic otherwise).
+    /// Input shape must be either [<top input shape>] or [N, <top input shape>]
+    /// (latter case for batched processing).
+    /// Returns a tensor of shape which is either [<top output shape>] or [N, <top output shape>],
+    /// depending on the input shape.
+    pub fn transform(&self, backend: &B, input: &SharedTensor<f32>) -> SharedTensor<f32> {
+        assert_eq!(self.top.descriptor().inputs().len(), 1);
+        assert_eq!(self.top.descriptor().outputs().len(), 1);
+
+        // Figure out batch size.
+        let net_input_size = self.top.descriptor().input(0).unit_shape().size();
+        let batch_size = if input.desc().size() == net_input_size {
+            1
+        } else {
+            assert!(input.desc().len() > 1);
+            let input_unit_size = input.desc().iter().skip(1).fold(1, |acc, i| acc * i);
+            assert_eq!(input_unit_size, net_input_size);
+            input.desc()[0]
+        };
+
+        let mut context = Context::new(batch_size);
+
+        // Copy input data into the context.
+        let context_inputs = context.acquire_data(self.top.descriptor().input(0));
+        assert_eq!(context_inputs.borrow().desc().size(), input.desc().size());
+        backend.copy(&input, &mut context_inputs.borrow_mut()).unwrap();
+
+        // Compute network output and take it out of the context as a return value.
+        self.top.compute_output(backend, &mut context);
+        context.take_data(self.top.descriptor().output(0))
+    }
+
+    pub fn save<P: AsRef<path::Path>>(&mut self, path: P) -> io::Result<()> {
+        let path = path.as_ref();
+        let ref mut out = fs::File::create(path)?;
+
+        let mut message = ::capnp::message::Builder::new_default();
+        {
+            let mut net_message = message.init_root::<capnp_network::Builder>();
+            self.write_capnp(&mut net_message);
+        }
+        ::capnp::serialize_packed::write_message(out, &message).unwrap();
+
+        Ok(())
+    }
+}
+
+impl<B: IBackend + LayerOps<f32> + 'static> Clone for Network<B> {
+    fn clone(&self) -> Network<B> {
+        let input_shapes: Vec<TensorDesc> = self
+            .top
+            .descriptor()
+            .inputs()
+            .iter()
+            .map(|input| input.unit_shape().clone())
+            .collect();
+        let net = Network::from_config(self.config.clone(), &input_shapes);
+
+        // Copy weights data.
+        let backend = get_native_backend();
+        assert_eq!(
+            self.top.descriptor().params().len(),
+            net.top.descriptor().params().len()
+        );
+        for i in 0..self.top.descriptor().params().len() {
+            let from_params = self.top.descriptor().params()[i].borrow();
+            let mut to_params = net.top.descriptor().params()[i].borrow_mut();
+            backend.copy(&from_params.data, &mut to_params.data).unwrap();
+            to_params.learning_rate = from_params.learning_rate;
+        }
+
+        net
+    }
+}
+
+impl<'a, B: IBackend + LayerOps<f32>> CapnpWrite<'a> for Network<B> {
+    type Builder = capnp_network::Builder<'a>;
+
+    /// Write the Layer into a capnp message.
+    fn write_capnp(&self, builder: &mut Self::Builder) {
+        // Write top-level cofnig.
+        {
+            // let mut config_message = builder.reborrow().init_config();
+            // self.config.write_capnp(&mut config_message);
+        }
+
+        // Write input shapes.
+        {
+            let input_shapes: Vec<TensorDesc> = self
+                .top
+                .descriptor()
+                .inputs()
+                .iter()
+                .map(|input| input.unit_shape().clone())
+                .collect();
+
+            let inputs_count = self.top.descriptor().inputs().len();
+            let mut inputs_message = builder.reborrow().init_inputs(input_shapes.len() as u32);
+            for i in 0..input_shapes.len() {
+                let mut input_message = inputs_message.reborrow().get(i as u32);
+                let mut vals = input_message
+                    .reborrow()
+                    .init_shape(input_shapes[i].len() as u32);
+                for j in 0..input_shapes[i].len() {
+                    vals.set(j as u32, input_shapes[i][j] as u64);
+                }
+            }
+        }
+    }
+}
diff --git a/juice/src/solver/mod.rs b/juice/src/solver/mod.rs
index 78af6986f..23994482e 100644
--- a/juice/src/solver/mod.rs
+++ b/juice/src/solver/mod.rs
@@ -6,79 +6,117 @@
 pub mod confusion_matrix;
 pub mod regression_evaluator;
 
+use std::cell::RefCell;
+
+use std::rc::Rc;
+
 pub use self::confusion_matrix::ConfusionMatrix;
 pub use self::regression_evaluator::{RegressionEvaluator, RegressionLoss};
 use crate::co::prelude::*;
-use crate::layer::*;
-use crate::layers::SequentialConfig;
+use crate::net::*;
 use crate::solvers::*;
-use std::marker::PhantomData;
 
 use crate::util::{ArcLock, LayerOps, SolverOps};
-use std::rc::Rc;
 
 #[derive(Debug)]
 /// Solver that optimizes a [Layer][1] with a given objective.
 /// [1]: ../layer/index.html
-pub struct Solver<SolverB, B>
+pub struct Solver<B>
 where
-    SolverB: IBackend + SolverOps<f32>,
     B: IBackend + LayerOps<f32>,
 {
-    net: Layer<B>,
-    objective: Layer<SolverB>,
+    backend: Rc<B>,
+    context: Context,
+
+    net: Box<dyn Layer<B>>,
+    objective: Box<dyn Layer<B>>,
     /// The implementation of the Solver
-    pub worker: Box<dyn ISolver<SolverB, B>>,
+    pub worker: Box<dyn ISolver<B, B>>,
 
     config: SolverConfig,
 
     /// The current iteration / number of times weights have been updated
     iter: usize,
-
-    solver_backend: PhantomData<SolverB>,
 }
 
-impl<SolverB, B> Solver<SolverB, B>
+impl<B> Solver<B>
 where
-    SolverB: IBackend + SolverOps<f32> + 'static,
-    B: IBackend + LayerOps<f32> + 'static,
+    B: IBackend + LayerOps<f32> + SolverOps<f32> + 'static,
 {
     /// Create Solver from [SolverConfig][1]
     /// [1]: ./struct.SolverConfig.html
     ///
     /// This is the **preferred method** to create a Solver for training a neural network.
-    pub fn from_config(net_backend: Rc<B>, obj_backend: Rc<SolverB>, config: &SolverConfig) -> Solver<SolverB, B> {
-        let network = Layer::from_config(net_backend, &config.network);
-        let mut worker = config.solver.with_config(obj_backend.clone(), &config);
-        worker.init(&network);
+    pub fn from_config(
+        backend: Rc<B>,
+        config: &SolverConfig,
+        input_shapes: &[TensorDesc],
+        label_shape: &TensorDesc,
+    ) -> Solver<B> {
+        let context = Context::new(config.minibatch_size);
+        let network = layer_from_config(
+            Descriptor::top(
+                "net",
+                input_shapes
+                    .iter()
+                    .enumerate()
+                    .map(|(i, shape)| Inout::new_with_path(shape.clone(), &format!("net_in_{}", i)))
+                    .collect(),
+            ),
+            &config.network,
+        );
+        assert_eq!(network.descriptor().outputs().len(), 1); // Net must have only one output.
+
+        let objective = layer_from_config(
+            Descriptor::top(
+                "loss",
+                vec![
+                    network.descriptor().output(0).clone(),
+                    Inout::new_with_path(label_shape.clone(), "labels"),
+                ],
+            ),
+            &config.objective,
+        );
+
+        let weight_shapes: Vec<TensorDesc> = network
+            .descriptor()
+            .params()
+            .iter()
+            .map(|w| w.borrow().data.desc().clone())
+            .collect();
+        let mut worker = config.solver.with_config(backend.clone(), &config);
+        worker.init(&weight_shapes);
+
+        // Loss layer cannot have params (no one will train them!).
+        assert!(objective.descriptor().params().is_empty());
 
         Solver {
+            backend: backend,
+            context: context,
             worker: worker,
             net: network,
-            objective: Layer::from_config(obj_backend, &config.objective),
+            objective: objective,
             iter: 0,
-
             config: config.clone(),
-            solver_backend: PhantomData::<SolverB>,
         }
     }
 }
 
-impl<SolverB, B> Solver<SolverB, B>
+impl<B> Solver<B>
 where
-    SolverB: IBackend + SolverOps<f32> + 'static,
-    B: IBackend + LayerOps<f32> + 'static,
+    B: IBackend + LayerOps<f32> + SolverOps<f32> + 'static,
 {
-    fn init(&mut self, backend: Rc<B>) {
+    fn init(&mut self, backend: Rc<B>, input_shapes: &[TensorDesc]) {
         info!("Initializing solver from configuration");
 
-        let mut config = self.config.clone();
-        self.init_net(backend, &mut config);
+        let config = self.config.clone();
+        self.init_net(backend, &config, input_shapes);
     }
 
     /// Initialize the training net
-    fn init_net(&mut self, backend: Rc<B>, param: &mut SolverConfig) {
-        self.net = Layer::from_config(backend, &param.network);
+    fn init_net(&mut self, backend: Rc<B>, param: &SolverConfig, input_shapes: &[TensorDesc]) {
+        unimplemented!();
+        //self.net = layer_from_config(&*backend, &param.network, input_shapes);
     }
 
     /// Train the network with one minibatch
@@ -86,38 +124,79 @@ where
         &mut self,
         mb_data: ArcLock<SharedTensor<f32>>,
         mb_target: ArcLock<SharedTensor<f32>>,
-    ) -> ArcLock<SharedTensor<f32>> {
-        // forward through network and classifier
-        let network_out = self.net.forward(&[mb_data])[0].clone();
-        let _ = self.objective.forward(&[network_out.clone(), mb_target]);
-
-        // forward through network and classifier
-        let classifier_gradient = self.objective.backward(&[]);
-        self.net.backward(&classifier_gradient[0..1]);
+    ) -> SharedTensor<f32> {
+        // Copy intput data into the network context.
+        let data = self.context.acquire_data(self.net.descriptor().input(0));
+        self.backend
+            .copy(&mb_data.read().unwrap(), &mut data.borrow_mut())
+            .unwrap();
+        let labels = self
+            .context
+            .acquire_data(self.objective.descriptor().input(1));
+        self.backend
+            .copy(&mb_target.read().unwrap(), &mut labels.borrow_mut())
+            .unwrap();
+
+        // Compute network output and the loss.
+        self.net.compute_output(&*self.backend, &mut self.context);
+        self.objective
+            .compute_output(&*self.backend, &mut self.context);
+
+        // Compute params gradients by doing a backpropagation on the network.
+        self.objective
+            .compute_gradients(&*self.backend, &mut self.context);
+        self.net
+            .compute_gradients(&*self.backend, &mut self.context);
+
+        // Let the solver worker adjust the params gradients before applying them to params.
+        let params: Vec<LearnableParamsLink> =
+            self.net.descriptor().params().iter().cloned().collect();
+        let params_gradients: Vec<(Rc<RefCell<SharedTensor<f32>>>, f32)> = params
+            .iter()
+            .map(|p| {
+                (
+                    self.context.get_params_gradient(p),
+                    p.borrow().learning_rate,
+                )
+            })
+            .collect();
+        self.worker
+            .compute_update(&self.config, &params_gradients, self.iter);
+
+        // Finally apply the weight change.
+        let shared_a = crate::util::native_scalar(-1f32);
+        for i in 0..self.net.descriptor().params().len() {
+            let gradient = &params_gradients[i].0.borrow();
+            let params = &mut self.net.descriptor_mut().param(i).borrow_mut().data;
+            self.backend.axpy(&shared_a, gradient, params).unwrap();
+        }
 
-        self.worker.compute_update(&self.config, &mut self.net, self.iter);
-        self.net.update_weights(self.worker.backend());
         self.iter += 1;
 
+        let out_buffer = self.context.get_data(self.net.descriptor().output(0));
+        let mut network_out = SharedTensor::<f32>::new(out_buffer.borrow().desc());
+        self.backend
+            .copy(&out_buffer.borrow(), &mut network_out)
+            .unwrap();
         network_out
     }
 
-    /// Returns the network trained by the solver.
-    ///
-    /// This is the recommended method to get a usable trained network.
-    pub fn network(&self) -> &Layer<B> {
-        &self.net
-    }
-
-    /// Returns the network trained by the solver.
-    ///
-    /// This is the recommended method to get a trained network,
-    /// if you want to alter the network. Keep in mind that altering the network
-    /// might render the solver unusable and continuing training the network with it will yield
-    /// unexpected results.
-    pub fn mut_network(&mut self) -> &mut Layer<B> {
-        &mut self.net
-    }
+    // /// Returns the network trained by the solver.
+    // ///
+    // /// This is the recommended method to get a usable trained network.
+    // pub fn network(&self) -> &Layer<B> {
+    //     &self.net
+    // }
+
+    // /// Returns the network trained by the solver.
+    // ///
+    // /// This is the recommended method to get a trained network,
+    // /// if you want to alter the network. Keep in mind that altering the network
+    // /// might render the solver unusable and continuing training the network with it will yield
+    // /// unexpected results.
+    // pub fn mut_network(&mut self) -> &mut Layer<B> {
+    //     &mut self.net
+    // }
 }
 
 /// Implementation of a specific Solver.
@@ -130,7 +209,7 @@ where
     SolverB: IBackend + SolverOps<f32>,
 {
     /// Initialize the solver, setting up any network related data.
-    fn init(&mut self, net: &Layer<B>) {}
+    fn init(&mut self, weight_shapes: &[TensorDesc]) {}
 
     /// Update the weights of the net with part of the gradient.
     ///
@@ -143,7 +222,12 @@ where
     /// Used by [step][2] to optimize the network.
     ///
     /// [2]: ./struct.Solver.html#method.step
-    fn compute_update(&mut self, param: &SolverConfig, network: &mut Layer<B>, iter: usize);
+    fn compute_update(
+        &mut self,
+        param: &SolverConfig,
+        weight_gradients: &[(Rc<RefCell<SharedTensor<f32>>>, f32)],
+        iter: usize,
+    );
 
     /// Returns the backend used by the solver.
     fn backend(&self) -> &SolverB;
@@ -240,8 +324,8 @@ impl Default for SolverConfig {
     fn default() -> SolverConfig {
         SolverConfig {
             name: "".to_owned(),
-            network: LayerConfig::new("default", SequentialConfig::default()),
-            objective: LayerConfig::new("default", SequentialConfig::default()),
+            network: LayerConfig::default(),
+            objective: LayerConfig::default(),
             solver: SolverKind::SGD(SGDKind::Momentum),
 
             minibatch_size: 1,
@@ -354,7 +438,10 @@ pub enum SolverKind {
 
 impl SolverKind {
     /// Create a Solver of the specified kind with the supplied SolverConfig.
-    pub fn with_config<B: IBackend + SolverOps<f32> + 'static, NetB: IBackend + LayerOps<f32> + 'static>(
+    pub fn with_config<
+        B: IBackend + SolverOps<f32> + 'static,
+        NetB: IBackend + LayerOps<f32> + 'static,
+    >(
         &self,
         backend: Rc<B>,
         config: &SolverConfig,
@@ -375,7 +462,10 @@ pub enum SGDKind {
 
 impl SGDKind {
     /// Create a Solver of the specified kind with the supplied SolverConfig.
-    pub fn with_config<B: IBackend + SolverOps<f32> + 'static, NetB: IBackend + LayerOps<f32> + 'static>(
+    pub fn with_config<
+        B: IBackend + SolverOps<f32> + 'static,
+        NetB: IBackend + LayerOps<f32> + 'static,
+    >(
         &self,
         backend: Rc<B>,
         config: &SolverConfig,
diff --git a/juice/src/solvers/mod.rs b/juice/src/solvers/mod.rs
index 08ab06916..cad708159 100644
--- a/juice/src/solvers/mod.rs
+++ b/juice/src/solvers/mod.rs
@@ -27,12 +27,14 @@
 //! [minimum]: http://mathworld.wolfram.com/GlobalMinimum.html
 //! [backprop]: https://en.wikipedia.org/wiki/Backpropagation
 
+use std::rc::Rc;
+use std::cell::RefCell;
+
 #[allow(unused_import_braces)]
 pub use self::sgd::Momentum;
 pub mod sgd;
 
 use crate::co::{IBackend, SharedTensor};
-use crate::layer::*;
 use crate::solver::*;
 use crate::util::*;
 
@@ -40,10 +42,10 @@ trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f3
     fn compute_update_value(
         &mut self,
         config: &SolverConfig,
-        weight_blob: &ArcLock<SharedTensor<f32>>,
+        weight_gradient: &mut SharedTensor<f32>,
         history_blob_id: usize,
-        global_lr: &f32,
-        blob_lr: &f32,
+        global_lr: f32,
+        blob_lr: f32,
     );
 
     /// [Clip gradients][1] when they exceed [SolverConfig.clip_gradients][2].
@@ -59,20 +61,19 @@ trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f3
     /// [3]: https://en.wikipedia.org/wiki/Recurrent_neural_network
     /// [4]: https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
     #[allow(unused_must_use)]
-    fn clip_gradients<B: IBackend + LayerOps<f32> + 'static>(&self, config: &SolverConfig, net: &mut Layer<B>) {
+    fn clip_gradients(&self, config: &SolverConfig, weight_gradients: &[Rc<RefCell<SharedTensor<f32>>>]) {
         // skip clipping gradients if SolverConfig.clip_gradients is set to None
         if let Some(clip_threshold) = config.clip_gradients {
             let native = native_backend();
 
-            let net_gradients = net.learnable_weights_gradients();
             let mut sumsq_diff = 0f32;
             let backend = self.backend();
-            for net_gradient in net_gradients.clone() {
-                let gradient = net_gradient.read().unwrap();
+            for net_gradient in weight_gradients {
+                let gradient = &net_gradient.borrow();
                 // PERF: preallocate tensor once
                 let mut result = SharedTensor::new(&[1]);
                 // gradient.sumsq_diff(self.backend(), &mut result);
-                self.backend().dot(&gradient, &gradient, &mut result);
+                self.backend().dot(gradient, gradient, &mut result);
 
                 let sumsq_diff_slice = result.read(native.device()).unwrap().as_slice::<f32>();
                 sumsq_diff += sumsq_diff_slice[0];
@@ -88,8 +89,8 @@ trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f3
 
                 let mut scale_shared = native_scalar(scale_factor);
 
-                for weight_gradient in net_gradients {
-                    let mut gradient = weight_gradient.write().unwrap();
+                for weight_gradient in weight_gradients {
+                    let mut gradient = weight_gradient.borrow_mut();
                     backend.scal(&mut scale_shared, &mut gradient);
                 }
             }
@@ -102,15 +103,14 @@ trait SGDSolver<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f3
     /// To counteract that we are accumulating the gradients over multiple samples,
     /// we need to scale the gradients down to the equivalent of a single sample.</br>
     /// E.g. with a `minibatch_size` of 4 we need to scale the gradient by 0.25 (= 1/4).
-    fn normalize(&self, config: &SolverConfig, weight_blob: &ArcLock<SharedTensor<f32>>) {
+    fn normalize(&self, config: &SolverConfig, weight_blob: &mut SharedTensor<f32>) {
         if config.minibatch_size > 1 {
             let scale_factor = 1f32 / config.minibatch_size as f32;
-            let mut gradient = weight_blob.write().unwrap();
             let native = native_backend();
 
             let mut scale_factor_shared = native_scalar(scale_factor);
             // self.backend().scal_plain(&scale_factor_shared, &mut gradient).unwrap();
-            self.backend().scal(&mut scale_factor_shared, &mut gradient).unwrap();
+            self.backend().scal(&mut scale_factor_shared, weight_blob).unwrap();
         }
     }
 
diff --git a/juice/src/solvers/sgd/mod.rs b/juice/src/solvers/sgd/mod.rs
index e39f22f53..cd852b536 100644
--- a/juice/src/solvers/sgd/mod.rs
+++ b/juice/src/solvers/sgd/mod.rs
@@ -17,7 +17,6 @@
 //!
 //! [backprop]: https://en.wikipedia.org/wiki/Backpropagation
 //! [gd]: https://en.wikipedia.org/wiki/Gradient_descent
-
 pub use self::momentum::Momentum;
 
 /// Implement [ISolver][1] for [SGD solvers][2].
@@ -26,39 +25,49 @@ pub use self::momentum::Momentum;
 #[macro_export]
 macro_rules! impl_isolver_sgd {
     ($t:ty) => {
-        impl<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> ISolver<SolverB, NetB>
-            for $t
+        impl<SolverB: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static>
+            ISolver<SolverB, NetB> for $t
         {
             /// Initialize the SGD Momentum solver, allocating memory for its history.
-            fn init(&mut self, net: &Layer<NetB>) {
-                self.history = Vec::with_capacity(net.learnable_weights_gradients().len());
+            fn init(&mut self, weight_shapes: &[TensorDesc]) {
+                self.history = Vec::with_capacity(weight_shapes.len());
 
-                for weight_gradient in net.learnable_weights_gradients() {
-                    let shape = weight_gradient.read().unwrap().desc().clone();
-                    let mut tensor = SharedTensor::new(&shape);
+                for shape in weight_shapes {
+                    let mut tensor = SharedTensor::new(shape);
 
                     let filler = crate::weight::FillerType::Constant { value: 0f32 };
                     filler.fill(&mut tensor);
 
-                    let history_tensor = Arc::new(RwLock::new(tensor));
-                    self.history.push(history_tensor);
+                    self.history.push(tensor);
                 }
             }
 
-            fn compute_update(&mut self, config: &SolverConfig, net: &mut Layer<NetB>, iter: usize) {
+            fn compute_update(
+                &mut self,
+                config: &SolverConfig,
+                weight_gradients: &[(Rc<RefCell<SharedTensor<f32>>>, f32)],
+                iter: usize,
+            ) {
                 let rate = config.get_learning_rate(iter);
 
-                SGDSolver::<SolverB, NetB>::clip_gradients(self, config, net);
-                for (weight_id, weight_gradient) in net.learnable_weights_gradients().iter().enumerate() {
-                    SGDSolver::<SolverB, NetB>::normalize(self, config, weight_gradient);
+                SGDSolver::<SolverB, NetB>::clip_gradients(
+                    self,
+                    config,
+                    &weight_gradients.iter().map(|(t, r)| t.clone()).collect::<Vec<_>>(),
+                );
+                for (weight_id, (weights_data, learning_rate)) in
+                    weight_gradients.iter().enumerate()
+                {
+                    SGDSolver::<SolverB, NetB>::normalize(self, config, &mut weights_data.borrow_mut());
+
                     // SGDSolver::<SolverB, NetB>::regularize(self, config, weight_gradient, net.weights_weight_decay()[weight_id]);
                     SGDSolver::<SolverB, NetB>::compute_update_value(
                         self,
                         config,
-                        weight_gradient,
+                        &mut weights_data.borrow_mut(),
                         weight_id,
-                        &rate,
-                        &net.learnable_weights_lr()[weight_id].unwrap(),
+                        rate,
+                        *learning_rate,
                     );
                 }
             }
diff --git a/juice/src/solvers/sgd/momentum.rs b/juice/src/solvers/sgd/momentum.rs
index ab396091b..767cd9247 100644
--- a/juice/src/solvers/sgd/momentum.rs
+++ b/juice/src/solvers/sgd/momentum.rs
@@ -14,12 +14,12 @@
 //! It also makes solving more stable.
 
 use crate::co::prelude::*;
-use crate::layer::*;
+
 use crate::solver::*;
 use crate::solvers::SGDSolver;
 use crate::util::*;
 use std::rc::Rc;
-use std::sync::{Arc, RwLock};
+use std::cell::RefCell;
 
 #[derive(Debug)]
 /// Stochastic Gradient Descent with Momentum.
@@ -28,7 +28,7 @@ use std::sync::{Arc, RwLock};
 /// [1]: ./index.html
 pub struct Momentum<SolverB: IBackend + SolverOps<f32>> {
     /// The gradient update from the previous iteration for each blob.
-    history: Vec<ArcLock<SharedTensor<f32>>>,
+    history: Vec<SharedTensor<f32>>,
     /// The backend used for computing the gradient.
     backend: Rc<SolverB>,
 
@@ -56,14 +56,16 @@ impl<SolverB: IBackend + SolverOps<f32>> Momentum<SolverB> {
     }
 }
 
-impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGDSolver<B, NetB> for Momentum<B> {
+impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGDSolver<B, NetB>
+    for Momentum<B>
+{
     fn compute_update_value(
         &mut self,
         config: &SolverConfig,
-        weight_gradient: &ArcLock<SharedTensor<f32>>,
+        weight_gradient: &mut SharedTensor<f32>,
         history_blob_id: usize,
-        global_lr: &f32,
-        blob_lr: &f32,
+        global_lr: f32,
+        blob_lr: f32,
     ) {
         // PERF: check if value is changed before writing it
         crate::weight::FillerType::Constant {
@@ -71,24 +73,25 @@ impl<B: IBackend + SolverOps<f32>, NetB: IBackend + LayerOps<f32> + 'static> SGD
         }
         .fill(&mut self.lr);
 
-        crate::weight::FillerType::Constant { value: config.momentum }.fill(&mut self.momentum);
+        crate::weight::FillerType::Constant {
+            value: config.momentum,
+        }
+        .fill(&mut self.momentum);
 
-        let backend = ISolver::<B, NetB>::backend(self);
+        let backend : &B = &self.backend;
         let device = IBackend::device(backend);
 
-        let history_blob = &self.history[history_blob_id];
+        let history_blob = &mut self.history[history_blob_id];
         Axpby::axpby(
             backend,
             &self.lr,
-            &weight_gradient.read().unwrap(),
+            weight_gradient,
             &self.momentum,
-            &mut history_blob.write().unwrap(),
+            history_blob,
         )
         .unwrap();
 
-        backend
-            .copy(&history_blob.read().unwrap(), &mut weight_gradient.write().unwrap())
-            .unwrap();
+        backend.copy(history_blob, weight_gradient).unwrap();
     }
 }
 
diff --git a/juice/src/train/mod.rs b/juice/src/train/mod.rs
new file mode 100644
index 000000000..c07e5d40f
--- /dev/null
+++ b/juice/src/train/mod.rs
@@ -0,0 +1,5 @@
+mod trainer;
+mod optimizer;
+
+pub use optimizer::*;
+pub use trainer::*;
\ No newline at end of file
diff --git a/juice/src/train/optimizer/adam.rs b/juice/src/train/optimizer/adam.rs
new file mode 100644
index 000000000..10da716c1
--- /dev/null
+++ b/juice/src/train/optimizer/adam.rs
@@ -0,0 +1,140 @@
+//! Adam optimizer.
+//! Computes the update Vᵢ from params gradient ∇ᵢ as:
+//!   Mᵢ = β₁Mᵢ₋₁ + (1-β₁)∇ᵢ,
+//!   Sᵢ = β₂Sᵢ₋₁ + (1-β₂)∇ᵢ⊙∇ᵢ,
+//!   M₀ = 0,
+//!   S₀ = 0,
+//!   M̂ᵢ = Mᵢ/(1-β₁ᵗ),
+//!   Ŝᵢ = Sᵢ/(1-β₂ᵗ),
+//!   Vᵢ = M̂ᵢ⊘(√Ŝᵢ+ε),
+//! where:
+//!   ⊙ - pointwise multiplication,
+//!   ⊘ - pointwise division,
+//!   β₁, β₂ - averaging parameters (typically set to 0.9 and 0.999 respectively),
+//!   ε - small constant to prevent division by zero (typically 1e-8).
+//!
+//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
+//! learning rates.)
+
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::rc::Rc;
+
+use crate::train::Optimizer;
+use crate::util::native_backend;
+use crate::weight::FillerType;
+use co::prelude::*;
+
+#[derive(Clone, Debug)]
+pub struct AdamConfig {
+    pub beta1: f32,
+    pub beta2: f32,
+    pub epsilon: f32,
+}
+
+pub struct Adam {
+    // First gradient moment (Mᵢ).
+    first_moments: HashMap<usize, SharedTensor<f32>>,
+    // Second gradient moment (Sᵢ).
+    second_moments: HashMap<usize, SharedTensor<f32>>,
+
+    // Original β₁ as well as raised to t-th power (β₁ᵗ).
+    beta1: f32,
+    beta1_nth: f32,
+    // Original β₂ as well as raised to t-th power (β₂ᵗ).
+    beta2: f32,
+    beta2_nth: f32,
+
+    epsilon: f32,
+}
+
+impl Default for AdamConfig {
+    fn default() -> Self {
+        AdamConfig {
+            beta1: 0.9,
+            beta2: 0.999,
+            epsilon: 1.0e-8,
+        }
+    }
+}
+
+impl Adam {
+    pub fn new(config: &AdamConfig) -> Self {
+        Adam {
+            first_moments: HashMap::new(),
+            second_moments: HashMap::new(),
+            beta1: config.beta1,
+            beta1_nth: config.beta1,
+            beta2: config.beta2,
+            beta2_nth: config.beta2,
+            epsilon: config.epsilon,
+        }
+    }
+}
+
+// TODO: Rewrite with backend ops (requires element-wise square and square root support).
+impl<B: IBackend> Optimizer<B> for Adam {
+    fn adjust_weight_change(
+        &mut self,
+        backend: &B,
+        weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+    ) {
+        let native = native_backend();
+
+        for (key, change) in weight_changes {
+            let mut change_ref = change.borrow_mut();
+
+            let first_moment = self.first_moments.entry(*key).or_insert_with(|| {
+                let mut tensor = SharedTensor::new(change_ref.desc());
+                FillerType::fill_constant(&mut tensor, 0.0);
+                tensor
+            });
+            let second_moment = self.second_moments.entry(*key).or_insert_with(|| {
+                let mut tensor = SharedTensor::new(change_ref.desc());
+                FillerType::fill_constant(&mut tensor, 0.0);
+                tensor
+            });
+
+            let len = change_ref.desc().size();
+
+            let change_slice = change_ref
+                .read_write(native.device())
+                .unwrap()
+                .as_mut_slice::<f32>();
+            let first_moment_slice = first_moment
+                .read_write(native.device())
+                .unwrap()
+                .as_mut_slice::<f32>();
+            let second_moment_slice = second_moment
+                .read_write(native.device())
+                .unwrap()
+                .as_mut_slice::<f32>();
+
+            // We can rewrite the matrix equations at the top of this file in a element-wise form:
+            //   Mᵢ[j] = β₁Mᵢ₋₁[j] + (1-β₁)∇ᵢ[j]
+            //   Sᵢ[j] = β₂Sᵢ₋₁[j] + (1-β₂)∇ᵢ[j]²
+            //   Vᵢ[j] = Mᵢ[j] / ((1-β₁ᵗ)•√(Sᵢ[j]/(1-β₂ᵗ) + ε)
+            for j in 0..len {
+                // ∇ᵢ[j].
+                let w = change_slice[j];
+                // Mᵢ[j], M̂ᵢ[j].
+                let m = self.beta1 * first_moment_slice[j] + (1.0 - self.beta1) * w;
+                let m_hat = m / (1.0 - self.beta1_nth);
+                // Sᵢ[j], Ŝᵢ[j].
+                let s = self.beta2 * second_moment_slice[j] + (1.0 - self.beta2) * w * w;
+                let s_hat = s / (1.0 - self.beta2_nth);
+                // Vᵢ[j].
+                let v = m_hat / (s_hat.sqrt() + self.epsilon);
+                
+                assert!(!v.is_nan());
+
+                change_slice[j] = v;
+                first_moment_slice[j] = m;
+                second_moment_slice[j] = s;
+            }
+        }
+
+        self.beta1_nth *= self.beta1;
+        self.beta2_nth *= self.beta2;
+    }
+}
diff --git a/juice/src/train/optimizer/mod.rs b/juice/src/train/optimizer/mod.rs
new file mode 100644
index 000000000..4b5d981c1
--- /dev/null
+++ b/juice/src/train/optimizer/mod.rs
@@ -0,0 +1,48 @@
+mod adam;
+mod sgd_momentum;
+
+use std::rc::Rc;
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::default::Default;
+
+use crate::coblas::plugin::Copy;
+use co::prelude::*;
+use crate::util::Axpby;
+
+use adam::Adam;
+use sgd_momentum::SgdWithMomentum;
+
+// Expose configs publicly.
+pub use adam::AdamConfig;
+pub use sgd_momentum::SgdWithMomentumConfig;
+
+// A gradient descent optimization algorithm.
+pub trait Optimizer<B: IBackend> {
+    // Called on each minibatch training cycle. Takes all weight gradients computed during
+    // backpropagation (indexed by an opaque key which is guaranteed to be stable for the
+    // duration of the program).
+    // Modifies the changes in-place; modified changes will then be applied to the weights:
+    //   W = W - α•change,
+    // where α is the learning rate (combined from global and param-specific rates).
+    fn adjust_weight_change(&mut self, backend: &B, weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>);
+}
+
+#[derive(Clone, Debug)]
+pub enum OptimizerConfig {
+    SgdWithMomentum(SgdWithMomentumConfig),
+    Adam(AdamConfig),
+}
+
+impl Default for OptimizerConfig {
+    fn default() -> Self {
+        OptimizerConfig::SgdWithMomentum(Default::default())
+    }
+}
+
+pub fn optimizer_from_config<B: IBackend + Axpby<f32> + Copy<f32>>(config: &OptimizerConfig) -> Box<dyn Optimizer<B>> {
+    match config {
+        OptimizerConfig::SgdWithMomentum(sgd_config) => Box::new(SgdWithMomentum::new(sgd_config)),
+        OptimizerConfig::Adam(adam_config) => Box::new(Adam::new(adam_config)),
+    }
+}
\ No newline at end of file
diff --git a/juice/src/train/optimizer/sgd_momentum.rs b/juice/src/train/optimizer/sgd_momentum.rs
new file mode 100644
index 000000000..a73f39087
--- /dev/null
+++ b/juice/src/train/optimizer/sgd_momentum.rs
@@ -0,0 +1,83 @@
+//! SGD with momentum.
+//! Computes the update Vᵢ from params gradient ∇ᵢ as:
+//!   Vᵢ = (1-β)Vᵢ₋₁ + β∇ᵢ,
+//!   V₀ = 0,
+//! where:
+//!   β is the momentum parameter (typically set to 0.1).
+//! 
+//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
+//! learning rates.)
+
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::rc::Rc;
+
+use crate::coblas::plugin::Copy;
+use crate::train::Optimizer;
+use crate::util::{native_scalar, Axpby};
+use crate::weight::FillerType;
+use co::prelude::*;
+
+#[derive(Clone, Debug)]
+pub struct SgdWithMomentumConfig {
+    pub momentum: f32,
+}
+
+pub struct SgdWithMomentum {
+    history: HashMap<usize, SharedTensor<f32>>,
+    // Precomputed tensor constants.
+    zero: SharedTensor<f32>,
+    momentum: SharedTensor<f32>,
+    one_minus_momentum: SharedTensor<f32>,
+}
+
+impl Default for SgdWithMomentumConfig {
+    fn default() -> Self {
+        SgdWithMomentumConfig { momentum: 0.1 }
+    }
+}
+
+impl SgdWithMomentum {
+    pub fn new(config: &SgdWithMomentumConfig) -> Self {
+        SgdWithMomentum {
+            history: HashMap::new(),
+            zero: native_scalar(0.0),
+            momentum: native_scalar(config.momentum),
+            one_minus_momentum: native_scalar(1.0 - config.momentum),
+        }
+    }
+}
+
+impl<B: IBackend + Axpby<f32> + Copy<f32>> Optimizer<B> for SgdWithMomentum {
+    fn adjust_weight_change(
+        &mut self,
+        backend: &B,
+        weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
+    ) {
+        for (key, change) in weight_changes {
+            let mut change_ref = change.borrow_mut();
+
+            let history = self.history.entry(*key).or_insert_with(|| {
+                let mut tensor = SharedTensor::new(change_ref.desc());
+                FillerType::fill_constant(&mut tensor, 0.0);
+                tensor
+            });
+
+            // Make sure the params shape didn't change under us.
+            assert_eq!(history.desc().size(), change_ref.desc().size());
+
+            // Compute Vᵢ=(1-β)Vᵢ₋₁ + β∇.
+            backend
+                .axpby(
+                    &self.momentum,
+                    &change_ref,
+                    &self.one_minus_momentum,
+                    history,
+                )
+                .unwrap();
+
+            // Copy Vᵢ to the weight change which should hold the return value.
+            backend.copy(history, &mut change_ref).unwrap();
+        }
+    }
+}
diff --git a/juice/src/train/trainer.rs b/juice/src/train/trainer.rs
new file mode 100644
index 000000000..84746f467
--- /dev/null
+++ b/juice/src/train/trainer.rs
@@ -0,0 +1,168 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::rc::Rc;
+
+use crate::co::prelude::*;
+use crate::net::*;
+use crate::train::{optimizer_from_config, Optimizer, OptimizerConfig, SgdWithMomentumConfig};
+use crate::util::{format_tensor, SolverOps};
+
+#[derive(Clone)]
+pub struct TrainerConfig {
+    pub batch_size: usize,
+    pub objective: LayerConfig,
+    pub optimizer: OptimizerConfig,
+    pub learning_rate: f32,
+}
+
+/// Trains a network through minibatch backpropagation.
+///
+/// Trains a network doing backpropagation from a configured output. For multi-output
+/// networks, several Trainers can be constructed (each with its own loss function)
+/// to perform asynchronous training.
+/// Doesn't own the network.
+pub struct Trainer<B: IBackend> {
+    config: TrainerConfig,
+
+    // Objective (loss) function used for backpropagation.
+    objective: Box<dyn Layer<B>>,
+
+    optimizer: Box<dyn Optimizer<B>>,
+
+    iter: usize,
+}
+
+fn key_from_rc<T>(rc: &Rc<RefCell<T>>) -> usize {
+    Rc::as_ptr(rc) as usize
+}
+
+impl Default for TrainerConfig {
+    fn default() -> Self {
+        TrainerConfig {
+            batch_size: 32,
+            objective: LayerConfig::MeanSquaredError,
+            optimizer: OptimizerConfig::SgdWithMomentum(SgdWithMomentumConfig::default()),
+            learning_rate: 0.001,
+        }
+    }
+}
+
+impl<B: IBackend + SolverOps<f32> + 'static> Trainer<B> {
+    pub fn from_config(
+        backend: &B,
+        config: &TrainerConfig,
+        net: &Network<B>,
+        label_shape: &TensorDesc,
+    ) -> Self {
+        // Create objective.
+        let objective_descriptor = Descriptor::top(
+            "loss",
+            vec![
+                net.top().descriptor().output(0).clone(),
+                Inout::new_with_path(label_shape.clone(), "labels"),
+            ],
+        );
+        let objective = layer_from_config(objective_descriptor, &config.objective);
+        let optimizer = optimizer_from_config(&config.optimizer);
+
+        Trainer {
+            config: (*config).clone(),
+            objective: objective,
+            optimizer: optimizer,
+            iter: 0,
+        }
+    }
+
+    pub fn train_minibatch(
+        &mut self,
+        backend: &B,
+        net: &mut Network<B>,
+        inputs: &SharedTensor<f32>,
+        labels: &SharedTensor<f32>,
+    ) -> SharedTensor<f32> {
+        trace!("Inputs:\n{}", format_tensor(inputs));
+        trace!("Labels:\n{}", format_tensor(labels));
+
+        let batch_size = inputs.desc()[0];
+        assert_eq!(batch_size, labels.desc()[0]);
+
+        let mut context = Context::new(batch_size);
+
+        // Copy intput and label data into the context.
+        // Copy inputs.
+        let context_inputs = context.acquire_data(net.top().descriptor().input(0));
+        assert_eq!(context_inputs.borrow().desc().size(), inputs.desc().size());
+        backend
+            .copy(&inputs, &mut context_inputs.borrow_mut())
+            .unwrap();
+        // Copy labels.
+        let context_labels = context.acquire_data(self.objective.descriptor().input(1));
+        backend
+            .copy(&labels, &mut context_labels.borrow_mut())
+            .unwrap();
+
+        // Compute network output and the loss.
+        net.top().compute_output(backend, &mut context);
+        self.objective.compute_output(backend, &mut context);
+
+        // Compute params gradients by doing a backpropagation on the network.
+        self.objective.compute_gradients(backend, &mut context);
+        trace!(
+            "Loss gradient:\n{}",
+            format_tensor(
+                &context
+                    .get_data_gradient(self.objective.descriptor().input(0))
+                    .borrow()
+            )
+        );
+        net.top().compute_gradients(backend, &mut context);
+
+        // Collect computed gradients.
+        let params_gradients: HashMap<usize, Rc<RefCell<SharedTensor<f32>>>> = net
+            .top()
+            .descriptor()
+            .params()
+            .iter()
+            .map(|p| {
+                (
+                    key_from_rc(p),
+                    // Use acquire* instead of get* to create missing gradients. This is necessary
+                    // because backpropagation might not reach all layers. Created gradients
+                    // are zero-filled and thus will not change weights when applied.
+                    context.acquire_params_gradient(p),
+                )
+            })
+            .collect();
+
+        // Let the optimizer adjust the params gradients before applying them to params.
+        self.optimizer
+            .adjust_weight_change(backend, &params_gradients);
+
+        trace!("Gradient after worker:");
+        params_gradients.iter().for_each(|(p, r)| {
+            trace!("{:?}: \n{}", p, format_tensor(&r.borrow()));
+        });
+
+        // Finally apply the weight change.
+        net.top().descriptor().params().iter().for_each(|p| {
+            let key = key_from_rc(p);
+
+            let mut params = p.borrow_mut();
+            let change = params_gradients.get(&key).unwrap().borrow();
+
+            // When applying the (optimized) gradient, additionally:
+            // 1. Normalize for batch size (multiply by 1/batch_size).
+            // 2. Multiply by individual params learning rate.
+            // 3. Multiply by global learning rate.
+            let a = crate::util::native_scalar(
+                -params.learning_rate * self.config.learning_rate / (self.config.batch_size as f32),
+            );
+
+            backend.axpy(&a, &change, &mut params.data).unwrap();
+        });
+
+        self.iter += 1;
+
+        context.take_data(net.top().descriptor().output(0))
+    }
+}
diff --git a/juice/src/util.rs b/juice/src/util.rs
index 581fb9f58..7b51c24ea 100644
--- a/juice/src/util.rs
+++ b/juice/src/util.rs
@@ -26,7 +26,11 @@ pub fn write_to_memory<T: NumCast + ::std::marker::Copy>(mem: &mut FlatBox, data
 }
 
 /// Write into a native Coaster Memory with a offset.
-pub fn write_to_memory_offset<T: NumCast + ::std::marker::Copy>(mem: &mut FlatBox, data: &[T], offset: usize) {
+pub fn write_to_memory_offset<T: NumCast + ::std::marker::Copy>(
+    mem: &mut FlatBox,
+    data: &[T],
+    offset: usize,
+) {
     let mem_buffer = mem.as_mut_slice::<f32>();
     for (index, datum) in data.iter().enumerate() {
         // mem_buffer[index + offset] = *datum;
@@ -41,7 +45,11 @@ pub fn write_to_memory_offset<T: NumCast + ::std::marker::Copy>(mem: &mut FlatBo
 /// is assumed to be the batchsize.
 ///
 /// Allocates memory on a Native Backend if neccessary.
-pub fn write_batch_sample<T: NumCast + ::std::marker::Copy>(tensor: &mut SharedTensor<f32>, data: &[T], i: usize) {
+pub fn write_batch_sample<T: NumCast + ::std::marker::Copy>(
+    tensor: &mut SharedTensor<f32>,
+    data: &[T],
+    i: usize,
+) {
     let native_backend = native_backend();
     let tensor_desc = tensor.desc();
     let batch_size = tensor_desc[0];
@@ -59,7 +67,10 @@ pub fn write_batch_sample<T: NumCast + ::std::marker::Copy>(tensor: &mut SharedT
 pub fn native_scalar<T: NumCast + ::std::marker::Copy>(scalar: T) -> SharedTensor<T> {
     let native = native_backend();
     let mut shared_scalar = SharedTensor::<T>::new(&[1]);
-    write_to_memory(shared_scalar.write_only(native.device()).unwrap(), &[scalar]);
+    write_to_memory(
+        shared_scalar.write_only(native.device()).unwrap(),
+        &[scalar],
+    );
     shared_scalar
 }
 
@@ -72,6 +83,39 @@ pub fn cast_vec_usize_to_i32(input: Vec<usize>) -> Vec<i32> {
     out
 }
 
+pub fn format_tensor(tensor: &SharedTensor<f32>) -> String {
+    let native = native_backend();
+
+    let mut output = String::new();
+
+    if tensor.desc().len() == 1
+        || (tensor.desc().len() == 2 && (tensor.desc()[0] == 1 || tensor.desc()[1] == 1))
+    {
+        // One-dimensional, print as a single row.
+        let native_data: &[f32] = tensor.read(native.device()).unwrap().as_slice();
+        for v in native_data {
+            output += &format!("{:.5} ", v);
+        }
+        output += "\n";
+    } else {
+        // Use first dimension a row number.
+        let rows = tensor.desc()[0];
+        let columns = tensor.desc().iter().skip(1).fold(1, |acc, d| acc * d);
+        let native_data: &[f32] = tensor.read(native.device()).unwrap().as_slice();
+        for row in 0..rows {
+            for col in 0..columns {
+                output += &format!("{:.5} ", native_data[row * columns + col]);
+            }
+            output += "\n"; 
+        }
+    }
+    output
+}
+
+pub fn print_tensor(tensor: &SharedTensor<f32>) {
+    println!("{}", &format_tensor(tensor));
+}
+
 /// Extends IBlas with Axpby
 pub trait Axpby<F>: Axpy<F> + Scal<F> {
     /// Performs the operation y := a*x + b*y .
diff --git a/juice/tests/layer_specs.rs b/juice/tests/layer_specs.rs
index b05f7a2f5..715bfa0bd 100644
--- a/juice/tests/layer_specs.rs
+++ b/juice/tests/layer_specs.rs
@@ -1,7 +1,7 @@
 extern crate coaster as co;
 extern crate juice;
 
-#[cfg(test)]
+#[cfg(all(test, whatever))]
 mod layer_spec {
     use crate::co::prelude::*;
     use juice::layer::*;
diff --git a/juice/tests/q_learning.rs b/juice/tests/q_learning.rs
new file mode 100644
index 000000000..2fc0a7dc5
--- /dev/null
+++ b/juice/tests/q_learning.rs
@@ -0,0 +1,385 @@
+extern crate coaster;
+extern crate juice;
+
+#[cfg(test)]
+mod cartpole {
+    /// This test verifies Q-learning in a cartpole environment:
+    ///         O
+    ///         │
+    ///         │
+    ///      ┌──┴──┐
+    ///      │     ├───► F
+    ///      └─────┘
+    ///
+    /// "Cartpole" is an inverted pendulum on a plaftorm that can move in one dimension.
+    /// Agent must apply force to move the platform either to the left or to the right
+    /// in order to balance the pole in an upright position. Scenario ends when the pole
+    /// angle crosses a certain threshold. Agent is rewarded with R=1 for each cycle
+    /// of the scenario (so the longer the agent is able to keep pole from falling, the bigger
+    /// overall reward it gets).
+    ///
+    /// State "s" consists of [cart_pos, cart_vel, pole_angle, pole_angle_vel] variables.
+    /// Possible actions "a" are [left, right].
+    /// Q function Q(s, a) is approximated by a neural network with trainable weights θ: Q(s, a, θ).
+    /// Network takes the 4 state variables as the input and outputs the the expected return value
+    /// for each action:
+    ///
+    ///   [s1 s2 s3 s4] -> net -> [Q(s, left) Q(s, right)]
+    ///
+    /// During training, on each step agent observes the state "s", chooses the best action "a"
+    /// using an ε-greedy policy based on currently learned Q-function (that is, it takes a random
+    /// action with ε probability and an action "a" that maximizes Q(s, a) with probability 1-ε),
+    /// gets the reward "R" and observes the next step "s'".
+    /// The tuple [s, a, R, s'] is saved into experience replay buffer.
+    ///
+    /// For training, a batch is taken from the replay buffer. For each replay tuple [s, a, R, s'],
+    /// "s" becomes the network input and the label is computed as
+    ///
+    ///   l = R + γ max Q*(s', a)
+    ///              a
+    ///
+    /// where Q* is an earlier snapshot of Q (a snapshot is used instead of Q itself for stability).
+    /// Note that this gives the label value only for one of the actions; the full label is set to
+    ///
+    ///   [l NaN]
+    ///
+    /// (assuming "a" was the first action). MSE loss function is assumed to ignore NaN values for
+    /// loss and backpropagation gradient computations.
+    use rand::{thread_rng, Rng};
+    use std::collections::VecDeque;
+    use std::rc::Rc;
+
+    use coaster::frameworks::native::get_native_backend;
+    use coaster::prelude::*;
+    use juice::net::*;
+    use juice::train::*;
+    use juice::util::{write_batch_sample, LayerOps};
+
+    const STATE_SIZE: usize = 4;
+    const ACTION_COUNT: usize = 2;
+    const BATCH_SIZE: usize = 32;
+    const REPLAY_BUFFER_SIZE: usize = 1024;
+    const CART_MASS: f64 = 1.0;
+    const POLE_MASS: f64 = 1.0;
+    const POLE_LENGTH: f64 = 1.0;
+    const EARTH_G: f64 = 9.8;
+    const ENV_STEP: f64 = 0.1;
+    const DISCOUNT: f32 = 0.9;
+
+    #[derive(Clone, Copy, Debug)]
+    enum Action {
+        Left,
+        Right,
+    }
+
+    #[derive(Default)]
+    struct Environment {
+        // Position, velocity and acceleration of the cart.
+        cart_pos: f64,
+        cart_vel: f64,
+        cart_acc: f64,
+
+        // Angle, angular velocity and acceleration of the pole (angle = 0 means upright).
+        pole_angle: f64,
+        pole_angle_vel: f64,
+        pole_angle_acc: f64,
+    }
+
+    struct ReplayEntry {
+        state: [f32; STATE_SIZE],
+        action: Action,
+        reward: f32,
+        // None if the resulting state is a final one.
+        next_state: Option<[f32; STATE_SIZE]>,
+    }
+
+    impl Environment {
+        fn new() -> Environment {
+            Environment {
+                // Start with a slight offset so that the pole starts falling.
+                pole_angle: 0.001,
+                ..Default::default()
+            }
+        }
+
+        // Execute an actor action, transitioning into a new env state.
+        // Action can be None for the purpose of testing the Environment logic itself
+        // (agent always makes an action).
+        fn step(&mut self, action: Option<Action>) {
+            // Shorthands and local vars.
+            let m_p = POLE_MASS;
+            let m_c = CART_MASS;
+            let l = POLE_LENGTH;
+            let m = m_p + m_c;
+            let f = match action {
+                None => 0.0,
+                Some(Action::Left) => -1.0,
+                Some(Action::Right) => 1.0,
+            };
+            let th = self.pole_angle;
+            let th_dot = self.pole_angle_vel;
+            let th_ddot = self.pole_angle_acc;
+            let sin_th = th.sin();
+            let cos_th = th.cos();
+
+            self.cart_acc = (f + m_p * l * (th_dot.powi(2) * sin_th - th_ddot * cos_th)) / m;
+            self.cart_vel += self.cart_acc * ENV_STEP;
+            self.cart_pos += self.cart_vel * ENV_STEP;
+
+            self.pole_angle_acc = (EARTH_G * sin_th
+                + cos_th * (-f - m_p * l * th_dot.powi(2) * sin_th) / m)
+                / (l * (4.0 / 3.0 - m_p * cos_th.powi(2) / m));
+            self.pole_angle_vel += self.pole_angle_acc * ENV_STEP;
+            self.pole_angle += self.pole_angle_vel * ENV_STEP;
+        }
+
+        // Returns true if the pole has reached some critical angle.
+        fn is_final(&self) -> bool {
+            self.pole_angle.abs() > std::f64::consts::PI * 0.25
+        }
+
+        fn observe(&self) -> [f32; STATE_SIZE] {
+            [
+                self.cart_pos as f32,
+                self.cart_vel as f32,
+                self.pole_angle as f32,
+                self.pole_angle_vel as f32,
+            ]
+        }
+    }
+
+    // Returns a completely random action.
+    fn random_action() -> Action {
+        if thread_rng().gen::<f64>() < 0.5 {
+            Action::Left
+        } else {
+            Action::Right
+        }
+    }
+
+    fn epsion_greedy_action<B: IBackend + LayerOps<f32> + 'static>(
+        backend: &B,
+        net: &Network<B>,
+        state: &[f32; STATE_SIZE],
+        epsilon: f64,
+    ) -> Action {
+        if thread_rng().gen::<f64>() < epsilon {
+            random_action()
+        } else {
+            let action_values = get_action_values(backend, net, state);
+            if action_values[0] > action_values[1] {
+                Action::Left
+            } else {
+                Action::Right
+            }
+        }
+    }
+
+    // Returns the predicted action values for a given state.
+    fn get_action_values<B: IBackend + LayerOps<f32> + 'static>(
+        backend: &B,
+        net: &Network<B>,
+        state: &[f32; STATE_SIZE],
+    ) -> [f32; ACTION_COUNT] {
+        let mut input = SharedTensor::new(&[1, STATE_SIZE]);
+        write_batch_sample(&mut input, state, 0);
+        let output = net.transform(backend, &input);
+
+        let mut result = [0.0; ACTION_COUNT];
+        let native_backend = get_native_backend();
+        result.clone_from_slice(output.read(native_backend.device()).unwrap().as_slice());
+        result
+    }
+
+    fn create_batch<B: IBackend + LayerOps<f32> + 'static>(
+        backend: &B,
+        buffer: &VecDeque<ReplayEntry>,
+        target_net: &Network<B>,
+    ) -> (SharedTensor<f32>, SharedTensor<f32>) {
+        let mut inputs = SharedTensor::new(&vec![BATCH_SIZE, STATE_SIZE]);
+        let mut labels = SharedTensor::new(&vec![BATCH_SIZE, ACTION_COUNT]);
+
+        for i in 0..BATCH_SIZE {
+            let j = thread_rng().gen_range(0..buffer.len());
+            let (buffer_action, other_action) = match buffer[j].action {
+                Action::Left => (0, 1),
+                Action::Right => (1, 0),
+            };
+
+            let mut action_values = [std::f32::NAN; ACTION_COUNT];
+
+            // For the (s, a, s', r) tuple in the buffer, we can compute more precise target as
+            //   y = r + γ•max Q*(s', _), if s' is not terminal, or
+            //   y = r,                   if s' is termninal.
+            action_values[buffer_action] = buffer[j].reward
+                + match buffer[j].next_state {
+                    None => 0.0,
+                    Some(s) => {
+                        DISCOUNT
+                            * get_action_values(backend, target_net, &s)
+                                .iter()
+                                .fold(std::f32::NEG_INFINITY, |a, &b| a.max(b))
+                    }
+                };
+
+            // For the other action a2, we just use the target_net:
+            //   y = Q*(s, a2).
+            // action_values[other_action] =
+            //     get_action_values(backend, target_net, &buffer[j].state)[other_action];
+
+            write_batch_sample(&mut inputs, &buffer[j].state, i);
+            write_batch_sample(&mut labels, &action_values, i);
+        }
+
+        (inputs, labels)
+    }
+
+    // Runs 3 scenarios with a greedy policy using the provided learned Q-function.
+    // Returns the average number of steps the agent was able to keep the cartpole from
+    // falling (capped at 100 steps).
+    fn eval<B: IBackend + LayerOps<f32> + 'static>(backend: &B, net: &Network<B>) -> f32 {
+        let mut sum = 0.0;
+        for _ in 0..3 {
+            let mut env = Environment::new();
+            for i in 0..100 {
+                let action = epsion_greedy_action(backend, net, &env.observe(), 0.0);
+                env.step(Some(action));
+                sum += 1.0;
+                if env.is_final() {
+                    break;
+                }
+            }
+        }
+        sum / 3.0
+    }
+
+    // A test on the environment simulator.
+    // When no forces are present, pole angle should be oscillating around PI.
+    #[test]
+    fn environment_is_sane_without_force() {
+        let mut env = Environment::new();
+        let mut avg_angle = 0.0;
+        for _ in 0..10000 {
+            env.step(None);
+            avg_angle += env.pole_angle;
+        }
+        avg_angle /= 10000.0;
+        assert!(
+            (avg_angle - std::f64::consts::PI).abs() < 0.01,
+            "Avg. angle: {}",
+            avg_angle
+        );
+    }
+
+    // A test on the environment simulator.
+    // When force is applied to the left, cart should be moving to the left.
+    #[test]
+    fn environment_is_sane_with_left_force() {
+        let mut env = Environment::new();
+        for _ in 0..10 {
+            env.step(Some(Action::Left));
+        }
+        assert!(env.cart_pos < 0.0, "Cart pos: {}", env.cart_pos);
+        assert!(env.cart_vel < 0.0, "Cart vel: {}", env.cart_vel);
+        assert!(env.cart_acc < 0.0, "Cart acc: {}", env.cart_acc);
+    }
+
+    // A test on the environment simulator.
+    // When force is applied to the right, cart should be moving to the right.
+    #[test]
+    fn environment_is_sane_with_right_force() {
+        let mut env = Environment::new();
+        for _ in 0..10 {
+            env.step(Some(Action::Right));
+        }
+        assert!(env.cart_pos > 0.0, "Cart pos: {}", env.cart_pos);
+        assert!(env.cart_vel > 0.0, "Cart vel: {}", env.cart_vel);
+        assert!(env.cart_acc > 0.0, "Cart acc: {}", env.cart_acc);
+    }
+
+    #[test]
+    fn learns_cartpole_control() {
+        env_logger::init();
+
+        let backend = get_native_backend();
+
+        // Create the network representing the Q-function Q(s, a).
+        let net_conf = LayerConfig::Sequential(
+            SequentialConfig::new()
+                .with_layer("linear1", LayerConfig::Linear(LinearConfig::new(50)))
+                .with_layer("relu1", LayerConfig::Relu)
+                .with_layer("linear2", LayerConfig::Linear(LinearConfig::new(50)))
+                .with_layer("relu2", LayerConfig::Relu)
+                .with_layer(
+                    "linear3",
+                    LayerConfig::Linear(LinearConfig::new(ACTION_COUNT)),
+                ),
+        );
+        let mut net = Network::from_config(net_conf, &[vec![STATE_SIZE]]);
+
+        // Create the trainer.
+        let trainer_conf = TrainerConfig {
+            batch_size: BATCH_SIZE,
+            objective: LayerConfig::MeanSquaredError,
+            optimizer: OptimizerConfig::SgdWithMomentum(Default::default()),
+            ..Default::default()
+        };
+        let mut trainer = Trainer::from_config(&backend, &trainer_conf, &net, &vec![ACTION_COUNT]);
+
+        let mut replay_buffer = VecDeque::new();
+        let mut env = Environment::new();
+
+        // Network used to compute full returns from a certain state.
+        // This is a periodic snapshot from the main net (main network isn't used due to
+        // stability issues).
+        let mut target_net = net.clone();
+        let mut epsilon = 1.0;
+
+        for i in 0..1000000 {
+            // Do a step.
+            let state = env.observe();
+            let action = epsion_greedy_action(&backend, &net, &state, epsilon);
+            env.step(Some(action));
+            let (reward, next_state) = match env.is_final() {
+                false => (1.0, Some(env.observe())),
+                true => (0.0, None),
+            };
+
+            // Store the result in the replay buffer.
+            let replay_entry = ReplayEntry {
+                state: state,
+                action: action,
+                reward: reward,
+                next_state: next_state,
+            };
+            replay_buffer.push_front(replay_entry);
+            replay_buffer.truncate(REPLAY_BUFFER_SIZE);
+
+            // Restart the environment if reached a final state.
+            if env.is_final() {
+                env = Environment::new();
+            }
+
+            if replay_buffer.len() >= BATCH_SIZE {
+                let (inputs, labels) = create_batch(&backend, &replay_buffer, &target_net);
+                trainer.train_minibatch(&backend, &mut net, &inputs, &labels);
+            }
+
+            // Evaluate performance and snapshot the bootstrapping net every 100 steps.
+            if i % 100 == 0 {
+                let score = eval(&backend, &net);
+                println!("Epoch: {}; score: {}; ε: {}", i / 100, score, epsilon);
+                target_net = net.clone();
+                epsilon = (epsilon * 0.995).max(0.01);
+
+                // Stop when we reach 95 score.
+                if i / 100 == 1000 {
+                // if score >= 95.0 {
+                    return;
+                }
+            }
+        }
+
+        assert!(false, "Failed to reach score 95");
+    }
+}
diff --git a/rust-blas/src/matrix/ops.rs b/rust-blas/src/matrix/ops.rs
index 4c7167290..7a7a6cb37 100644
--- a/rust-blas/src/matrix/ops.rs
+++ b/rust-blas/src/matrix/ops.rs
@@ -28,16 +28,22 @@ macro_rules! gemm_impl(($($t: ident), +) => (
         impl Gemm for $t {
             fn gemm(alpha: &$t, at: Transpose, a: &dyn Matrix<$t>, bt: Transpose, b: &dyn Matrix<$t>, beta: &$t, c: &mut dyn Matrix<$t>) {
                 unsafe {
-                    let (m, k)  = match at {
+                    let (ar, ac)  = match at {
                         Transpose::NoTrans => (a.rows(), a.cols()),
                         _ => (a.cols(), a.rows()),
                     };
-
-                    let n = match bt {
-                        Transpose::NoTrans => b.cols(),
-                        _ => b.rows(),
+                    let (br, bc)  = match bt {
+                        Transpose::NoTrans => (b.rows(), b.cols()),
+                        _ => (b.cols(), b.rows()),
                     };
 
+                    let (m, k)  = (ar, ac);
+                    let n = bc;
+
+                    if br != k || c.rows() != m || c.cols() != n {
+                        panic!("Wrong GEMM dimensions: [{},{}]x[{},{}] -> [{},{}]", ar, ac, br, bc, c.rows(), c.cols());
+                    }
+
                     prefix!($t, gemm)(a.order(),
                         at, bt,
                         m, n, k,