diff --git a/.gitignore b/.gitignore index 026b26f36..7ac4b427d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ # will have compiled files and executables target +Cargo.lock + # These are backup files generated by rustfmt **/*.rs.bk diff --git a/Cargo.toml b/Cargo.toml index 4f5255f13..94da1a41d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ members = [ "greenglas", "coaster", "coaster-nn", "coaster-blas", "juice", "rust-blas", "rcudnn/cudnn", "rcudnn/cudnn-sys", "rcublas/cublas", "rcublas/cublas-sys", - "juice-examples/juice-utils", "juice-examples/mackey-glass-rnn-regression", +# "juice-examples/juice-utils", "juice-examples/mackey-glass-rnn-regression", "juice-examples/mnist-image-multiclass-classification"] exclude = [ "./rcudnn", "./rcublas", "./juice-examples"] diff --git a/juice-examples/mnist-image-multiclass-classification/src/main.rs b/juice-examples/mnist-image-multiclass-classification/src/main.rs index d306d318c..68feb2108 100644 --- a/juice-examples/mnist-image-multiclass-classification/src/main.rs +++ b/juice-examples/mnist-image-multiclass-classification/src/main.rs @@ -10,8 +10,7 @@ use co::frameworks::cuda::get_cuda_backend; #[cfg(not(feature = "cuda"))] use co::frameworks::native::get_native_backend; use co::prelude::*; -use juice::layer::*; -use juice::layers::*; +use juice::net::*; use juice::solver::*; use juice::util::*; use juice_utils::{download_datasets, unzip_datasets}; @@ -131,7 +130,7 @@ fn main() { ); } } - +/* #[cfg(all(feature = "cuda"))] fn add_conv_net( mut net_cfg: SequentialConfig, @@ -164,7 +163,7 @@ fn add_conv_net( "linear1", LinearConfig { output_size: 500 }, )); - net_cfg.add_layer(LayerConfig::new("sigmoid", LayerType::Sigmoid)); + net_cfg.add_layer(LayerConfig::new("sigmoid", LayerConfig::Sigmoid)); net_cfg.add_layer(LayerConfig::new( "linear2", LinearConfig { output_size: 10 }, @@ -192,26 +191,25 @@ fn add_mlp( ) -> SequentialConfig { net_cfg.add_layer(LayerConfig::new( "reshape", - LayerType::Reshape(ReshapeConfig::of_shape(&[batch_size, pixel_count])), + LayerConfig::Reshape(ReshapeConfig::of_shape(&[batch_size, pixel_count])), )); net_cfg.add_layer(LayerConfig::new( "linear1", - LayerType::Linear(LinearConfig { output_size: 1568 }), + LayerConfig::Linear(LinearConfig { output_size: 1568 }), )); - net_cfg.add_layer(LayerConfig::new("sigmoid", LayerType::Sigmoid)); + net_cfg.add_layer(LayerConfig::new("sigmoid", LayerConfig::Sigmoid)); net_cfg.add_layer(LayerConfig::new( "linear2", - LayerType::Linear(LinearConfig { output_size: 10 }), + LayerConfig::Linear(LinearConfig { output_size: 10 }), )); net_cfg } - -fn add_linear_net(mut net_cfg: SequentialConfig) -> SequentialConfig { - net_cfg.add_layer(LayerConfig::new( +*/ +fn add_linear_net(net_cfg: SequentialConfig) -> SequentialConfig{ + net_cfg.with_layer( "linear", - LayerType::Linear(LinearConfig { output_size: 10 }), - )); - net_cfg + LayerConfig::Linear(LinearConfig { output_size: 10 }), + ) } fn run_mnist( @@ -250,25 +248,19 @@ fn run_mnist( let momentum = momentum.unwrap_or(0f32); let mut net_cfg = SequentialConfig::default(); - net_cfg.add_input("data", &[batch_size, pixel_dim, pixel_dim]); - net_cfg.force_backward = true; - net_cfg = match &*model_name.unwrap_or("none".to_owned()) { - "conv" => add_conv_net(net_cfg, batch_size, pixel_dim), - "mlp" => add_mlp(net_cfg, batch_size, pixel_count), - "linear" => add_linear_net(net_cfg), + match &*model_name.unwrap_or("none".to_owned()) { + // "conv" => add_conv_net(net_cfg, batch_size, pixel_dim), + // "mlp" => add_mlp(net_cfg, batch_size, pixel_count), + "linear" => net_cfg = add_linear_net(net_cfg), _ => panic!("Unknown model. Try one of [linear, mlp, conv]"), }; - net_cfg.add_layer(LayerConfig::new("log_softmax", LayerType::LogSoftmax)); + net_cfg = net_cfg.with_layer("log_softmax", LayerConfig::LogSoftmax); - let mut classifier_cfg = SequentialConfig::default(); - classifier_cfg.add_input("network_out", &[batch_size, 10]); - classifier_cfg.add_input("label", &[batch_size, 1]); // set up nll loss - let nll_layer_cfg = NegativeLogLikelihoodConfig { num_classes: 10 }; - let nll_cfg = LayerConfig::new("nll", LayerType::NegativeLogLikelihood(nll_layer_cfg)); - classifier_cfg.add_layer(nll_cfg); + let classifier_cfg = + LayerConfig::NegativeLogLikelihood(NegativeLogLikelihoodConfig { num_classes: 10 }); // set up backends #[cfg(all(feature = "cuda"))] @@ -283,9 +275,14 @@ fn run_mnist( momentum, ..SolverConfig::default() }; - solver_cfg.network = LayerConfig::new("network", net_cfg); - solver_cfg.objective = LayerConfig::new("classifier", classifier_cfg); - let mut solver = Solver::from_config(backend.clone(), backend.clone(), &solver_cfg); + solver_cfg.network = LayerConfig::Sequential(net_cfg); + solver_cfg.objective = classifier_cfg; + let mut solver = Solver::from_config( + backend.clone(), + &solver_cfg, + &[vec![pixel_dim, pixel_dim]], + &vec![1], + ); // set up confusion matrix let mut classification_evaluator = ::juice::solver::ConfusionMatrix::new(10); @@ -311,9 +308,8 @@ fn run_mnist( targets.push(label_val as usize); } // train the network! - let infered_out = solver.train_minibatch(inp_lock.clone(), label_lock.clone()); + let mut infered = solver.train_minibatch(inp_lock.clone(), label_lock.clone()); - let mut infered = infered_out.write().unwrap(); let predictions = classification_evaluator.get_predictions(&mut infered); classification_evaluator.add_samples(&predictions, &targets); diff --git a/juice/build.rs b/juice/build.rs index 2961df4dc..b0a92569b 100644 --- a/juice/build.rs +++ b/juice/build.rs @@ -4,4 +4,10 @@ fn main() { .file("capnp/juice.capnp") .run() .expect("capnpc schema compiler command must succeed"); + + capnpc::CompilerCommand::new() + .src_prefix("capnp") + .file("capnp/net.capnp") + .run() + .expect("capnpc schema compiler command must succeed"); } diff --git a/juice/capnp/net.capnp b/juice/capnp/net.capnp new file mode 100644 index 000000000..edeb87bff --- /dev/null +++ b/juice/capnp/net.capnp @@ -0,0 +1,34 @@ +# Types required for serialization of a Network. + +@0xd42882f7f90ce348; + +struct SequentialConfig { +} + +struct LinearConfig { + outputSize @0 :UInt64; +} + +struct NegativeLogLikelihoodConfig { +} + +struct LayerConfig { + layerType :union { + sequential @0 :SequentialConfig; + linear @1 :LinearConfig; + logSoftmax @2 :Void; + relu @3 :Void; + sigmoid @4 :Void; + meanSquaredError @5 :Void; + negativeLogLikelihood @6 :NegativeLogLikelihoodConfig; + } +} + +struct TensorShape { + shape @0 :List(UInt64); +} + +struct Network { + config @0 :LayerConfig; + inputs @1 :List(TensorShape); +} \ No newline at end of file diff --git a/juice/examples/benchmarks.rs b/juice/examples/benchmarks.rs index 79a0828b3..e49a1166e 100644 --- a/juice/examples/benchmarks.rs +++ b/juice/examples/benchmarks.rs @@ -1,3 +1,5 @@ +/* + #[macro_use] extern crate timeit; @@ -583,3 +585,7 @@ fn bench_vgg_a() { } } } +*/ + + +fn main() {} \ No newline at end of file diff --git a/juice/src/lib.rs b/juice/src/lib.rs index 30fe80b55..c4634f5a8 100644 --- a/juice/src/lib.rs +++ b/juice/src/lib.rs @@ -117,10 +117,12 @@ extern crate coaster_blas as coblas; extern crate coaster_nn as conn; extern crate num; extern crate rand; -pub mod layer; -pub mod layers; +//pub mod layer; +//pub mod layers; +pub mod net; pub mod solver; pub mod solvers; +pub mod train; pub mod weight; mod capnp_util; @@ -130,3 +132,6 @@ pub mod util; mod juice_capnp { include!(concat!(env!("OUT_DIR"), "/juice_capnp.rs")); } +mod net_capnp { + include!(concat!(env!("OUT_DIR"), "/net_capnp.rs")); +} diff --git a/juice/src/net/activation/mod.rs b/juice/src/net/activation/mod.rs new file mode 100644 index 000000000..56ed5a88e --- /dev/null +++ b/juice/src/net/activation/mod.rs @@ -0,0 +1,5 @@ +mod relu; +mod sigmoid; + +pub use relu::*; +pub use sigmoid::*; \ No newline at end of file diff --git a/juice/src/net/activation/relu.rs b/juice/src/net/activation/relu.rs new file mode 100644 index 000000000..bc41de0be --- /dev/null +++ b/juice/src/net/activation/relu.rs @@ -0,0 +1,53 @@ +use crate::co::IBackend; +use crate::conn; +use crate::net::{Context, Descriptor, Layer}; + +#[derive(Debug, Clone)] +pub struct Relu { + descriptor: Descriptor, +} + +impl Relu { + pub fn new(mut descriptor: Descriptor) -> Self { + assert_eq!(descriptor.inputs().len(), 1); // Should only be one input. + + descriptor.add_output(descriptor.input(0).unit_shape().clone()); + + Relu { + descriptor: descriptor, + } + } +} + +impl> Layer for Relu { + fn compute_output(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.acquire_data(self.descriptor.output(0)); + backend + .relu(&input.borrow(), &mut output.borrow_mut()) + .unwrap(); + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.get_data(self.descriptor.output(0)); + let output_gradient = context.get_data_gradient(self.descriptor.output(0)); + let input_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + backend + .relu_grad( + &output.borrow(), + &output_gradient.borrow(), + &input.borrow(), + &mut input_gradient.borrow_mut(), + ) + .unwrap(); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/activation/sigmoid.rs b/juice/src/net/activation/sigmoid.rs new file mode 100644 index 000000000..a6e5fb8e8 --- /dev/null +++ b/juice/src/net/activation/sigmoid.rs @@ -0,0 +1,53 @@ +use crate::co::IBackend; +use crate::conn; +use crate::net::{Context, Descriptor, Layer}; + +#[derive(Debug, Clone)] +pub struct Sigmoid { + descriptor: Descriptor, +} + +impl Sigmoid { + pub fn new(mut descriptor: Descriptor) -> Self { + assert_eq!(descriptor.inputs().len(), 1); // Should only be one input. + + descriptor.add_output(descriptor.input(0).unit_shape().clone()); + + Sigmoid { + descriptor: descriptor, + } + } +} + +impl> Layer for Sigmoid { + fn compute_output(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.acquire_data(self.descriptor.output(0)); + backend + .sigmoid(&input.borrow(), &mut output.borrow_mut()) + .unwrap(); + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.get_data(self.descriptor.output(0)); + let output_gradient = context.get_data_gradient(self.descriptor.output(0)); + let input_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + backend + .sigmoid_grad( + &output.borrow(), + &output_gradient.borrow(), + &input.borrow(), + &mut input_gradient.borrow_mut(), + ) + .unwrap(); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/common/linear.rs b/juice/src/net/common/linear.rs new file mode 100644 index 000000000..f2b30a628 --- /dev/null +++ b/juice/src/net/common/linear.rs @@ -0,0 +1,150 @@ +use std::cell::RefCell; +use std::rc::Rc; + +use crate::co::{IBackend, ITensorDesc, SharedTensor}; +use crate::coblas::transpose::Transpose; +use crate::net::{Context, Descriptor, Layer, LearnableParams}; +use crate::util::{native_scalar, LayerOps}; +use crate::weight::FillerType; + +#[derive(Clone, Debug, Default)] +pub struct LinearConfig { + pub output_size: usize, +} + +#[derive(Debug)] +pub struct Linear { + descriptor: Descriptor, + + // Weight (A) and bias (b) for the linear operation y = Ax + b. + weight: Rc>, + bias: Rc>, + + // Constants saved for efficiency. + one: SharedTensor, + zero: SharedTensor, +} + +impl LinearConfig { + pub fn new(output_size: usize) -> Self { + LinearConfig { + output_size: output_size, + } + } +} + +impl Linear { + pub fn new(mut descriptor: Descriptor, config: &LinearConfig) -> Self { + assert_eq!(descriptor.inputs().len(), 1); // Should be only one input. + let input_size = descriptor.input(0).unit_shape().size(); + + descriptor.add_output(vec![config.output_size]); + + // Create weight matrix. + let mut weight = SharedTensor::::new(&[config.output_size, input_size]); + FillerType::fill_glorot(&mut weight, input_size, config.output_size); + + // Create bias. Bias is typically intialized with a constant, and a suitable initialisation + // is stated in https://cs231n.github.io/neural-networks-2/#init for non-LSTM types. + let mut bias = SharedTensor::::new(&[1, config.output_size]); + let seed = rand::random::(); + let bias_init_value = seed * (2.0 / seed).sqrt(); + FillerType::fill_constant(&mut bias, bias_init_value); + + let weight_param = descriptor.create_params("weights", weight, 1.0); + let bias_param = descriptor.create_params("bias", bias, 1.0); + + Linear { + descriptor: descriptor, + weight: weight_param, + bias: bias_param, + one: native_scalar(1f32), + zero: native_scalar(0f32), + } + } +} + +impl> Layer for Linear { + fn compute_output(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.acquire_data(self.descriptor.output(0)); + + let mut ones_tensor = SharedTensor::::new(&[context.batch_size(), 1]); + FillerType::fill_constant(&mut ones_tensor, 1f32); + + backend + .gemm( + &self.one, + Transpose::NoTrans, + &ones_tensor, + Transpose::NoTrans, + &self.bias.borrow().data, + &self.zero, + &mut output.borrow_mut(), + ) + .unwrap(); + + backend + .gemm( + &self.one, + Transpose::NoTrans, + &input.borrow(), + Transpose::Trans, + &self.weight.borrow().data, + &self.one, + &mut output.borrow_mut(), + ) + .unwrap(); + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output_gradient = context.get_data_gradient(self.descriptor.output(0)); + + let input_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + let weights_gradient = context.acquire_params_gradient(self.descriptor.param(0)); + let bias_gradient = context.acquire_params_gradient(self.descriptor.param(1)); + + // Network error gradient with respect to input data. + // dE/dx = dE/dy * df/dx = dE/dy * w. + backend + .gemm( + &self.one, + Transpose::NoTrans, + &output_gradient.borrow(), + Transpose::NoTrans, + &self.weight.borrow().data, + &self.zero, + &mut input_gradient.borrow_mut(), + ) + .unwrap(); + + // Network error gradient with respect to weights. + // dE/dw = dE/dy * df/dw = dE/dy * x. + backend + .gemm( + &self.one, + Transpose::Trans, + &output_gradient.borrow(), + Transpose::NoTrans, + &input.borrow(), + &self.zero, + &mut weights_gradient.borrow_mut(), + ) + .unwrap(); + + // Network error gradient with respect to bias. + // dE/dw = dE/dy * df/db = dE/dy * [1] = dE/dy. + backend + .copy(&output_gradient.borrow(), &mut bias_gradient.borrow_mut()) + .unwrap(); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/common/log_softmax.rs b/juice/src/net/common/log_softmax.rs new file mode 100644 index 000000000..a4d18192a --- /dev/null +++ b/juice/src/net/common/log_softmax.rs @@ -0,0 +1,51 @@ +use crate::co::IBackend; +use crate::conn; +use crate::net::{Context, Descriptor, Layer}; + +#[derive(Debug)] +pub struct LogSoftmax { + descriptor: Descriptor, +} + +impl LogSoftmax { + pub fn new(mut descriptor: Descriptor) -> Self { + assert_eq!(descriptor.inputs().len(), 1); // Should only be one input. + + descriptor.add_output(descriptor.input(0).unit_shape().clone()); + + LogSoftmax { + descriptor: descriptor, + } + } +} + +impl> Layer for LogSoftmax { + fn compute_output(&self, backend: &B, context: &mut Context) { + let input = context.get_data(self.descriptor.input(0)); + let output = context.acquire_data(self.descriptor.output(0)); + backend + .log_softmax(&input.borrow(), &mut output.borrow_mut()) + .unwrap(); + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let output = context.get_data(self.descriptor.output(0)); + let output_gradient = context.get_data_gradient(self.descriptor.output(0)); + let input_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + backend + .log_softmax_grad( + &output.borrow(), + &output_gradient.borrow(), + &mut input_gradient.borrow_mut(), + ) + .unwrap(); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/common/mod.rs b/juice/src/net/common/mod.rs new file mode 100644 index 000000000..023edee7b --- /dev/null +++ b/juice/src/net/common/mod.rs @@ -0,0 +1,5 @@ +mod linear; +mod log_softmax; + +pub use linear::*; +pub use log_softmax::*; \ No newline at end of file diff --git a/juice/src/net/config.rs b/juice/src/net/config.rs new file mode 100644 index 000000000..f66beccac --- /dev/null +++ b/juice/src/net/config.rs @@ -0,0 +1,23 @@ +// Reexport layer configs. +pub use crate::net::{ + common::LinearConfig, container::SequentialConfig, container::FanoutConfig, + loss::NegativeLogLikelihoodConfig, +}; + +#[derive(Debug, Clone)] +pub enum LayerConfig { + Fanout(FanoutConfig), + Sequential(SequentialConfig), + Linear(LinearConfig), + LogSoftmax, + Relu, + Sigmoid, + MeanSquaredError, + NegativeLogLikelihood(NegativeLogLikelihoodConfig), +} + +impl Default for LayerConfig { + fn default() -> LayerConfig { + LayerConfig::Sequential(SequentialConfig::default()) + } +} diff --git a/juice/src/net/container/fanout.rs b/juice/src/net/container/fanout.rs new file mode 100644 index 000000000..dd8f7e463 --- /dev/null +++ b/juice/src/net/container/fanout.rs @@ -0,0 +1,131 @@ +use std::fmt::{Debug, Formatter}; + +use crate::co::IBackend; +use crate::net::{layer_from_config, Context, Descriptor, Inout, Layer, LayerConfig}; +use crate::util::LayerOps; + +#[derive(Debug, Clone)] +pub struct FanoutConfig { + trunk: Box, + branches: Vec, +} + +pub struct Fanout { + descriptor: Descriptor, + trunk: Box>, + branches: Vec>>, +} + +impl FanoutConfig { + pub fn new(trunk: LayerConfig) -> Self { + FanoutConfig { + trunk: Box::new(trunk), + branches: Vec::new(), + } + } + + pub fn with_branch(mut self, branch: LayerConfig) -> Self { + self.branches.push(branch); + self + } +} + +impl + 'static> Fanout { + pub fn new(mut descriptor: Descriptor, config: &FanoutConfig) -> Self { + // Create trunk. + let trunk_inputs: Vec = descriptor.inputs().iter().cloned().collect(); + let trunk = layer_from_config( + descriptor.sub("trunk", trunk_inputs), + &config.trunk, + ); + + // Create branches. Each branch will have same inputs which are trunk's outputs. + let branch_inputs: Vec = trunk.descriptor().outputs().iter().cloned().collect(); + let mut branches = Vec::new(); + for (i, branch_config) in config.branches.iter().enumerate() { + let branch = layer_from_config( + descriptor.sub(&format!("branch_{}", i), branch_inputs.clone()), + branch_config, + ); + + // Expose branch outputs as this layer outputs. + for output in branch.descriptor().outputs() { + descriptor.add_output_copy(output); + } + + // Expose branch learnable params as this layer params. + for params in branch.descriptor().params() { + descriptor.add_params_copy(params); + } + + branches.push(branch); + } + + Fanout { + descriptor: descriptor, + trunk: trunk, + branches: branches, + } + } +} + +impl + 'static> Layer for Fanout { + fn compute_output(&self, backend: &B, context: &mut Context) { + self.trunk.compute_output(backend, context); + for branch in self.branches.iter() { + branch.compute_output(backend, context); + } + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + // Backpropagation process is only supported on a single branch. + // Determine which branch is used (by finding output gradients on the context) + // and compute gradients only along that branch. + + let mut branch_index = None; + for i in 0..self.branches.len() { + let branch_desc = self.branches[i].descriptor(); + for j in 0..branch_desc.outputs().len() { + let has_output_gradient = context.has_data_gradient(&branch_desc.outputs()[j]); + match (branch_index, has_output_gradient) { + // Unpopulated output gradient without a selected branch. Nothing to do. + (None, false) => (), + // This is the first populated output gradient we saw. + // Mark this branch as the one we'll be using for backprop. + (None, true) => { + assert_eq!(j, 0, "Branch {} has partial output gradients; should either have all or none", i); + branch_index = Some(i); + }, + // We have a selected branch and this is an unpopulated output gradient. + (Some(i2), false) => { + assert_ne!(i, i2, "Branch {} has partial output gradients (missing {})", i, j); + }, + // We have a selected branch and this is a populated output gradient. + (Some(i2), true) => { + assert_eq!(i, i2, + "Seen output gradients on branches {} and {}; backprop only on one branch is supported", i, i2); + }, + } + } + } + + assert_ne!(branch_index, None, "No output gradiens on any branch"); + + self.branches[branch_index.unwrap()].compute_gradients(backend, context); + self.trunk.compute_gradients(backend, context); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} + +impl Debug for Fanout { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Fanout") + } +} \ No newline at end of file diff --git a/juice/src/net/container/mod.rs b/juice/src/net/container/mod.rs new file mode 100644 index 000000000..9dd37ec58 --- /dev/null +++ b/juice/src/net/container/mod.rs @@ -0,0 +1,5 @@ +mod fanout; +mod sequential; + +pub use fanout::*; +pub use sequential::*; \ No newline at end of file diff --git a/juice/src/net/container/sequential.rs b/juice/src/net/container/sequential.rs new file mode 100644 index 000000000..f9445e480 --- /dev/null +++ b/juice/src/net/container/sequential.rs @@ -0,0 +1,238 @@ +//! A container layer the composes nested layers in a sequence. +//! +//! In the most simple case, all nested layers are executed one by one, with outputs of one layer +//! becoming inputs of the next one. Inputs of the Sequential layers are inputs to the first +//! nested layer, and outputs of the last nested layer are the outpus of the Sequential layer: +//! +//! ``` +//! let mut cfg = SequentialConfig::default(); +//! cfg.add_layer("linear", LayerConfig::Linear(LinearConfig{ output_size: 10})); +//! cfg.add_layer("softmax", LayerConfig::Softmax); +//! ``` +//! +//! Sequential layer also supports complex flow graphs (as long as they are acyclic) +//! by allowing nested layers inputs and outputs to be mapped to named "buffers": +//! +//! ``` +//! let mut cfg = SequentialConfig::default(); +//! cfg.add_input("in"); +//! cfg +//! .add_layer("linear1", LayerConfig::Linear(LinearConfig{ output_size: 10})) +//! .map_input("in") +//! .map_output("linear1_out"); +//! cfg +//! .add_layer("linear2", LayerConfig::Linear(LinearConfig{ output_size: 10})) +//! .map_input("in") +//! .map_output("linear2_out"); +//! cfg +//! .add_layer(/*some 2-input 1-output layer*/) +//! .map_input("linear1_out") +//! .map_input("linear2_out") +//! .map_output("out"); +//! cfg.add_output("out"); +//! ``` +//! +//! Note that currently it is requires that layer i can only use outputs from +//! previous layers 0..i-1. +use std::collections::HashMap; +use std::fmt::{Debug, Formatter}; + +use crate::co::IBackend; +use crate::net::{layer_from_config, Context, Descriptor, Layer, LayerConfig}; +use crate::util::LayerOps; + +#[derive(Debug, Clone, Default)] +pub struct SequentialChildConfig { + name: String, + config: LayerConfig, + inputs: Vec, + outputs: Vec, +} + +#[derive(Debug, Clone, Default)] +pub struct SequentialConfig { + // List of named inputs. If not given, only a single input is assumed. + inputs: Vec, + + // Sequence of layers. + layers: Vec, + + // List of named outputs. If not given, Sequential layer outputs will match the outputs + // of the last layer. + outputs: Vec, +} + +pub struct Sequential { + // Outward-facing inputs, outputs and params. + descriptor: Descriptor, + // Nested layers. + children: Vec>>, +} + +impl SequentialChildConfig { + pub fn map_input(mut self, name: &str) -> Self { + self.inputs.push(name.to_string()); + self + } + pub fn map_output(mut self, name: &str) -> Self { + self.outputs.push(name.to_string()); + self + } +} + +impl SequentialConfig { + pub fn new() -> Self { + SequentialConfig::default() + } + + pub fn with_layer(mut self, name: &str, child_config: LayerConfig) -> Self { + self.layers.push(SequentialChildConfig { + name: name.to_string(), + config: child_config, + inputs: Vec::new(), + outputs: Vec::new(), + }); + self + } + + pub fn with_input(mut self, name: &str) -> Self { + self.inputs.push(name.to_string()); + self + } +} + +impl + 'static> Sequential { + pub fn new(mut descriptor: Descriptor, config: &SequentialConfig) -> Self { + // Create internal layers one by one and connect them. + // For the purpose of connecting layers, all inputs and outputs have names, + // which are either explicitly given in the config, or have implicit form of + // io_{i}_{j} for layer inputs and io_{i+1}_{j} for outputs, where i is the layer index + // (0-based) and j is the input/output index within the layer. + // io_0_{j} are inputs to the Sequential layer itself. + + // All layer output Inouts known so far, keyed by their internal names. + // This is initialized with shapes of the Sequential inputs below. + let mut internal_outputs = HashMap::new(); + + // Output names from the previous layer. These will be assumed to be the inputs of the next + // layer unless it has explicit input names in the config. + // Initialized with Sequenial input names below. + let mut prev_layer_output_names = Vec::new(); + + // Create an array of Sequential inputs where: + // * internal names are either explicitly set in config or implicitly set to "io_0_{j}", + // * shapes and data paths are taken from the descriptor. + for (j, input) in descriptor.inputs().iter().enumerate() { + let internal_name = if config.inputs.len() > j { + config.inputs[j].clone() + } else { + format!("io_0_{}", j) + }; + prev_layer_output_names.push(internal_name.clone()); + internal_outputs.insert(internal_name, input.clone()); + } + + // Create children layers. + let mut children = Vec::new(); + for (i, child_config) in config.layers.iter().enumerate() { + // Inputs for a child are either given explicitly in the config, or replicate the + // outputs of the previous child. + let child_input_names = if !child_config.inputs.is_empty() { + child_config.inputs.clone() + } else { + prev_layer_output_names + }; + + let child_inputs = child_input_names + .iter() + .map(|name| { + internal_outputs + .get(name) + .unwrap_or_else(|| panic!("Unknown input/output name {}", name)) + .clone() + }) + .collect(); + + let mut child_layer = layer_from_config( + descriptor.sub(&child_config.name, child_inputs), + &child_config.config, + ); + + // Create data buffer paths for child outpus and save the outputs for subsequent layers. + let child_descriptor = child_layer.descriptor_mut(); + // Config cannot have more outputs that layer actually has. + assert!(child_config.outputs.len() <= child_descriptor.outputs().len()); + prev_layer_output_names = Vec::with_capacity(child_descriptor.outputs().len()); + for (j, output) in child_descriptor.outputs_mut().iter_mut().enumerate() { + let output_name = if j < child_config.outputs.len() { + child_config.outputs[j].clone() + } else { + format!("io_{}_{}", i + 1, j) + }; + + // Assign data buffer path. + output.set_path(&format!("{}.{}", descriptor.path(), &output_name)); + + prev_layer_output_names.push(output_name.clone()); + internal_outputs.insert(output_name, output.clone()); + } + + // Copy layer learnable params links into Sequential descriptor. + for params in child_layer.descriptor().params() { + descriptor.add_params_copy(params); + } + + children.push(child_layer); + } + + // If outputs are given explicitly, use them. Otherwise take outputs of the last layer. + if !config.outputs.is_empty() { + for output_name in config.outputs.iter() { + descriptor.add_output( + internal_outputs + .get(output_name) + .unwrap_or_else(|| panic!("Can't find output {}", output_name)) + .unit_shape() + .clone(), + ); + } + } else { + for output in children.last().unwrap().descriptor().outputs() { + descriptor.add_output_copy(output) + } + } + + Sequential { + descriptor: descriptor, + children: children, + } + } +} + +impl + 'static> Layer for Sequential { + fn compute_output(&self, backend: &B, context: &mut Context) { + for child in self.children.iter() { + child.compute_output(backend, context); + } + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + for child in self.children.iter().rev() { + child.compute_gradients(backend, context); + } + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} + +impl Debug for Sequential { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "Sequential") + } +} \ No newline at end of file diff --git a/juice/src/net/context.rs b/juice/src/net/context.rs new file mode 100644 index 000000000..80c6d88d6 --- /dev/null +++ b/juice/src/net/context.rs @@ -0,0 +1,180 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::iter; +use std::rc::Rc; + +use crate::co::{SharedTensor, TensorDesc}; +use co::frameworks::native::get_native_backend; +use crate::net::{Inout, LearnableParamsLink}; + +/// Context stores data for a single invocation of a network (forward and optionally backward), +/// which includes data passed between layers (at Junctions), loss function gradients with respect +/// to this data (again, at Junctions) and also loss function gradients with respect to the +/// learnable parameters. +#[derive(Debug)] +pub struct Context { + batch_size: usize, + data: HashMap>>>, + data_gradient: HashMap>>>, + param_gradient: HashMap>>>, +} + +impl Context { + pub fn new(batch_size: usize) -> Self { + Context { + batch_size: batch_size, + data: HashMap::new(), + data_gradient: HashMap::new(), + param_gradient: HashMap::new(), + } + } + + pub fn batch_size(&self) -> usize { + self.batch_size + } + + pub fn has_data_gradient(&self, inout: &Inout) -> bool { + Self::has_inout_buffer(&self.data_gradient, inout) + } + + // Get data buffer for the given inout spec found inside the current scope. + // Panics if this data buffer doesn't exist. + pub fn get_data(&mut self, inout: &Inout) -> Rc>> { + Self::get_inout_buffer(&self.data, inout, "data") + } + + // Get data gradient buffer for the given inout spec found inside the current scope. + // Panics if this data buffer doesn't exist. + pub fn get_data_gradient(&mut self, inout: &Inout) -> Rc>> { + Self::get_inout_buffer(&self.data_gradient, inout, "data gradient") + } + + // Same as `get_data()` but will create the buffer on the fly if it doesn't exist. + pub fn acquire_data(&mut self, inout: &Inout) -> Rc>> { + Self::acquire_inout_buffer(&mut self.data, inout, self.batch_size, "data") + } + + // Same as `get_data_gradient()` but will create the buffer on the fly if it doesn't exist. + pub fn acquire_data_gradient(&mut self, inout: &Inout) -> Rc>> { + Self::acquire_inout_buffer( + &mut self.data_gradient, + inout, + self.batch_size, + "data gradient", + ) + } + + // Takes the tensor out of the context. Panics if no such tensor. + pub fn take_data(&mut self, inout: &Inout) -> SharedTensor { + Self::take_inout_buffer(&mut self.data, inout, "data") + } + + // Get params gradient buffer for the given learnable params link. + // Panics if this data buffer doesn't exist. + pub fn get_params_gradient( + &mut self, + params: &LearnableParamsLink, + ) -> Rc>> { + let key = Rc::as_ptr(¶ms) as usize; + match self.param_gradient.get(&key) { + Some(data) => data.clone(), + None => panic!("No params gradient for {:?}", params.borrow()), + } + } + + // Same as `get_params_gradient()` but will create the buffer on the fly if it doesn't exist. + // Buffer is zero-filled on creation. + pub fn acquire_params_gradient( + &mut self, + params: &LearnableParamsLink, + ) -> Rc>> { + let key = Rc::as_ptr(¶ms) as usize; + match self.param_gradient.get(&key) { + Some(data) => return data.clone(), + None => (), + } + + // println!("Creating params gradient for {:?}", params.borrow()); + + let backend = get_native_backend(); + let shape = params.borrow().data.desc().clone(); + let mut buffer = SharedTensor::::new(&shape); + // Initialize with zeroes. + buffer + .write_only(backend.device()) + .unwrap() + .as_mut_slice() + .fill(0.0); + let buffer_rc = Rc::new(RefCell::new(buffer)); + self.param_gradient.insert(key, buffer_rc.clone()); + buffer_rc + } + + fn has_inout_buffer( + storage: &HashMap>>>, + inout: &Inout, + ) -> bool { + let key = Rc::as_ptr(&inout.junction) as usize; + storage.get(&key).is_some() + } + + fn get_inout_buffer( + storage: &HashMap>>>, + inout: &Inout, + purpose: &str, + ) -> Rc>> { + let key = Rc::as_ptr(&inout.junction) as usize; + match storage.get(&key) { + Some(data) => data.clone(), + None => panic!("No {} for {:?}", purpose, inout), + } + } + + fn acquire_inout_buffer( + storage: &mut HashMap>>>, + inout: &Inout, + batch_size: usize, + purpose: &str, + ) -> Rc>> { + let key = Rc::as_ptr(&inout.junction) as usize; + match storage.get(&key) { + Some(data) => return data.clone(), + None => (), + }; + + // REMOVE + let backend = get_native_backend(); + + let shape: TensorDesc = iter::once(batch_size) + .chain(inout.junction.unit_shape.iter().map(|i| *i)) + .collect(); + // println!( + // "Creating {} for {:?} with shape {:?}", + // purpose, inout, shape + // ); + let mut buffer = SharedTensor::::new(&shape); + + // REMOVE + buffer + .write_only(backend.device()) + .unwrap() + .as_mut_slice() + .fill(0.0); + + let buffer_rc = Rc::new(RefCell::new(buffer)); + storage.insert(key, buffer_rc.clone()); + buffer_rc + } + + fn take_inout_buffer( + storage: &mut HashMap>>>, + inout: &Inout, + purpose: &str, + ) -> SharedTensor { + let key = Rc::as_ptr(&inout.junction) as usize; + match storage.remove(&key) { + Some(data) => Rc::try_unwrap(data).unwrap().into_inner(), + None => panic!("No {} for {:?}", purpose, inout), + } + } +} diff --git a/juice/src/net/descriptor.rs b/juice/src/net/descriptor.rs new file mode 100644 index 000000000..e3c027c29 --- /dev/null +++ b/juice/src/net/descriptor.rs @@ -0,0 +1,184 @@ +use std::cell::RefCell; + +use std::rc::Rc; + +use crate::co::{SharedTensor, TensorDesc}; + +/// A junction is a point connecting a layer output to another layer input (or several of them). +/// Junction describes the shape of the data which flows through it (except for the batch size +/// part of it, which is a property of specific invocation and is captured in a `Context`). +/// Each junction will have an associated buffer in a `Context` (two of them, actually: one +/// for data and another for gradient). +#[derive(Debug)] +pub struct Junction { + /// Shape of one data unit in a batch. + pub unit_shape: TensorDesc, + /// Human-readable path of the junction, mostly for logging and debugging. + /// Can change after creation to support layer waterfall construction/connection model. + pub path: RefCell, +} + +/// A struct representing either an input or output of a layer. +/// Inouts that share same junction are "connected" in the sense that they use the same buffer +/// (written to by output Inout and read from by input Inout). +#[derive(Debug, Clone)] +pub struct Inout { + // Junction used to read/write data. + pub junction: Rc, +} + +/// A single blob of params that can be learned during network training. +/// Params are "owned" by the layers, but layers must expose them via LearnableParamsLink in +/// the Descriptor. Container layers must expose all nested layers params. +#[derive(Debug)] +pub struct LearnableParams { + pub data: SharedTensor, + pub learning_rate: f32, + /// Human-readable path which includes the owner layer path, mostly for logging and debuggin. + pub path: String, +} + +/// A pointer to an instance of LearnableParams. Among other things is used to find associated +/// gradient buffer in the `Context`. +pub type LearnableParamsLink = Rc::>; + +/// Descriptor of a layer, containing information about layer name, +/// inputs, outputs and params. Inputs and outputs are created using a waterfall model: +/// 1. Parent logic creates a Descriptor with a chosen name and inputs (with shapes). +/// 2. Descriptor is passed to the layer constructor. Layer determines the number and shapes of +/// the outputs and adds them to the descriptor. Params are also initialized and added if +/// layer uses them. +/// 3. After layer is created, parent logic assigns human-readable paths to the outputs Junctions +/// and connects them appropriately. +#[derive(Debug, Clone)] +pub struct Descriptor { + path: String, + inputs: Vec, + outputs: Vec, + + // All learnable params of the layer. For container layers, this must contain all params + // of the nested layers. + params: Vec, +} + +impl Inout { + // Create an Inout without buffer path. + pub fn new(unit_shape: TensorDesc) -> Self { + Inout { + junction: Rc::new(Junction { + unit_shape: unit_shape, + path: RefCell::new("".to_owned()), + }), + } + } + + // Create an Inout with buffer path. + pub fn new_with_path(unit_shape: TensorDesc, path: &str) -> Self { + Inout { + junction: Rc::new(Junction { + unit_shape: unit_shape, + path: RefCell::new(path.to_owned()), + }), + } + } + + pub fn set_path(&mut self, path: &str) { + self.junction.path.replace(path.to_owned()); + } + + pub fn unit_shape(&self) -> &TensorDesc { + &self.junction.unit_shape + } +} + +impl Descriptor { + // Create a top-level Descriptor. + pub fn top(name: &str, inputs: Vec) -> Self { + Descriptor { + path: name.to_owned(), + inputs: inputs, + outputs: Vec::new(), + params: Vec::new(), + } + } + + // Create a Descriptor which is nested under this one. + // In practice, "nested" only means the new descriptor path is constructed as + // ".". + pub fn sub(&self, name: &str, inputs: Vec) -> Self { + Descriptor { + path: format!("{}.{}", self.path, name), + inputs: inputs, + outputs: Vec::new(), + params: Vec::new(), + } + } + + pub fn path(&self) -> &str { + &self.path + } + + pub fn inputs(&self) -> &[Inout] { + &self.inputs + } + + pub fn input(&self, index: usize) -> &Inout { + &self.inputs[index] + } + + pub fn outputs(&self) -> &[Inout] { + &self.outputs + } + + pub fn outputs_mut(&mut self) -> &mut [Inout] { + &mut self.outputs + } + + pub fn output(&self, index: usize) -> &Inout { + &self.outputs[index] + } + + pub fn output_mut(&mut self, index: usize) -> &mut Inout { + &mut self.outputs[index] + } + + pub fn params(&self) -> &[LearnableParamsLink] { + &self.params + } + pub fn params_mut(&mut self) -> &mut [LearnableParamsLink] { + &mut self.params + } + + pub fn param(&self, index: usize) -> &LearnableParamsLink { + &self.params[index] + } + + pub fn add_output(&mut self, unit_shape: TensorDesc) -> &mut Inout { + self.outputs.push(Inout::new(unit_shape)); + self.outputs.last_mut().unwrap() + } + + pub fn add_output_copy(&mut self, inout: &Inout) { + self.outputs.push(inout.clone()) + } + + pub fn create_params( + &mut self, + name: &str, + data: SharedTensor, + learning_rate: f32, + ) -> LearnableParamsLink { + let params = LearnableParams { + data: data, + learning_rate: learning_rate, + path: format!("{}.{}", self.path, name), + }; + let params_rc = Rc::new(RefCell::new(params)); + self.params.push(params_rc.clone()); + params_rc + } + + pub fn add_params_copy(&mut self, params: &LearnableParamsLink) { + self.params.push(params.clone()); + } +} diff --git a/juice/src/net/layer.rs b/juice/src/net/layer.rs new file mode 100644 index 000000000..eca9e8b90 --- /dev/null +++ b/juice/src/net/layer.rs @@ -0,0 +1,66 @@ +use std::fmt::Debug; + +use crate::co::{IBackend}; +use crate::net::{Context, Descriptor, LayerConfig}; +use crate::net::activation::*; +use crate::net::common::*; +use crate::net::container::*; +use crate::net::loss::*; +use crate::util::LayerOps; + +/// A generalized layer in a network, performing certain function on inputs producing outputs. +/// Layers be can combined in an acyclic graph forming an ML network that can compute output from +/// inputs and can be "trained" using the backpropagation process. +/// +/// Note that a Layer is a more general concept than conventional ML layers and includes: +/// * conventional "layers" like convolutional, fully-connected, dropout, etc; +/// * activation functions like ReLU, softmax, etc; +/// * groups of sublayers. +/// +/// Layer can have arbitrary number of inputs, outputs and weights, which are all described in the +/// `Descriptor`. Inputs and outputs declare the shapes of the 'units' of data, which then can +/// be batched according to `Context` settings. The actual shapes of then inputs and outputs are +/// always of the form [N, {unit_shape}] where N is the batch size. +/// +/// Number and unit shapes of the inputs are defined by the upstream logic. Number and unit shapes +/// of the outputs are determined by the layer depending on input unit shapes and layer settings. +/// When creating a layer, parent logic passes a partially filled `Descriptor`, containing inputs +/// information. Layer then must fill the outputs of the `Descriptor`. +/// +/// It is assumed that weight shapes do not depend on batch size N (as weights are created once and +/// cannot change shape during learning). +pub trait Layer: Debug { + // Computes output given the input(s). + fn compute_output(&self, backend: &B, context: &mut Context); + + fn compute_gradients(&self, backend: &B, context: &mut Context); + + fn descriptor(&self) -> &Descriptor; + + fn descriptor_mut(&mut self) -> &mut Descriptor; +} + +/// Creates layer from a config. +/// Takes a partially filled Descriptor, which should have a valid path and fully populated inputs +/// (including data_path). +/// This is an internal function, typically users should be using net_from_config() instead. +/// TODO: Make it private (for now required for Solver). +pub fn layer_from_config + 'static>( + descriptor: Descriptor, + config: &LayerConfig, +) -> Box> { + match config { + LayerConfig::Sequential(sequential_config) => { + Box::new(Sequential::new(descriptor, sequential_config)) + } + LayerConfig::Fanout(fanout_config) => Box::new(Fanout::new(descriptor, fanout_config)), + LayerConfig::Linear(linear_config) => Box::new(Linear::new(descriptor, linear_config)), + LayerConfig::LogSoftmax => Box::new(LogSoftmax::new(descriptor)), + LayerConfig::Relu => Box::new(Relu::new(descriptor)), + LayerConfig::Sigmoid => Box::new(Sigmoid::new(descriptor)), + LayerConfig::NegativeLogLikelihood(nll_config) => { + Box::new(NegativeLogLikelihood::new(descriptor, nll_config)) + } + LayerConfig::MeanSquaredError => Box::new(MeanSquaredError::new(descriptor)), + } +} \ No newline at end of file diff --git a/juice/src/net/loss/mean_squared_error.rs b/juice/src/net/loss/mean_squared_error.rs new file mode 100644 index 000000000..291de1c56 --- /dev/null +++ b/juice/src/net/loss/mean_squared_error.rs @@ -0,0 +1,88 @@ +use crate::co::{IBackend, ITensorDesc}; +use crate::coblas::plugin::*; +use crate::net::{Context, Descriptor, Layer}; +use crate::util::{native_backend, Axpby}; + +#[derive(Debug)] +// Layer implementing the Mean Squared Error loss function. +// This implementation supports sparse labels (marked as NaN values in label tensor). +// Gradient for absent label values will be 0.0. +pub struct MeanSquaredError { + descriptor: Descriptor, +} + +impl MeanSquaredError { + pub fn new(descriptor: Descriptor) -> Self { + assert_eq!( + descriptor.inputs().len(), + 2, + "MeanSquaredError must take 2 inputs: values and labels" + ); + assert_eq!( + descriptor.inputs()[0].unit_shape().size(), + descriptor.inputs()[1].unit_shape().size(), + "Labels must be of the same size" + ); + // Loss layers don't have outputs. + + MeanSquaredError { + descriptor: descriptor, + } + } +} + +impl + Copy> Layer for MeanSquaredError { + fn compute_output(&self, backend: &B, context: &mut Context) { + // No output computation since loss layer doesn't have outputs. + // It's main purpose is to start the backpropagation process by + // computing the loss gradient with respect to net final output + // in compute_gradients(). + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let predictions = context.get_data(self.descriptor.input(0)); + let labels = context.get_data(self.descriptor.input(1)); + let input_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + + let native = native_backend(); + + let predictions_ref = predictions.borrow(); + let predictions_data = predictions_ref + .read(native.device()) + .unwrap() + .as_slice::(); + + let labels_ref = labels.borrow(); + let labels_data = labels_ref.read(native.device()).unwrap().as_slice::(); + + let mut input_gradient_ref = input_gradient.borrow_mut(); + let input_gradient_data = input_gradient_ref + .write_only(native.device()) + .unwrap() + .as_mut_slice::(); + + for i in 0..predictions_data.len() { + input_gradient_data[i] = match labels_data[i].is_nan() { + true => 0.0, + false => 2.0 * (predictions_data[i] - labels_data[i]), + }; + } + + // // Gradient is calculated as 2 * (predictions - labels). + // backend.copy(&labels.borrow(), &mut input_gradient.borrow_mut()); + // backend.axpby( + // &native_scalar(2.0), + // &predictions.borrow(), + // &native_scalar(-2.0), + // &mut input_gradient.borrow_mut(), + // ); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/loss/mod.rs b/juice/src/net/loss/mod.rs new file mode 100644 index 000000000..3379ed33e --- /dev/null +++ b/juice/src/net/loss/mod.rs @@ -0,0 +1,5 @@ +mod mean_squared_error; +mod negative_log_likelihood; + +pub use mean_squared_error::*; +pub use negative_log_likelihood::*; \ No newline at end of file diff --git a/juice/src/net/loss/negative_log_likelihood.rs b/juice/src/net/loss/negative_log_likelihood.rs new file mode 100644 index 000000000..4d668b453 --- /dev/null +++ b/juice/src/net/loss/negative_log_likelihood.rs @@ -0,0 +1,78 @@ +use crate::co::{IBackend, ITensorDesc}; +use crate::net::{Context, Descriptor, Layer}; +use crate::util::native_backend; + +#[derive(Clone, Debug, Default)] +pub struct NegativeLogLikelihoodConfig { + /// How many different classes can be classified. + pub num_classes: usize, +} + +#[derive(Debug)] +pub struct NegativeLogLikelihood { + descriptor: Descriptor, + num_classes: usize, +} + +impl NegativeLogLikelihood { + pub fn new(descriptor: Descriptor, config: &NegativeLogLikelihoodConfig) -> Self { + assert_eq!( + descriptor.inputs().len(), + 2, + "NegativeLogLikelihood must take 2 inputs: probabilities and labels" + ); + assert_eq!( + descriptor.inputs()[1].unit_shape().size(), + 1, + "Labels must be of [1] shape" + ); + + // Note that loss layers don't have outputs, since the result of loss computation is always + // a single number which can't then be piped to other layers which expect data to have + // shape [batch_size, ...] + + NegativeLogLikelihood { + descriptor: descriptor, + num_classes: config.num_classes, + } + } +} + +impl Layer for NegativeLogLikelihood { + fn compute_output(&self, backend: &B, context: &mut Context) { + // No output computation since loss layer doesn't have outputs. + // It's main purpose is to start the backpropagation process by + // computing the loss gradient with respect to net final output + // in compute_gradients(). + } + + fn compute_gradients(&self, backend: &B, context: &mut Context) { + let probabilities_gradient = context.acquire_data_gradient(self.descriptor.input(0)); + let labels = context.get_data(self.descriptor.input(1)); + + let native = native_backend(); + let labels_data = labels.borrow(); + let native_labels = labels_data.read(native.device()).unwrap().as_slice::(); + let mut writable_gradient = vec![0f32; probabilities_gradient.borrow().desc().size()]; + + for (batch_n, &label_value) in native_labels.iter().enumerate() { + let index = (self.num_classes * batch_n) + label_value as usize; + writable_gradient[index] = -1f32; + } + crate::util::write_to_memory( + probabilities_gradient + .borrow_mut() + .write_only(native.device()) + .unwrap(), + &writable_gradient, + ); + } + + fn descriptor(&self) -> &Descriptor { + &self.descriptor + } + + fn descriptor_mut(&mut self) -> &mut Descriptor { + &mut self.descriptor + } +} diff --git a/juice/src/net/mod.rs b/juice/src/net/mod.rs new file mode 100644 index 000000000..faa3ef09a --- /dev/null +++ b/juice/src/net/mod.rs @@ -0,0 +1,34 @@ +//! Representation of a neural network. +//! +//! Network consists of 2 parts: +//! 1. Static layer configuration and connections between them. Each layer takes data as input, +//! performs certain operation which produced the output. Layers can be combined into +//! hierarchical structures via special container layers. Configuration of each layer, as +//! well as connections between them is static, i.e. it cannot change in runtime. The shape +//! of data 'units' in inputs/outputs is also fixed, but the batch size (batch comprises +//! several data 'units') is not. Layer inputs/outputs as well as their connections is +//! captured in a `Descriptor`, which also contains information about learnable params. +//! 2. Dynamic state representing (partial) data flow through the network. When doing the forward +//! pass through the net (converting inputs to outputs), all the intermediate state (data +//! buffers passed between layers) is saved in a `Context`, which allows reuse of this data +//! when doing the backpropagation step (intermediate data for backpropagation, for example, +//! gradients, is also saved in the `Context`). Context also defines the batch size for this +//! particular instance of exercising the network, which allows to use different batch sizes +//! for different use cases (for example simultaneous learning with batches and using the net +//! for producing outputs by setting batch size to 1). + +mod activation; +mod common; +mod config; +mod container; +mod context; +mod descriptor; +mod layer; +mod loss; +mod network; + +pub use config::*; +pub use context::*; +pub use descriptor::*; +pub use layer::*; +pub use network::*; diff --git a/juice/src/net/network.rs b/juice/src/net/network.rs new file mode 100644 index 000000000..c2a3ccedb --- /dev/null +++ b/juice/src/net/network.rs @@ -0,0 +1,159 @@ +use std::fs; +use std::io; +use std::path; + +use crate::net_capnp::network as capnp_network; + +use crate::capnp_util::*; +use crate::co::frameworks::native::get_native_backend; +use crate::co::{IBackend, ITensorDesc, SharedTensor, TensorDesc}; +use crate::coblas::plugin::Copy; +use crate::net::layer::Layer; +use crate::net::{layer_from_config, Context, Descriptor, Inout, LayerConfig}; +use crate::util::LayerOps; + +// A trainable network. +pub struct Network> { + config: LayerConfig, + top: Box>, +} + +impl + 'static> Network { + /// Creates network from a config. + pub fn from_config(config: LayerConfig, input_shapes: &[TensorDesc]) -> Network { + let inputs = input_shapes + .iter() + .enumerate() + .map(|(i, shape)| Inout::new_with_path(shape.clone(), &format!("net_in_{}", i))) + .collect(); + let descriptor = Descriptor::top("net", inputs); + let top = layer_from_config(descriptor, &config); + + Network { + config: config, + top: top, + } + } + + /// Top level layer (typically a container layer). + pub fn top(&self) -> &dyn Layer { + self.top.as_ref() + } + + /// Mutable top level layer (typically a container layer). + pub fn top_mut(&mut self) -> &mut dyn Layer { + self.top.as_mut() + } + + /// Do a forward pass on the provided inputs and return the network output. + /// This function assumes the network has exactly one input and exactly one output + /// (will panic otherwise). + /// Input shape must be either [] or [N, ] + /// (latter case for batched processing). + /// Returns a tensor of shape which is either [] or [N, ], + /// depending on the input shape. + pub fn transform(&self, backend: &B, input: &SharedTensor) -> SharedTensor { + assert_eq!(self.top.descriptor().inputs().len(), 1); + assert_eq!(self.top.descriptor().outputs().len(), 1); + + // Figure out batch size. + let net_input_size = self.top.descriptor().input(0).unit_shape().size(); + let batch_size = if input.desc().size() == net_input_size { + 1 + } else { + assert!(input.desc().len() > 1); + let input_unit_size = input.desc().iter().skip(1).fold(1, |acc, i| acc * i); + assert_eq!(input_unit_size, net_input_size); + input.desc()[0] + }; + + let mut context = Context::new(batch_size); + + // Copy input data into the context. + let context_inputs = context.acquire_data(self.top.descriptor().input(0)); + assert_eq!(context_inputs.borrow().desc().size(), input.desc().size()); + backend.copy(&input, &mut context_inputs.borrow_mut()).unwrap(); + + // Compute network output and take it out of the context as a return value. + self.top.compute_output(backend, &mut context); + context.take_data(self.top.descriptor().output(0)) + } + + pub fn save>(&mut self, path: P) -> io::Result<()> { + let path = path.as_ref(); + let ref mut out = fs::File::create(path)?; + + let mut message = ::capnp::message::Builder::new_default(); + { + let mut net_message = message.init_root::(); + self.write_capnp(&mut net_message); + } + ::capnp::serialize_packed::write_message(out, &message).unwrap(); + + Ok(()) + } +} + +impl + 'static> Clone for Network { + fn clone(&self) -> Network { + let input_shapes: Vec = self + .top + .descriptor() + .inputs() + .iter() + .map(|input| input.unit_shape().clone()) + .collect(); + let net = Network::from_config(self.config.clone(), &input_shapes); + + // Copy weights data. + let backend = get_native_backend(); + assert_eq!( + self.top.descriptor().params().len(), + net.top.descriptor().params().len() + ); + for i in 0..self.top.descriptor().params().len() { + let from_params = self.top.descriptor().params()[i].borrow(); + let mut to_params = net.top.descriptor().params()[i].borrow_mut(); + backend.copy(&from_params.data, &mut to_params.data).unwrap(); + to_params.learning_rate = from_params.learning_rate; + } + + net + } +} + +impl<'a, B: IBackend + LayerOps> CapnpWrite<'a> for Network { + type Builder = capnp_network::Builder<'a>; + + /// Write the Layer into a capnp message. + fn write_capnp(&self, builder: &mut Self::Builder) { + // Write top-level cofnig. + { + // let mut config_message = builder.reborrow().init_config(); + // self.config.write_capnp(&mut config_message); + } + + // Write input shapes. + { + let input_shapes: Vec = self + .top + .descriptor() + .inputs() + .iter() + .map(|input| input.unit_shape().clone()) + .collect(); + + let inputs_count = self.top.descriptor().inputs().len(); + let mut inputs_message = builder.reborrow().init_inputs(input_shapes.len() as u32); + for i in 0..input_shapes.len() { + let mut input_message = inputs_message.reborrow().get(i as u32); + let mut vals = input_message + .reborrow() + .init_shape(input_shapes[i].len() as u32); + for j in 0..input_shapes[i].len() { + vals.set(j as u32, input_shapes[i][j] as u64); + } + } + } + } +} diff --git a/juice/src/solver/mod.rs b/juice/src/solver/mod.rs index 78af6986f..23994482e 100644 --- a/juice/src/solver/mod.rs +++ b/juice/src/solver/mod.rs @@ -6,79 +6,117 @@ pub mod confusion_matrix; pub mod regression_evaluator; +use std::cell::RefCell; + +use std::rc::Rc; + pub use self::confusion_matrix::ConfusionMatrix; pub use self::regression_evaluator::{RegressionEvaluator, RegressionLoss}; use crate::co::prelude::*; -use crate::layer::*; -use crate::layers::SequentialConfig; +use crate::net::*; use crate::solvers::*; -use std::marker::PhantomData; use crate::util::{ArcLock, LayerOps, SolverOps}; -use std::rc::Rc; #[derive(Debug)] /// Solver that optimizes a [Layer][1] with a given objective. /// [1]: ../layer/index.html -pub struct Solver +pub struct Solver where - SolverB: IBackend + SolverOps, B: IBackend + LayerOps, { - net: Layer, - objective: Layer, + backend: Rc, + context: Context, + + net: Box>, + objective: Box>, /// The implementation of the Solver - pub worker: Box>, + pub worker: Box>, config: SolverConfig, /// The current iteration / number of times weights have been updated iter: usize, - - solver_backend: PhantomData, } -impl Solver +impl Solver where - SolverB: IBackend + SolverOps + 'static, - B: IBackend + LayerOps + 'static, + B: IBackend + LayerOps + SolverOps + 'static, { /// Create Solver from [SolverConfig][1] /// [1]: ./struct.SolverConfig.html /// /// This is the **preferred method** to create a Solver for training a neural network. - pub fn from_config(net_backend: Rc, obj_backend: Rc, config: &SolverConfig) -> Solver { - let network = Layer::from_config(net_backend, &config.network); - let mut worker = config.solver.with_config(obj_backend.clone(), &config); - worker.init(&network); + pub fn from_config( + backend: Rc, + config: &SolverConfig, + input_shapes: &[TensorDesc], + label_shape: &TensorDesc, + ) -> Solver { + let context = Context::new(config.minibatch_size); + let network = layer_from_config( + Descriptor::top( + "net", + input_shapes + .iter() + .enumerate() + .map(|(i, shape)| Inout::new_with_path(shape.clone(), &format!("net_in_{}", i))) + .collect(), + ), + &config.network, + ); + assert_eq!(network.descriptor().outputs().len(), 1); // Net must have only one output. + + let objective = layer_from_config( + Descriptor::top( + "loss", + vec![ + network.descriptor().output(0).clone(), + Inout::new_with_path(label_shape.clone(), "labels"), + ], + ), + &config.objective, + ); + + let weight_shapes: Vec = network + .descriptor() + .params() + .iter() + .map(|w| w.borrow().data.desc().clone()) + .collect(); + let mut worker = config.solver.with_config(backend.clone(), &config); + worker.init(&weight_shapes); + + // Loss layer cannot have params (no one will train them!). + assert!(objective.descriptor().params().is_empty()); Solver { + backend: backend, + context: context, worker: worker, net: network, - objective: Layer::from_config(obj_backend, &config.objective), + objective: objective, iter: 0, - config: config.clone(), - solver_backend: PhantomData::, } } } -impl Solver +impl Solver where - SolverB: IBackend + SolverOps + 'static, - B: IBackend + LayerOps + 'static, + B: IBackend + LayerOps + SolverOps + 'static, { - fn init(&mut self, backend: Rc) { + fn init(&mut self, backend: Rc, input_shapes: &[TensorDesc]) { info!("Initializing solver from configuration"); - let mut config = self.config.clone(); - self.init_net(backend, &mut config); + let config = self.config.clone(); + self.init_net(backend, &config, input_shapes); } /// Initialize the training net - fn init_net(&mut self, backend: Rc, param: &mut SolverConfig) { - self.net = Layer::from_config(backend, ¶m.network); + fn init_net(&mut self, backend: Rc, param: &SolverConfig, input_shapes: &[TensorDesc]) { + unimplemented!(); + //self.net = layer_from_config(&*backend, ¶m.network, input_shapes); } /// Train the network with one minibatch @@ -86,38 +124,79 @@ where &mut self, mb_data: ArcLock>, mb_target: ArcLock>, - ) -> ArcLock> { - // forward through network and classifier - let network_out = self.net.forward(&[mb_data])[0].clone(); - let _ = self.objective.forward(&[network_out.clone(), mb_target]); - - // forward through network and classifier - let classifier_gradient = self.objective.backward(&[]); - self.net.backward(&classifier_gradient[0..1]); + ) -> SharedTensor { + // Copy intput data into the network context. + let data = self.context.acquire_data(self.net.descriptor().input(0)); + self.backend + .copy(&mb_data.read().unwrap(), &mut data.borrow_mut()) + .unwrap(); + let labels = self + .context + .acquire_data(self.objective.descriptor().input(1)); + self.backend + .copy(&mb_target.read().unwrap(), &mut labels.borrow_mut()) + .unwrap(); + + // Compute network output and the loss. + self.net.compute_output(&*self.backend, &mut self.context); + self.objective + .compute_output(&*self.backend, &mut self.context); + + // Compute params gradients by doing a backpropagation on the network. + self.objective + .compute_gradients(&*self.backend, &mut self.context); + self.net + .compute_gradients(&*self.backend, &mut self.context); + + // Let the solver worker adjust the params gradients before applying them to params. + let params: Vec = + self.net.descriptor().params().iter().cloned().collect(); + let params_gradients: Vec<(Rc>>, f32)> = params + .iter() + .map(|p| { + ( + self.context.get_params_gradient(p), + p.borrow().learning_rate, + ) + }) + .collect(); + self.worker + .compute_update(&self.config, ¶ms_gradients, self.iter); + + // Finally apply the weight change. + let shared_a = crate::util::native_scalar(-1f32); + for i in 0..self.net.descriptor().params().len() { + let gradient = ¶ms_gradients[i].0.borrow(); + let params = &mut self.net.descriptor_mut().param(i).borrow_mut().data; + self.backend.axpy(&shared_a, gradient, params).unwrap(); + } - self.worker.compute_update(&self.config, &mut self.net, self.iter); - self.net.update_weights(self.worker.backend()); self.iter += 1; + let out_buffer = self.context.get_data(self.net.descriptor().output(0)); + let mut network_out = SharedTensor::::new(out_buffer.borrow().desc()); + self.backend + .copy(&out_buffer.borrow(), &mut network_out) + .unwrap(); network_out } - /// Returns the network trained by the solver. - /// - /// This is the recommended method to get a usable trained network. - pub fn network(&self) -> &Layer { - &self.net - } - - /// Returns the network trained by the solver. - /// - /// This is the recommended method to get a trained network, - /// if you want to alter the network. Keep in mind that altering the network - /// might render the solver unusable and continuing training the network with it will yield - /// unexpected results. - pub fn mut_network(&mut self) -> &mut Layer { - &mut self.net - } + // /// Returns the network trained by the solver. + // /// + // /// This is the recommended method to get a usable trained network. + // pub fn network(&self) -> &Layer { + // &self.net + // } + + // /// Returns the network trained by the solver. + // /// + // /// This is the recommended method to get a trained network, + // /// if you want to alter the network. Keep in mind that altering the network + // /// might render the solver unusable and continuing training the network with it will yield + // /// unexpected results. + // pub fn mut_network(&mut self) -> &mut Layer { + // &mut self.net + // } } /// Implementation of a specific Solver. @@ -130,7 +209,7 @@ where SolverB: IBackend + SolverOps, { /// Initialize the solver, setting up any network related data. - fn init(&mut self, net: &Layer) {} + fn init(&mut self, weight_shapes: &[TensorDesc]) {} /// Update the weights of the net with part of the gradient. /// @@ -143,7 +222,12 @@ where /// Used by [step][2] to optimize the network. /// /// [2]: ./struct.Solver.html#method.step - fn compute_update(&mut self, param: &SolverConfig, network: &mut Layer, iter: usize); + fn compute_update( + &mut self, + param: &SolverConfig, + weight_gradients: &[(Rc>>, f32)], + iter: usize, + ); /// Returns the backend used by the solver. fn backend(&self) -> &SolverB; @@ -240,8 +324,8 @@ impl Default for SolverConfig { fn default() -> SolverConfig { SolverConfig { name: "".to_owned(), - network: LayerConfig::new("default", SequentialConfig::default()), - objective: LayerConfig::new("default", SequentialConfig::default()), + network: LayerConfig::default(), + objective: LayerConfig::default(), solver: SolverKind::SGD(SGDKind::Momentum), minibatch_size: 1, @@ -354,7 +438,10 @@ pub enum SolverKind { impl SolverKind { /// Create a Solver of the specified kind with the supplied SolverConfig. - pub fn with_config + 'static, NetB: IBackend + LayerOps + 'static>( + pub fn with_config< + B: IBackend + SolverOps + 'static, + NetB: IBackend + LayerOps + 'static, + >( &self, backend: Rc, config: &SolverConfig, @@ -375,7 +462,10 @@ pub enum SGDKind { impl SGDKind { /// Create a Solver of the specified kind with the supplied SolverConfig. - pub fn with_config + 'static, NetB: IBackend + LayerOps + 'static>( + pub fn with_config< + B: IBackend + SolverOps + 'static, + NetB: IBackend + LayerOps + 'static, + >( &self, backend: Rc, config: &SolverConfig, diff --git a/juice/src/solvers/mod.rs b/juice/src/solvers/mod.rs index 08ab06916..cad708159 100644 --- a/juice/src/solvers/mod.rs +++ b/juice/src/solvers/mod.rs @@ -27,12 +27,14 @@ //! [minimum]: http://mathworld.wolfram.com/GlobalMinimum.html //! [backprop]: https://en.wikipedia.org/wiki/Backpropagation +use std::rc::Rc; +use std::cell::RefCell; + #[allow(unused_import_braces)] pub use self::sgd::Momentum; pub mod sgd; use crate::co::{IBackend, SharedTensor}; -use crate::layer::*; use crate::solver::*; use crate::util::*; @@ -40,10 +42,10 @@ trait SGDSolver, NetB: IBackend + LayerOps>, + weight_gradient: &mut SharedTensor, history_blob_id: usize, - global_lr: &f32, - blob_lr: &f32, + global_lr: f32, + blob_lr: f32, ); /// [Clip gradients][1] when they exceed [SolverConfig.clip_gradients][2]. @@ -59,20 +61,19 @@ trait SGDSolver, NetB: IBackend + LayerOps + 'static>(&self, config: &SolverConfig, net: &mut Layer) { + fn clip_gradients(&self, config: &SolverConfig, weight_gradients: &[Rc>>]) { // skip clipping gradients if SolverConfig.clip_gradients is set to None if let Some(clip_threshold) = config.clip_gradients { let native = native_backend(); - let net_gradients = net.learnable_weights_gradients(); let mut sumsq_diff = 0f32; let backend = self.backend(); - for net_gradient in net_gradients.clone() { - let gradient = net_gradient.read().unwrap(); + for net_gradient in weight_gradients { + let gradient = &net_gradient.borrow(); // PERF: preallocate tensor once let mut result = SharedTensor::new(&[1]); // gradient.sumsq_diff(self.backend(), &mut result); - self.backend().dot(&gradient, &gradient, &mut result); + self.backend().dot(gradient, gradient, &mut result); let sumsq_diff_slice = result.read(native.device()).unwrap().as_slice::(); sumsq_diff += sumsq_diff_slice[0]; @@ -88,8 +89,8 @@ trait SGDSolver, NetB: IBackend + LayerOps, NetB: IBackend + LayerOps /// E.g. with a `minibatch_size` of 4 we need to scale the gradient by 0.25 (= 1/4). - fn normalize(&self, config: &SolverConfig, weight_blob: &ArcLock>) { + fn normalize(&self, config: &SolverConfig, weight_blob: &mut SharedTensor) { if config.minibatch_size > 1 { let scale_factor = 1f32 / config.minibatch_size as f32; - let mut gradient = weight_blob.write().unwrap(); let native = native_backend(); let mut scale_factor_shared = native_scalar(scale_factor); // self.backend().scal_plain(&scale_factor_shared, &mut gradient).unwrap(); - self.backend().scal(&mut scale_factor_shared, &mut gradient).unwrap(); + self.backend().scal(&mut scale_factor_shared, weight_blob).unwrap(); } } diff --git a/juice/src/solvers/sgd/mod.rs b/juice/src/solvers/sgd/mod.rs index e39f22f53..cd852b536 100644 --- a/juice/src/solvers/sgd/mod.rs +++ b/juice/src/solvers/sgd/mod.rs @@ -17,7 +17,6 @@ //! //! [backprop]: https://en.wikipedia.org/wiki/Backpropagation //! [gd]: https://en.wikipedia.org/wiki/Gradient_descent - pub use self::momentum::Momentum; /// Implement [ISolver][1] for [SGD solvers][2]. @@ -26,39 +25,49 @@ pub use self::momentum::Momentum; #[macro_export] macro_rules! impl_isolver_sgd { ($t:ty) => { - impl, NetB: IBackend + LayerOps + 'static> ISolver - for $t + impl, NetB: IBackend + LayerOps + 'static> + ISolver for $t { /// Initialize the SGD Momentum solver, allocating memory for its history. - fn init(&mut self, net: &Layer) { - self.history = Vec::with_capacity(net.learnable_weights_gradients().len()); + fn init(&mut self, weight_shapes: &[TensorDesc]) { + self.history = Vec::with_capacity(weight_shapes.len()); - for weight_gradient in net.learnable_weights_gradients() { - let shape = weight_gradient.read().unwrap().desc().clone(); - let mut tensor = SharedTensor::new(&shape); + for shape in weight_shapes { + let mut tensor = SharedTensor::new(shape); let filler = crate::weight::FillerType::Constant { value: 0f32 }; filler.fill(&mut tensor); - let history_tensor = Arc::new(RwLock::new(tensor)); - self.history.push(history_tensor); + self.history.push(tensor); } } - fn compute_update(&mut self, config: &SolverConfig, net: &mut Layer, iter: usize) { + fn compute_update( + &mut self, + config: &SolverConfig, + weight_gradients: &[(Rc>>, f32)], + iter: usize, + ) { let rate = config.get_learning_rate(iter); - SGDSolver::::clip_gradients(self, config, net); - for (weight_id, weight_gradient) in net.learnable_weights_gradients().iter().enumerate() { - SGDSolver::::normalize(self, config, weight_gradient); + SGDSolver::::clip_gradients( + self, + config, + &weight_gradients.iter().map(|(t, r)| t.clone()).collect::>(), + ); + for (weight_id, (weights_data, learning_rate)) in + weight_gradients.iter().enumerate() + { + SGDSolver::::normalize(self, config, &mut weights_data.borrow_mut()); + // SGDSolver::::regularize(self, config, weight_gradient, net.weights_weight_decay()[weight_id]); SGDSolver::::compute_update_value( self, config, - weight_gradient, + &mut weights_data.borrow_mut(), weight_id, - &rate, - &net.learnable_weights_lr()[weight_id].unwrap(), + rate, + *learning_rate, ); } } diff --git a/juice/src/solvers/sgd/momentum.rs b/juice/src/solvers/sgd/momentum.rs index ab396091b..767cd9247 100644 --- a/juice/src/solvers/sgd/momentum.rs +++ b/juice/src/solvers/sgd/momentum.rs @@ -14,12 +14,12 @@ //! It also makes solving more stable. use crate::co::prelude::*; -use crate::layer::*; + use crate::solver::*; use crate::solvers::SGDSolver; use crate::util::*; use std::rc::Rc; -use std::sync::{Arc, RwLock}; +use std::cell::RefCell; #[derive(Debug)] /// Stochastic Gradient Descent with Momentum. @@ -28,7 +28,7 @@ use std::sync::{Arc, RwLock}; /// [1]: ./index.html pub struct Momentum> { /// The gradient update from the previous iteration for each blob. - history: Vec>>, + history: Vec>, /// The backend used for computing the gradient. backend: Rc, @@ -56,14 +56,16 @@ impl> Momentum { } } -impl, NetB: IBackend + LayerOps + 'static> SGDSolver for Momentum { +impl, NetB: IBackend + LayerOps + 'static> SGDSolver + for Momentum +{ fn compute_update_value( &mut self, config: &SolverConfig, - weight_gradient: &ArcLock>, + weight_gradient: &mut SharedTensor, history_blob_id: usize, - global_lr: &f32, - blob_lr: &f32, + global_lr: f32, + blob_lr: f32, ) { // PERF: check if value is changed before writing it crate::weight::FillerType::Constant { @@ -71,24 +73,25 @@ impl, NetB: IBackend + LayerOps + 'static> SGD } .fill(&mut self.lr); - crate::weight::FillerType::Constant { value: config.momentum }.fill(&mut self.momentum); + crate::weight::FillerType::Constant { + value: config.momentum, + } + .fill(&mut self.momentum); - let backend = ISolver::::backend(self); + let backend : &B = &self.backend; let device = IBackend::device(backend); - let history_blob = &self.history[history_blob_id]; + let history_blob = &mut self.history[history_blob_id]; Axpby::axpby( backend, &self.lr, - &weight_gradient.read().unwrap(), + weight_gradient, &self.momentum, - &mut history_blob.write().unwrap(), + history_blob, ) .unwrap(); - backend - .copy(&history_blob.read().unwrap(), &mut weight_gradient.write().unwrap()) - .unwrap(); + backend.copy(history_blob, weight_gradient).unwrap(); } } diff --git a/juice/src/train/mod.rs b/juice/src/train/mod.rs new file mode 100644 index 000000000..c07e5d40f --- /dev/null +++ b/juice/src/train/mod.rs @@ -0,0 +1,5 @@ +mod trainer; +mod optimizer; + +pub use optimizer::*; +pub use trainer::*; \ No newline at end of file diff --git a/juice/src/train/optimizer/adam.rs b/juice/src/train/optimizer/adam.rs new file mode 100644 index 000000000..10da716c1 --- /dev/null +++ b/juice/src/train/optimizer/adam.rs @@ -0,0 +1,140 @@ +//! Adam optimizer. +//! Computes the update Vᵢ from params gradient ∇ᵢ as: +//! Mᵢ = β₁Mᵢ₋₁ + (1-β₁)∇ᵢ, +//! Sᵢ = β₂Sᵢ₋₁ + (1-β₂)∇ᵢ⊙∇ᵢ, +//! M₀ = 0, +//! S₀ = 0, +//! M̂ᵢ = Mᵢ/(1-β₁ᵗ), +//! Ŝᵢ = Sᵢ/(1-β₂ᵗ), +//! Vᵢ = M̂ᵢ⊘(√Ŝᵢ+ε), +//! where: +//! ⊙ - pointwise multiplication, +//! ⊘ - pointwise division, +//! β₁, β₂ - averaging parameters (typically set to 0.9 and 0.999 respectively), +//! ε - small constant to prevent division by zero (typically 1e-8). +//! +//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific +//! learning rates.) + +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +use crate::train::Optimizer; +use crate::util::native_backend; +use crate::weight::FillerType; +use co::prelude::*; + +#[derive(Clone, Debug)] +pub struct AdamConfig { + pub beta1: f32, + pub beta2: f32, + pub epsilon: f32, +} + +pub struct Adam { + // First gradient moment (Mᵢ). + first_moments: HashMap>, + // Second gradient moment (Sᵢ). + second_moments: HashMap>, + + // Original β₁ as well as raised to t-th power (β₁ᵗ). + beta1: f32, + beta1_nth: f32, + // Original β₂ as well as raised to t-th power (β₂ᵗ). + beta2: f32, + beta2_nth: f32, + + epsilon: f32, +} + +impl Default for AdamConfig { + fn default() -> Self { + AdamConfig { + beta1: 0.9, + beta2: 0.999, + epsilon: 1.0e-8, + } + } +} + +impl Adam { + pub fn new(config: &AdamConfig) -> Self { + Adam { + first_moments: HashMap::new(), + second_moments: HashMap::new(), + beta1: config.beta1, + beta1_nth: config.beta1, + beta2: config.beta2, + beta2_nth: config.beta2, + epsilon: config.epsilon, + } + } +} + +// TODO: Rewrite with backend ops (requires element-wise square and square root support). +impl Optimizer for Adam { + fn adjust_weight_change( + &mut self, + backend: &B, + weight_changes: &HashMap>>>, + ) { + let native = native_backend(); + + for (key, change) in weight_changes { + let mut change_ref = change.borrow_mut(); + + let first_moment = self.first_moments.entry(*key).or_insert_with(|| { + let mut tensor = SharedTensor::new(change_ref.desc()); + FillerType::fill_constant(&mut tensor, 0.0); + tensor + }); + let second_moment = self.second_moments.entry(*key).or_insert_with(|| { + let mut tensor = SharedTensor::new(change_ref.desc()); + FillerType::fill_constant(&mut tensor, 0.0); + tensor + }); + + let len = change_ref.desc().size(); + + let change_slice = change_ref + .read_write(native.device()) + .unwrap() + .as_mut_slice::(); + let first_moment_slice = first_moment + .read_write(native.device()) + .unwrap() + .as_mut_slice::(); + let second_moment_slice = second_moment + .read_write(native.device()) + .unwrap() + .as_mut_slice::(); + + // We can rewrite the matrix equations at the top of this file in a element-wise form: + // Mᵢ[j] = β₁Mᵢ₋₁[j] + (1-β₁)∇ᵢ[j] + // Sᵢ[j] = β₂Sᵢ₋₁[j] + (1-β₂)∇ᵢ[j]² + // Vᵢ[j] = Mᵢ[j] / ((1-β₁ᵗ)•√(Sᵢ[j]/(1-β₂ᵗ) + ε) + for j in 0..len { + // ∇ᵢ[j]. + let w = change_slice[j]; + // Mᵢ[j], M̂ᵢ[j]. + let m = self.beta1 * first_moment_slice[j] + (1.0 - self.beta1) * w; + let m_hat = m / (1.0 - self.beta1_nth); + // Sᵢ[j], Ŝᵢ[j]. + let s = self.beta2 * second_moment_slice[j] + (1.0 - self.beta2) * w * w; + let s_hat = s / (1.0 - self.beta2_nth); + // Vᵢ[j]. + let v = m_hat / (s_hat.sqrt() + self.epsilon); + + assert!(!v.is_nan()); + + change_slice[j] = v; + first_moment_slice[j] = m; + second_moment_slice[j] = s; + } + } + + self.beta1_nth *= self.beta1; + self.beta2_nth *= self.beta2; + } +} diff --git a/juice/src/train/optimizer/mod.rs b/juice/src/train/optimizer/mod.rs new file mode 100644 index 000000000..4b5d981c1 --- /dev/null +++ b/juice/src/train/optimizer/mod.rs @@ -0,0 +1,48 @@ +mod adam; +mod sgd_momentum; + +use std::rc::Rc; +use std::cell::RefCell; +use std::collections::HashMap; +use std::default::Default; + +use crate::coblas::plugin::Copy; +use co::prelude::*; +use crate::util::Axpby; + +use adam::Adam; +use sgd_momentum::SgdWithMomentum; + +// Expose configs publicly. +pub use adam::AdamConfig; +pub use sgd_momentum::SgdWithMomentumConfig; + +// A gradient descent optimization algorithm. +pub trait Optimizer { + // Called on each minibatch training cycle. Takes all weight gradients computed during + // backpropagation (indexed by an opaque key which is guaranteed to be stable for the + // duration of the program). + // Modifies the changes in-place; modified changes will then be applied to the weights: + // W = W - α•change, + // where α is the learning rate (combined from global and param-specific rates). + fn adjust_weight_change(&mut self, backend: &B, weight_changes: &HashMap>>>); +} + +#[derive(Clone, Debug)] +pub enum OptimizerConfig { + SgdWithMomentum(SgdWithMomentumConfig), + Adam(AdamConfig), +} + +impl Default for OptimizerConfig { + fn default() -> Self { + OptimizerConfig::SgdWithMomentum(Default::default()) + } +} + +pub fn optimizer_from_config + Copy>(config: &OptimizerConfig) -> Box> { + match config { + OptimizerConfig::SgdWithMomentum(sgd_config) => Box::new(SgdWithMomentum::new(sgd_config)), + OptimizerConfig::Adam(adam_config) => Box::new(Adam::new(adam_config)), + } +} \ No newline at end of file diff --git a/juice/src/train/optimizer/sgd_momentum.rs b/juice/src/train/optimizer/sgd_momentum.rs new file mode 100644 index 000000000..a73f39087 --- /dev/null +++ b/juice/src/train/optimizer/sgd_momentum.rs @@ -0,0 +1,83 @@ +//! SGD with momentum. +//! Computes the update Vᵢ from params gradient ∇ᵢ as: +//! Vᵢ = (1-β)Vᵢ₋₁ + β∇ᵢ, +//! V₀ = 0, +//! where: +//! β is the momentum parameter (typically set to 0.1). +//! +//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific +//! learning rates.) + +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +use crate::coblas::plugin::Copy; +use crate::train::Optimizer; +use crate::util::{native_scalar, Axpby}; +use crate::weight::FillerType; +use co::prelude::*; + +#[derive(Clone, Debug)] +pub struct SgdWithMomentumConfig { + pub momentum: f32, +} + +pub struct SgdWithMomentum { + history: HashMap>, + // Precomputed tensor constants. + zero: SharedTensor, + momentum: SharedTensor, + one_minus_momentum: SharedTensor, +} + +impl Default for SgdWithMomentumConfig { + fn default() -> Self { + SgdWithMomentumConfig { momentum: 0.1 } + } +} + +impl SgdWithMomentum { + pub fn new(config: &SgdWithMomentumConfig) -> Self { + SgdWithMomentum { + history: HashMap::new(), + zero: native_scalar(0.0), + momentum: native_scalar(config.momentum), + one_minus_momentum: native_scalar(1.0 - config.momentum), + } + } +} + +impl + Copy> Optimizer for SgdWithMomentum { + fn adjust_weight_change( + &mut self, + backend: &B, + weight_changes: &HashMap>>>, + ) { + for (key, change) in weight_changes { + let mut change_ref = change.borrow_mut(); + + let history = self.history.entry(*key).or_insert_with(|| { + let mut tensor = SharedTensor::new(change_ref.desc()); + FillerType::fill_constant(&mut tensor, 0.0); + tensor + }); + + // Make sure the params shape didn't change under us. + assert_eq!(history.desc().size(), change_ref.desc().size()); + + // Compute Vᵢ=(1-β)Vᵢ₋₁ + β∇. + backend + .axpby( + &self.momentum, + &change_ref, + &self.one_minus_momentum, + history, + ) + .unwrap(); + + // Copy Vᵢ to the weight change which should hold the return value. + backend.copy(history, &mut change_ref).unwrap(); + } + } +} diff --git a/juice/src/train/trainer.rs b/juice/src/train/trainer.rs new file mode 100644 index 000000000..84746f467 --- /dev/null +++ b/juice/src/train/trainer.rs @@ -0,0 +1,168 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +use crate::co::prelude::*; +use crate::net::*; +use crate::train::{optimizer_from_config, Optimizer, OptimizerConfig, SgdWithMomentumConfig}; +use crate::util::{format_tensor, SolverOps}; + +#[derive(Clone)] +pub struct TrainerConfig { + pub batch_size: usize, + pub objective: LayerConfig, + pub optimizer: OptimizerConfig, + pub learning_rate: f32, +} + +/// Trains a network through minibatch backpropagation. +/// +/// Trains a network doing backpropagation from a configured output. For multi-output +/// networks, several Trainers can be constructed (each with its own loss function) +/// to perform asynchronous training. +/// Doesn't own the network. +pub struct Trainer { + config: TrainerConfig, + + // Objective (loss) function used for backpropagation. + objective: Box>, + + optimizer: Box>, + + iter: usize, +} + +fn key_from_rc(rc: &Rc>) -> usize { + Rc::as_ptr(rc) as usize +} + +impl Default for TrainerConfig { + fn default() -> Self { + TrainerConfig { + batch_size: 32, + objective: LayerConfig::MeanSquaredError, + optimizer: OptimizerConfig::SgdWithMomentum(SgdWithMomentumConfig::default()), + learning_rate: 0.001, + } + } +} + +impl + 'static> Trainer { + pub fn from_config( + backend: &B, + config: &TrainerConfig, + net: &Network, + label_shape: &TensorDesc, + ) -> Self { + // Create objective. + let objective_descriptor = Descriptor::top( + "loss", + vec![ + net.top().descriptor().output(0).clone(), + Inout::new_with_path(label_shape.clone(), "labels"), + ], + ); + let objective = layer_from_config(objective_descriptor, &config.objective); + let optimizer = optimizer_from_config(&config.optimizer); + + Trainer { + config: (*config).clone(), + objective: objective, + optimizer: optimizer, + iter: 0, + } + } + + pub fn train_minibatch( + &mut self, + backend: &B, + net: &mut Network, + inputs: &SharedTensor, + labels: &SharedTensor, + ) -> SharedTensor { + trace!("Inputs:\n{}", format_tensor(inputs)); + trace!("Labels:\n{}", format_tensor(labels)); + + let batch_size = inputs.desc()[0]; + assert_eq!(batch_size, labels.desc()[0]); + + let mut context = Context::new(batch_size); + + // Copy intput and label data into the context. + // Copy inputs. + let context_inputs = context.acquire_data(net.top().descriptor().input(0)); + assert_eq!(context_inputs.borrow().desc().size(), inputs.desc().size()); + backend + .copy(&inputs, &mut context_inputs.borrow_mut()) + .unwrap(); + // Copy labels. + let context_labels = context.acquire_data(self.objective.descriptor().input(1)); + backend + .copy(&labels, &mut context_labels.borrow_mut()) + .unwrap(); + + // Compute network output and the loss. + net.top().compute_output(backend, &mut context); + self.objective.compute_output(backend, &mut context); + + // Compute params gradients by doing a backpropagation on the network. + self.objective.compute_gradients(backend, &mut context); + trace!( + "Loss gradient:\n{}", + format_tensor( + &context + .get_data_gradient(self.objective.descriptor().input(0)) + .borrow() + ) + ); + net.top().compute_gradients(backend, &mut context); + + // Collect computed gradients. + let params_gradients: HashMap>>> = net + .top() + .descriptor() + .params() + .iter() + .map(|p| { + ( + key_from_rc(p), + // Use acquire* instead of get* to create missing gradients. This is necessary + // because backpropagation might not reach all layers. Created gradients + // are zero-filled and thus will not change weights when applied. + context.acquire_params_gradient(p), + ) + }) + .collect(); + + // Let the optimizer adjust the params gradients before applying them to params. + self.optimizer + .adjust_weight_change(backend, ¶ms_gradients); + + trace!("Gradient after worker:"); + params_gradients.iter().for_each(|(p, r)| { + trace!("{:?}: \n{}", p, format_tensor(&r.borrow())); + }); + + // Finally apply the weight change. + net.top().descriptor().params().iter().for_each(|p| { + let key = key_from_rc(p); + + let mut params = p.borrow_mut(); + let change = params_gradients.get(&key).unwrap().borrow(); + + // When applying the (optimized) gradient, additionally: + // 1. Normalize for batch size (multiply by 1/batch_size). + // 2. Multiply by individual params learning rate. + // 3. Multiply by global learning rate. + let a = crate::util::native_scalar( + -params.learning_rate * self.config.learning_rate / (self.config.batch_size as f32), + ); + + backend.axpy(&a, &change, &mut params.data).unwrap(); + }); + + self.iter += 1; + + context.take_data(net.top().descriptor().output(0)) + } +} diff --git a/juice/src/util.rs b/juice/src/util.rs index 581fb9f58..7b51c24ea 100644 --- a/juice/src/util.rs +++ b/juice/src/util.rs @@ -26,7 +26,11 @@ pub fn write_to_memory(mem: &mut FlatBox, data } /// Write into a native Coaster Memory with a offset. -pub fn write_to_memory_offset(mem: &mut FlatBox, data: &[T], offset: usize) { +pub fn write_to_memory_offset( + mem: &mut FlatBox, + data: &[T], + offset: usize, +) { let mem_buffer = mem.as_mut_slice::(); for (index, datum) in data.iter().enumerate() { // mem_buffer[index + offset] = *datum; @@ -41,7 +45,11 @@ pub fn write_to_memory_offset(mem: &mut FlatBo /// is assumed to be the batchsize. /// /// Allocates memory on a Native Backend if neccessary. -pub fn write_batch_sample(tensor: &mut SharedTensor, data: &[T], i: usize) { +pub fn write_batch_sample( + tensor: &mut SharedTensor, + data: &[T], + i: usize, +) { let native_backend = native_backend(); let tensor_desc = tensor.desc(); let batch_size = tensor_desc[0]; @@ -59,7 +67,10 @@ pub fn write_batch_sample(tensor: &mut SharedT pub fn native_scalar(scalar: T) -> SharedTensor { let native = native_backend(); let mut shared_scalar = SharedTensor::::new(&[1]); - write_to_memory(shared_scalar.write_only(native.device()).unwrap(), &[scalar]); + write_to_memory( + shared_scalar.write_only(native.device()).unwrap(), + &[scalar], + ); shared_scalar } @@ -72,6 +83,39 @@ pub fn cast_vec_usize_to_i32(input: Vec) -> Vec { out } +pub fn format_tensor(tensor: &SharedTensor) -> String { + let native = native_backend(); + + let mut output = String::new(); + + if tensor.desc().len() == 1 + || (tensor.desc().len() == 2 && (tensor.desc()[0] == 1 || tensor.desc()[1] == 1)) + { + // One-dimensional, print as a single row. + let native_data: &[f32] = tensor.read(native.device()).unwrap().as_slice(); + for v in native_data { + output += &format!("{:.5} ", v); + } + output += "\n"; + } else { + // Use first dimension a row number. + let rows = tensor.desc()[0]; + let columns = tensor.desc().iter().skip(1).fold(1, |acc, d| acc * d); + let native_data: &[f32] = tensor.read(native.device()).unwrap().as_slice(); + for row in 0..rows { + for col in 0..columns { + output += &format!("{:.5} ", native_data[row * columns + col]); + } + output += "\n"; + } + } + output +} + +pub fn print_tensor(tensor: &SharedTensor) { + println!("{}", &format_tensor(tensor)); +} + /// Extends IBlas with Axpby pub trait Axpby: Axpy + Scal { /// Performs the operation y := a*x + b*y . diff --git a/juice/tests/layer_specs.rs b/juice/tests/layer_specs.rs index b05f7a2f5..715bfa0bd 100644 --- a/juice/tests/layer_specs.rs +++ b/juice/tests/layer_specs.rs @@ -1,7 +1,7 @@ extern crate coaster as co; extern crate juice; -#[cfg(test)] +#[cfg(all(test, whatever))] mod layer_spec { use crate::co::prelude::*; use juice::layer::*; diff --git a/juice/tests/q_learning.rs b/juice/tests/q_learning.rs new file mode 100644 index 000000000..2fc0a7dc5 --- /dev/null +++ b/juice/tests/q_learning.rs @@ -0,0 +1,385 @@ +extern crate coaster; +extern crate juice; + +#[cfg(test)] +mod cartpole { + /// This test verifies Q-learning in a cartpole environment: + /// O + /// │ + /// │ + /// ┌──┴──┐ + /// │ ├───► F + /// └─────┘ + /// + /// "Cartpole" is an inverted pendulum on a plaftorm that can move in one dimension. + /// Agent must apply force to move the platform either to the left or to the right + /// in order to balance the pole in an upright position. Scenario ends when the pole + /// angle crosses a certain threshold. Agent is rewarded with R=1 for each cycle + /// of the scenario (so the longer the agent is able to keep pole from falling, the bigger + /// overall reward it gets). + /// + /// State "s" consists of [cart_pos, cart_vel, pole_angle, pole_angle_vel] variables. + /// Possible actions "a" are [left, right]. + /// Q function Q(s, a) is approximated by a neural network with trainable weights θ: Q(s, a, θ). + /// Network takes the 4 state variables as the input and outputs the the expected return value + /// for each action: + /// + /// [s1 s2 s3 s4] -> net -> [Q(s, left) Q(s, right)] + /// + /// During training, on each step agent observes the state "s", chooses the best action "a" + /// using an ε-greedy policy based on currently learned Q-function (that is, it takes a random + /// action with ε probability and an action "a" that maximizes Q(s, a) with probability 1-ε), + /// gets the reward "R" and observes the next step "s'". + /// The tuple [s, a, R, s'] is saved into experience replay buffer. + /// + /// For training, a batch is taken from the replay buffer. For each replay tuple [s, a, R, s'], + /// "s" becomes the network input and the label is computed as + /// + /// l = R + γ max Q*(s', a) + /// a + /// + /// where Q* is an earlier snapshot of Q (a snapshot is used instead of Q itself for stability). + /// Note that this gives the label value only for one of the actions; the full label is set to + /// + /// [l NaN] + /// + /// (assuming "a" was the first action). MSE loss function is assumed to ignore NaN values for + /// loss and backpropagation gradient computations. + use rand::{thread_rng, Rng}; + use std::collections::VecDeque; + use std::rc::Rc; + + use coaster::frameworks::native::get_native_backend; + use coaster::prelude::*; + use juice::net::*; + use juice::train::*; + use juice::util::{write_batch_sample, LayerOps}; + + const STATE_SIZE: usize = 4; + const ACTION_COUNT: usize = 2; + const BATCH_SIZE: usize = 32; + const REPLAY_BUFFER_SIZE: usize = 1024; + const CART_MASS: f64 = 1.0; + const POLE_MASS: f64 = 1.0; + const POLE_LENGTH: f64 = 1.0; + const EARTH_G: f64 = 9.8; + const ENV_STEP: f64 = 0.1; + const DISCOUNT: f32 = 0.9; + + #[derive(Clone, Copy, Debug)] + enum Action { + Left, + Right, + } + + #[derive(Default)] + struct Environment { + // Position, velocity and acceleration of the cart. + cart_pos: f64, + cart_vel: f64, + cart_acc: f64, + + // Angle, angular velocity and acceleration of the pole (angle = 0 means upright). + pole_angle: f64, + pole_angle_vel: f64, + pole_angle_acc: f64, + } + + struct ReplayEntry { + state: [f32; STATE_SIZE], + action: Action, + reward: f32, + // None if the resulting state is a final one. + next_state: Option<[f32; STATE_SIZE]>, + } + + impl Environment { + fn new() -> Environment { + Environment { + // Start with a slight offset so that the pole starts falling. + pole_angle: 0.001, + ..Default::default() + } + } + + // Execute an actor action, transitioning into a new env state. + // Action can be None for the purpose of testing the Environment logic itself + // (agent always makes an action). + fn step(&mut self, action: Option) { + // Shorthands and local vars. + let m_p = POLE_MASS; + let m_c = CART_MASS; + let l = POLE_LENGTH; + let m = m_p + m_c; + let f = match action { + None => 0.0, + Some(Action::Left) => -1.0, + Some(Action::Right) => 1.0, + }; + let th = self.pole_angle; + let th_dot = self.pole_angle_vel; + let th_ddot = self.pole_angle_acc; + let sin_th = th.sin(); + let cos_th = th.cos(); + + self.cart_acc = (f + m_p * l * (th_dot.powi(2) * sin_th - th_ddot * cos_th)) / m; + self.cart_vel += self.cart_acc * ENV_STEP; + self.cart_pos += self.cart_vel * ENV_STEP; + + self.pole_angle_acc = (EARTH_G * sin_th + + cos_th * (-f - m_p * l * th_dot.powi(2) * sin_th) / m) + / (l * (4.0 / 3.0 - m_p * cos_th.powi(2) / m)); + self.pole_angle_vel += self.pole_angle_acc * ENV_STEP; + self.pole_angle += self.pole_angle_vel * ENV_STEP; + } + + // Returns true if the pole has reached some critical angle. + fn is_final(&self) -> bool { + self.pole_angle.abs() > std::f64::consts::PI * 0.25 + } + + fn observe(&self) -> [f32; STATE_SIZE] { + [ + self.cart_pos as f32, + self.cart_vel as f32, + self.pole_angle as f32, + self.pole_angle_vel as f32, + ] + } + } + + // Returns a completely random action. + fn random_action() -> Action { + if thread_rng().gen::() < 0.5 { + Action::Left + } else { + Action::Right + } + } + + fn epsion_greedy_action + 'static>( + backend: &B, + net: &Network, + state: &[f32; STATE_SIZE], + epsilon: f64, + ) -> Action { + if thread_rng().gen::() < epsilon { + random_action() + } else { + let action_values = get_action_values(backend, net, state); + if action_values[0] > action_values[1] { + Action::Left + } else { + Action::Right + } + } + } + + // Returns the predicted action values for a given state. + fn get_action_values + 'static>( + backend: &B, + net: &Network, + state: &[f32; STATE_SIZE], + ) -> [f32; ACTION_COUNT] { + let mut input = SharedTensor::new(&[1, STATE_SIZE]); + write_batch_sample(&mut input, state, 0); + let output = net.transform(backend, &input); + + let mut result = [0.0; ACTION_COUNT]; + let native_backend = get_native_backend(); + result.clone_from_slice(output.read(native_backend.device()).unwrap().as_slice()); + result + } + + fn create_batch + 'static>( + backend: &B, + buffer: &VecDeque, + target_net: &Network, + ) -> (SharedTensor, SharedTensor) { + let mut inputs = SharedTensor::new(&vec![BATCH_SIZE, STATE_SIZE]); + let mut labels = SharedTensor::new(&vec![BATCH_SIZE, ACTION_COUNT]); + + for i in 0..BATCH_SIZE { + let j = thread_rng().gen_range(0..buffer.len()); + let (buffer_action, other_action) = match buffer[j].action { + Action::Left => (0, 1), + Action::Right => (1, 0), + }; + + let mut action_values = [std::f32::NAN; ACTION_COUNT]; + + // For the (s, a, s', r) tuple in the buffer, we can compute more precise target as + // y = r + γ•max Q*(s', _), if s' is not terminal, or + // y = r, if s' is termninal. + action_values[buffer_action] = buffer[j].reward + + match buffer[j].next_state { + None => 0.0, + Some(s) => { + DISCOUNT + * get_action_values(backend, target_net, &s) + .iter() + .fold(std::f32::NEG_INFINITY, |a, &b| a.max(b)) + } + }; + + // For the other action a2, we just use the target_net: + // y = Q*(s, a2). + // action_values[other_action] = + // get_action_values(backend, target_net, &buffer[j].state)[other_action]; + + write_batch_sample(&mut inputs, &buffer[j].state, i); + write_batch_sample(&mut labels, &action_values, i); + } + + (inputs, labels) + } + + // Runs 3 scenarios with a greedy policy using the provided learned Q-function. + // Returns the average number of steps the agent was able to keep the cartpole from + // falling (capped at 100 steps). + fn eval + 'static>(backend: &B, net: &Network) -> f32 { + let mut sum = 0.0; + for _ in 0..3 { + let mut env = Environment::new(); + for i in 0..100 { + let action = epsion_greedy_action(backend, net, &env.observe(), 0.0); + env.step(Some(action)); + sum += 1.0; + if env.is_final() { + break; + } + } + } + sum / 3.0 + } + + // A test on the environment simulator. + // When no forces are present, pole angle should be oscillating around PI. + #[test] + fn environment_is_sane_without_force() { + let mut env = Environment::new(); + let mut avg_angle = 0.0; + for _ in 0..10000 { + env.step(None); + avg_angle += env.pole_angle; + } + avg_angle /= 10000.0; + assert!( + (avg_angle - std::f64::consts::PI).abs() < 0.01, + "Avg. angle: {}", + avg_angle + ); + } + + // A test on the environment simulator. + // When force is applied to the left, cart should be moving to the left. + #[test] + fn environment_is_sane_with_left_force() { + let mut env = Environment::new(); + for _ in 0..10 { + env.step(Some(Action::Left)); + } + assert!(env.cart_pos < 0.0, "Cart pos: {}", env.cart_pos); + assert!(env.cart_vel < 0.0, "Cart vel: {}", env.cart_vel); + assert!(env.cart_acc < 0.0, "Cart acc: {}", env.cart_acc); + } + + // A test on the environment simulator. + // When force is applied to the right, cart should be moving to the right. + #[test] + fn environment_is_sane_with_right_force() { + let mut env = Environment::new(); + for _ in 0..10 { + env.step(Some(Action::Right)); + } + assert!(env.cart_pos > 0.0, "Cart pos: {}", env.cart_pos); + assert!(env.cart_vel > 0.0, "Cart vel: {}", env.cart_vel); + assert!(env.cart_acc > 0.0, "Cart acc: {}", env.cart_acc); + } + + #[test] + fn learns_cartpole_control() { + env_logger::init(); + + let backend = get_native_backend(); + + // Create the network representing the Q-function Q(s, a). + let net_conf = LayerConfig::Sequential( + SequentialConfig::new() + .with_layer("linear1", LayerConfig::Linear(LinearConfig::new(50))) + .with_layer("relu1", LayerConfig::Relu) + .with_layer("linear2", LayerConfig::Linear(LinearConfig::new(50))) + .with_layer("relu2", LayerConfig::Relu) + .with_layer( + "linear3", + LayerConfig::Linear(LinearConfig::new(ACTION_COUNT)), + ), + ); + let mut net = Network::from_config(net_conf, &[vec![STATE_SIZE]]); + + // Create the trainer. + let trainer_conf = TrainerConfig { + batch_size: BATCH_SIZE, + objective: LayerConfig::MeanSquaredError, + optimizer: OptimizerConfig::SgdWithMomentum(Default::default()), + ..Default::default() + }; + let mut trainer = Trainer::from_config(&backend, &trainer_conf, &net, &vec![ACTION_COUNT]); + + let mut replay_buffer = VecDeque::new(); + let mut env = Environment::new(); + + // Network used to compute full returns from a certain state. + // This is a periodic snapshot from the main net (main network isn't used due to + // stability issues). + let mut target_net = net.clone(); + let mut epsilon = 1.0; + + for i in 0..1000000 { + // Do a step. + let state = env.observe(); + let action = epsion_greedy_action(&backend, &net, &state, epsilon); + env.step(Some(action)); + let (reward, next_state) = match env.is_final() { + false => (1.0, Some(env.observe())), + true => (0.0, None), + }; + + // Store the result in the replay buffer. + let replay_entry = ReplayEntry { + state: state, + action: action, + reward: reward, + next_state: next_state, + }; + replay_buffer.push_front(replay_entry); + replay_buffer.truncate(REPLAY_BUFFER_SIZE); + + // Restart the environment if reached a final state. + if env.is_final() { + env = Environment::new(); + } + + if replay_buffer.len() >= BATCH_SIZE { + let (inputs, labels) = create_batch(&backend, &replay_buffer, &target_net); + trainer.train_minibatch(&backend, &mut net, &inputs, &labels); + } + + // Evaluate performance and snapshot the bootstrapping net every 100 steps. + if i % 100 == 0 { + let score = eval(&backend, &net); + println!("Epoch: {}; score: {}; ε: {}", i / 100, score, epsilon); + target_net = net.clone(); + epsilon = (epsilon * 0.995).max(0.01); + + // Stop when we reach 95 score. + if i / 100 == 1000 { + // if score >= 95.0 { + return; + } + } + } + + assert!(false, "Failed to reach score 95"); + } +} diff --git a/rust-blas/src/matrix/ops.rs b/rust-blas/src/matrix/ops.rs index 4c7167290..7a7a6cb37 100644 --- a/rust-blas/src/matrix/ops.rs +++ b/rust-blas/src/matrix/ops.rs @@ -28,16 +28,22 @@ macro_rules! gemm_impl(($($t: ident), +) => ( impl Gemm for $t { fn gemm(alpha: &$t, at: Transpose, a: &dyn Matrix<$t>, bt: Transpose, b: &dyn Matrix<$t>, beta: &$t, c: &mut dyn Matrix<$t>) { unsafe { - let (m, k) = match at { + let (ar, ac) = match at { Transpose::NoTrans => (a.rows(), a.cols()), _ => (a.cols(), a.rows()), }; - - let n = match bt { - Transpose::NoTrans => b.cols(), - _ => b.rows(), + let (br, bc) = match bt { + Transpose::NoTrans => (b.rows(), b.cols()), + _ => (b.cols(), b.rows()), }; + let (m, k) = (ar, ac); + let n = bc; + + if br != k || c.rows() != m || c.cols() != n { + panic!("Wrong GEMM dimensions: [{},{}]x[{},{}] -> [{},{}]", ar, ac, br, bc, c.rows(), c.cols()); + } + prefix!($t, gemm)(a.order(), at, bt, m, n, k,