New layer architecture #159

Open · wants to merge 5 commits into master (showing changes from 1 commit)
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
# will have compiled files and executables
target

Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk
141 changes: 141 additions & 0 deletions juice/src/train/optimizer/adam.rs
@@ -0,0 +1,141 @@
//! Adam optimizer.
//! Computes the update Vᵢ from the parameter gradient ∇ᵢ as:
//! Mᵢ = β₁Mᵢ₋₁ + (1-β₁)∇ᵢ,
//! Sᵢ = β₂Sᵢ₋₁ + (1-β₂)∇ᵢ⊙∇ᵢ,
//! M₀ = 0,
//! S₀ = 0,
//! M̂ᵢ = Mᵢ/(1-β₁ᵗ),
//! Ŝᵢ = Sᵢ/(1-β₂ᵗ),
//! Vᵢ = M̂ᵢ⊘(√Ŝᵢ+ε),
//! where:
//! ⊙ - pointwise multiplication,
//! ⊘ - pointwise division,
//! β₁, β₂ - averaging parameters (typically set to 0.9 and 0.999 respectively),
//! ε - small constant to prevent division by zero (typically 1e-8).
//!
//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
//! learning rates.)
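//!
//! As a worked example (illustrative numbers only, not from any actual run): with β₁ = 0.9,
//! β₂ = 0.999, ε = 1e-8 and a first-step gradient ∇₁ = 0.1 in some coordinate,
//!   M₁ = 0.9·0 + 0.1·0.1 = 0.01,       M̂₁ = 0.01/(1-0.9) = 0.1,
//!   S₁ = 0.999·0 + 0.001·0.01 = 1e-5,  Ŝ₁ = 1e-5/(1-0.999) = 0.01,
//!   V₁ = 0.1/(√0.01 + 1e-8) ≈ 1.0,
//! i.e. on the very first step the bias-corrected update is approximately the sign of the
//! gradient, independent of its magnitude.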

use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;

use crate::coblas::plugin::Copy;
use crate::train::Optimizer;
use crate::util::native_backend;
use crate::weight::FillerType;
use co::prelude::*;

#[derive(Clone, Debug)]
pub struct AdamConfig {
    pub beta1: f32,
    pub beta2: f32,
    pub epsilon: f32,
}

pub struct Adam {
    // First gradient moment (Mᵢ).
    first_moments: HashMap<usize, SharedTensor<f32>>,
    // Second gradient moment (Sᵢ).
    second_moments: HashMap<usize, SharedTensor<f32>>,

    // Original β₁ as well as raised to t-th power (β₁ᵗ).
    beta1: f32,
    beta1_nth: f32,
    // Original β₂ as well as raised to t-th power (β₂ᵗ).
    beta2: f32,
    beta2_nth: f32,

    epsilon: f32,
}

impl Default for AdamConfig {
    fn default() -> Self {
        AdamConfig {
            beta1: 0.9,
            beta2: 0.999,
            epsilon: 1.0e-8,
        }
    }
}

impl Adam {
    pub fn new(config: &AdamConfig) -> Self {
        Adam {
            first_moments: HashMap::new(),
            second_moments: HashMap::new(),
            beta1: config.beta1,
            beta1_nth: config.beta1,
            beta2: config.beta2,
            beta2_nth: config.beta2,
            epsilon: config.epsilon,
        }
    }
}

// TODO: Rewrite with backend ops (requires element-wise square and square root support).
impl<B: IBackend> Optimizer<B> for Adam {
    fn adjust_weight_change(
        &mut self,
        backend: &B,
        weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
    ) {
        let native = native_backend();

        for (key, change) in weight_changes {
            let mut change_ref = change.borrow_mut();

            let mut first_moment = self.first_moments.entry(*key).or_insert_with(|| {
                let mut tensor = SharedTensor::new(change_ref.desc());
                FillerType::fill_constant(&mut tensor, 0.0);
                tensor
            });
            let mut second_moment = self.second_moments.entry(*key).or_insert_with(|| {
                let mut tensor = SharedTensor::new(change_ref.desc());
                FillerType::fill_constant(&mut tensor, 0.0);
                tensor
            });

            let len = change_ref.desc().size();

            let change_slice = change_ref
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();
            let first_moment_slice = first_moment
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();
            let second_moment_slice = second_moment
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();

            // We can rewrite the matrix equations at the top of this file in an element-wise form:
            // Mᵢ[j] = β₁Mᵢ₋₁[j] + (1-β₁)∇ᵢ[j]
            // Sᵢ[j] = β₂Sᵢ₋₁[j] + (1-β₂)∇ᵢ[j]²
            // Vᵢ[j] = Mᵢ[j] / ((1-β₁ᵗ)•(√(Sᵢ[j]/(1-β₂ᵗ)) + ε))
            for j in 0..len {
                // ∇ᵢ[j].
                let w = change_slice[j];
                // Mᵢ[j], M̂ᵢ[j].
                let m = self.beta1 * first_moment_slice[j] + (1.0 - self.beta1) * w;
                let m_hat = m / (1.0 - self.beta1_nth);
                // Sᵢ[j], Ŝᵢ[j].
                let s = self.beta2 * second_moment_slice[j] + (1.0 - self.beta2) * w * w;
                let s_hat = s / (1.0 - self.beta2_nth);
                // Vᵢ[j].
                let v = m_hat / (s_hat.sqrt() + self.epsilon);

                assert!(!v.is_nan());

                change_slice[j] = v;
                first_moment_slice[j] = m;
                second_moment_slice[j] = s;
            }
        }

        self.beta1_nth *= self.beta1;
        self.beta2_nth *= self.beta2;
    }
}
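For reference, a minimal standalone sketch (not part of this diff) of the per-element Adam step performed inside the loop above; the function name `adam_step` and its argument layout are purely illustrative:

```rust
/// One Adam step for a single element. `grad` is ∇ᵢ[j], `m_prev`/`s_prev` are the previous
/// moments, `beta1_t`/`beta2_t` are β₁ᵗ/β₂ᵗ. Returns the update Vᵢ[j] together with the
/// new moments as (v, m, s).
fn adam_step(
    grad: f32,
    m_prev: f32,
    s_prev: f32,
    beta1: f32,
    beta2: f32,
    beta1_t: f32,
    beta2_t: f32,
    epsilon: f32,
) -> (f32, f32, f32) {
    let m = beta1 * m_prev + (1.0 - beta1) * grad;
    let s = beta2 * s_prev + (1.0 - beta2) * grad * grad;
    let m_hat = m / (1.0 - beta1_t);
    let s_hat = s / (1.0 - beta2_t);
    let v = m_hat / (s_hat.sqrt() + epsilon);
    (v, m, s)
}
```

Factoring the arithmetic out like this would also make the element-wise math straightforward to unit-test in isolation.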
13 changes: 9 additions & 4 deletions juice/src/train/optimizer/mod.rs
@@ -1,3 +1,4 @@
mod adam;
mod sgd_momentum;

use std::rc::Rc;
@@ -9,24 +10,28 @@ use crate::coblas::plugin::Copy;
use co::prelude::*;
use crate::util::Axpby;

use adam::Adam;
use sgd_momentum::SgdWithMomentum;

// Expose configs publicly.
pub use adam::AdamConfig;
pub use sgd_momentum::SgdWithMomentumConfig;

// A gradient descent optimization algorithm.
pub trait Optimizer<B: IBackend> {
    // Called on each minibatch training cycle. Takes all weight gradients computed during
    // backpropagation (indexed by an opaque key which is guaranteed to be stable for the
    // duration of the program).
    // Modifies the changes in-place; modified changes will then be directly applied to the weights:
    // W = W - change
    // Modifies the changes in-place; modified changes will then be applied to the weights:
    // W = W - α•change,
    // where α is the learning rate (combined from global and param-specific rates).
    fn adjust_weight_change(&mut self, backend: &B, weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>);
}

#[derive(Clone, Debug)]
pub enum OptimizerConfig {
    SgdWithMomentum(SgdWithMomentumConfig),
    Adam,
    Adam(AdamConfig),
}

impl Default for OptimizerConfig {
@@ -38,6 +43,6 @@ impl Default for OptimizerConfig {
pub fn optimizer_from_config<B: IBackend + Axpby<f32> + Copy<f32>>(config: &OptimizerConfig) -> Box<dyn Optimizer<B>> {
    match config {
        OptimizerConfig::SgdWithMomentum(sgd_config) => Box::new(SgdWithMomentum::new(sgd_config)),
        OptimizerConfig::Adam => unimplemented!(),
        OptimizerConfig::Adam(adam_config) => Box::new(Adam::new(adam_config)),
    }
}
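For context, a hypothetical sketch (not part of this diff) of how the calling Trainer code might use these pieces, written as if it sat in this module so the imports above are in scope; the helper name `adjust_gradients_once` and the way `backend` and `weight_changes` are obtained are assumptions:

```rust
/// Builds an optimizer from a config and lets it rewrite the raw gradients in place.
/// The Trainer would then apply W = W - α•change per parameter, with α combining the
/// global and param-specific learning rates.
fn adjust_gradients_once<B: IBackend + Axpby<f32> + Copy<f32>>(
    backend: &B,
    weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
) {
    let config = OptimizerConfig::Adam(AdamConfig::default());
    let mut optimizer = optimizer_from_config::<B>(&config);
    optimizer.adjust_weight_change(backend, weight_changes);
}
```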
16 changes: 10 additions & 6 deletions juice/src/train/optimizer/sgd_momentum.rs
@@ -1,3 +1,13 @@
//! SGD with momentum.
//! Computes the update Vᵢ from the parameter gradient ∇ᵢ as:
//! Vᵢ = (1-β)Vᵢ₋₁ + β∇ᵢ,
//! V₀ = 0,
//! where:
//! β is the momentum parameter (typically set to 0.1).
//!
//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
//! learning rates.)

use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;
@@ -13,12 +23,6 @@ pub struct SgdWithMomentumConfig {
    pub momentum: f32,
}

// SGD with momentum.
// Computes the update Vᵢ from params gradient ∇ᵢ as:
// Vᵢ=(1-β)Vᵢ₋₁ + β∇ᵢ,
// V₀ = 0,
// where:
// β is the momentum parameter (typically set to 0.1).
pub struct SgdWithMomentum {
    history: HashMap<usize, SharedTensor<f32>>,
    // Precomputed tensor constants.
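As an aside (not part of this diff), the documented update in scalar, per-element form is simply the following; the function name `sgd_momentum_step` is illustrative:

```rust
/// One SGD-with-momentum step for a single element: `v_prev` is Vᵢ₋₁[j], `grad` is ∇ᵢ[j]
/// and `beta` is the momentum parameter β. The returned value is both the new history
/// entry Vᵢ[j] and the (still unscaled) weight change.
fn sgd_momentum_step(v_prev: f32, grad: f32, beta: f32) -> f32 {
    (1.0 - beta) * v_prev + beta * grad
}
```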
3 changes: 2 additions & 1 deletion juice/tests/q_learning.rs
@@ -373,7 +373,8 @@ mod cartpole {
            epsilon = (epsilon * 0.995).max(0.01);

            // Stop when we reach 95 score.
            if score >= 95.0 {
            if i / 100 == 1000 {
                // if score >= 95.0 {
                return;
            }
        }