New layer architecture #159

Open · wants to merge 5 commits into master (showing changes from 1 commit)
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
# will have compiled files and executables
target

Cargo.lock

# These are backup files generated by rustfmt
**/*.rs.bk
141 changes: 141 additions & 0 deletions juice/src/train/optimizer/adam.rs
@@ -0,0 +1,141 @@
//! Adam optimizer.
//! Computes the update Vᵢ from the parameter gradient ∇ᵢ as:
//! Mᵢ = β₁Mᵢ₋₁ + (1-β₁)∇ᵢ,
//! Sᵢ = β₂Sᵢ₋₁ + (1-β₂)∇ᵢ⊙∇ᵢ,
//! M₀ = 0,
//! S₀ = 0,
//! M̂ᵢ = Mᵢ/(1-β₁ᵗ),
//! Ŝᵢ = Sᵢ/(1-β₂ᵗ),
//! Vᵢ = M̂ᵢ⊘(√Ŝᵢ+ε),
//! where:
//! ⊙ - pointwise multiplication,
//! ⊘ - pointwise division,
//! β₁, β₂ - averaging parameters (typically set to 0.9 and 0.999 respectively),
//! ε - small constant to prevent division by zero (typically 1e-8).
//!
//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
//! learning rates.)
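//!
//! As a worked example (illustrative numbers only, not from any actual run): with β₁ = 0.9,
//! β₂ = 0.999, ε = 1e-8 and a first-step gradient ∇₁ = 0.1 in some coordinate,
//!   M₁ = 0.9·0 + 0.1·0.1 = 0.01,       M̂₁ = 0.01/(1-0.9) = 0.1,
//!   S₁ = 0.999·0 + 0.001·0.01 = 1e-5,  Ŝ₁ = 1e-5/(1-0.999) = 0.01,
//!   V₁ = 0.1/(√0.01 + 1e-8) ≈ 1.0,
//! i.e. on the very first step the bias-corrected update is approximately the sign of the
//! gradient, independent of its magnitude.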

use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;

use crate::coblas::plugin::Copy;
use crate::train::Optimizer;
use crate::util::native_backend;
use crate::weight::FillerType;
use co::prelude::*;

#[derive(Clone, Debug)]
pub struct AdamConfig {
    pub beta1: f32,
    pub beta2: f32,
    pub epsilon: f32,
}

pub struct Adam {
    // First gradient moment (Mᵢ).
    first_moments: HashMap<usize, SharedTensor<f32>>,
    // Second gradient moment (Sᵢ).
    second_moments: HashMap<usize, SharedTensor<f32>>,

    // Original β₁ as well as raised to t-th power (β₁ᵗ).
    beta1: f32,
    beta1_nth: f32,
    // Original β₂ as well as raised to t-th power (β₂ᵗ).
    beta2: f32,
    beta2_nth: f32,

    epsilon: f32,
}

impl Default for AdamConfig {
    fn default() -> Self {
        AdamConfig {
            beta1: 0.9,
            beta2: 0.999,
            epsilon: 1.0e-8,
        }
    }
}

impl Adam {
    pub fn new(config: &AdamConfig) -> Self {
        Adam {
            first_moments: HashMap::new(),
            second_moments: HashMap::new(),
            beta1: config.beta1,
            beta1_nth: config.beta1,
            beta2: config.beta2,
            beta2_nth: config.beta2,
            epsilon: config.epsilon,
        }
    }
}

// TODO: Rewrite with backend ops (requires element-wise square and square root support).
impl<B: IBackend> Optimizer<B> for Adam {
    fn adjust_weight_change(
        &mut self,
        backend: &B,
        weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
    ) {
        let native = native_backend();

        for (key, change) in weight_changes {
            let mut change_ref = change.borrow_mut();

            let mut first_moment = self.first_moments.entry(*key).or_insert_with(|| {
                let mut tensor = SharedTensor::new(change_ref.desc());
                FillerType::fill_constant(&mut tensor, 0.0);
                tensor
            });
            let mut second_moment = self.second_moments.entry(*key).or_insert_with(|| {
                let mut tensor = SharedTensor::new(change_ref.desc());
                FillerType::fill_constant(&mut tensor, 0.0);
                tensor
            });

            let len = change_ref.desc().size();

            let change_slice = change_ref
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();
            let first_moment_slice = first_moment
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();
            let second_moment_slice = second_moment
                .read_write(native.device())
                .unwrap()
                .as_mut_slice::<f32>();

            // We can rewrite the matrix equations at the top of this file in an element-wise form:
            // Mᵢ[j] = β₁Mᵢ₋₁[j] + (1-β₁)∇ᵢ[j]
            // Sᵢ[j] = β₂Sᵢ₋₁[j] + (1-β₂)∇ᵢ[j]²
            // Vᵢ[j] = Mᵢ[j] / ((1-β₁ᵗ)•(√(Sᵢ[j]/(1-β₂ᵗ)) + ε))
            for j in 0..len {
                // ∇ᵢ[j].
                let w = change_slice[j];
                // Mᵢ[j], M̂ᵢ[j].
                let m = self.beta1 * first_moment_slice[j] + (1.0 - self.beta1) * w;
                let m_hat = m / (1.0 - self.beta1_nth);
                // Sᵢ[j], Ŝᵢ[j].
                let s = self.beta2 * second_moment_slice[j] + (1.0 - self.beta2) * w * w;
                let s_hat = s / (1.0 - self.beta2_nth);
                // Vᵢ[j].
                let v = m_hat / (s_hat.sqrt() + self.epsilon);

                assert!(!v.is_nan());

                change_slice[j] = v;
                first_moment_slice[j] = m;
                second_moment_slice[j] = s;
            }
        }

        self.beta1_nth *= self.beta1;
        self.beta2_nth *= self.beta2;
    }
}
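For reference, a minimal standalone sketch (not part of this diff) of the per-element Adam step performed inside the loop above; the function name `adam_step` and its argument layout are purely illustrative:

```rust
/// One Adam step for a single element. `grad` is ∇ᵢ[j], `m_prev`/`s_prev` are the previous
/// moments, `beta1_t`/`beta2_t` are β₁ᵗ/β₂ᵗ. Returns the update Vᵢ[j] together with the
/// new moments as (v, m, s).
fn adam_step(
    grad: f32,
    m_prev: f32,
    s_prev: f32,
    beta1: f32,
    beta2: f32,
    beta1_t: f32,
    beta2_t: f32,
    epsilon: f32,
) -> (f32, f32, f32) {
    let m = beta1 * m_prev + (1.0 - beta1) * grad;
    let s = beta2 * s_prev + (1.0 - beta2) * grad * grad;
    let m_hat = m / (1.0 - beta1_t);
    let s_hat = s / (1.0 - beta2_t);
    let v = m_hat / (s_hat.sqrt() + epsilon);
    (v, m, s)
}
```

Factoring the arithmetic out like this would also make the element-wise math straightforward to unit-test in isolation.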
13 changes: 9 additions & 4 deletions juice/src/train/optimizer/mod.rs
@@ -1,3 +1,4 @@
mod adam;
mod sgd_momentum;

use std::rc::Rc;
@@ -9,24 +10,28 @@ use crate::coblas::plugin::Copy;
use co::prelude::*;
use crate::util::Axpby;

use adam::Adam;
use sgd_momentum::SgdWithMomentum;

// Expose configs publicly.
pub use adam::AdamConfig;
pub use sgd_momentum::SgdWithMomentumConfig;

// A gradient descent optimization algorithm.
pub trait Optimizer<B: IBackend> {
    // Called on each minibatch training cycle. Takes all weight gradients computed during
    // backpropagation (indexed by an opaque key which is guaranteed to be stable for the
    // duration of the program).
    // Modifies the changes in-place; modified changes will then be directly applied to the weights:
    // W = W - change
    // Modifies the changes in-place; modified changes will then be applied to the weights:
    // W = W - α•change,
    // where α is the learning rate (combined from global and param-specific rates).
    fn adjust_weight_change(&mut self, backend: &B, weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>);
}

#[derive(Clone, Debug)]
pub enum OptimizerConfig {
    SgdWithMomentum(SgdWithMomentumConfig),
    Adam,
    Adam(AdamConfig),
}

impl Default for OptimizerConfig {
@@ -38,6 +43,6 @@ impl Default for OptimizerConfig {
pub fn optimizer_from_config<B: IBackend + Axpby<f32> + Copy<f32>>(config: &OptimizerConfig) -> Box<dyn Optimizer<B>> {
    match config {
        OptimizerConfig::SgdWithMomentum(sgd_config) => Box::new(SgdWithMomentum::new(sgd_config)),
        OptimizerConfig::Adam => unimplemented!(),
        OptimizerConfig::Adam(adam_config) => Box::new(Adam::new(adam_config)),
    }
}
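For context, a hypothetical sketch (not part of this diff) of how the calling Trainer code might use these pieces, written as if it sat in this module so the imports above are in scope; the helper name `adjust_gradients_once` and the way `backend` and `weight_changes` are obtained are assumptions:

```rust
/// Builds an optimizer from a config and lets it rewrite the raw gradients in place.
/// The Trainer would then apply W = W - α•change per parameter, with α combining the
/// global and param-specific learning rates.
fn adjust_gradients_once<B: IBackend + Axpby<f32> + Copy<f32>>(
    backend: &B,
    weight_changes: &HashMap<usize, Rc<RefCell<SharedTensor<f32>>>>,
) {
    let config = OptimizerConfig::Adam(AdamConfig::default());
    let mut optimizer = optimizer_from_config::<B>(&config);
    optimizer.adjust_weight_change(backend, weight_changes);
}
```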
16 changes: 10 additions & 6 deletions juice/src/train/optimizer/sgd_momentum.rs
@@ -1,3 +1,13 @@
//! SGD with momentum.
//! Computes the update Vᵢ from the parameter gradient ∇ᵢ as:
//! Vᵢ = (1-β)Vᵢ₋₁ + β∇ᵢ,
//! V₀ = 0,
//! where:
//! β is the momentum parameter (typically set to 0.1).
//!
//! (Note that the update Vᵢ is then additionally scaled by Trainer using global and param-specific
//! learning rates.)

use std::cell::RefCell;
use std::collections::HashMap;
use std::rc::Rc;
@@ -13,12 +23,6 @@ pub struct SgdWithMomentumConfig {
    pub momentum: f32,
}

// SGD with momentum.
// Computes the update Vᵢ from params gradient ∇ᵢ as:
// Vᵢ=(1-β)Vᵢ₋₁ + β∇ᵢ,
// V₀ = 0,
// where:
// β is the momentum parameter (typically set to 0.1).
pub struct SgdWithMomentum {
    history: HashMap<usize, SharedTensor<f32>>,
    // Precomputed tensor constants.
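As an aside (not part of this diff), the documented update in scalar, per-element form is simply the following; the function name `sgd_momentum_step` is illustrative:

```rust
/// One SGD-with-momentum step for a single element: `v_prev` is Vᵢ₋₁[j], `grad` is ∇ᵢ[j]
/// and `beta` is the momentum parameter β. The returned value is both the new history
/// entry Vᵢ[j] and the (still unscaled) weight change.
fn sgd_momentum_step(v_prev: f32, grad: f32, beta: f32) -> f32 {
    (1.0 - beta) * v_prev + beta * grad
}
```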
3 changes: 2 additions & 1 deletion juice/tests/q_learning.rs
@@ -373,7 +373,8 @@ mod cartpole {
            epsilon = (epsilon * 0.995).max(0.01);

            // Stop when we reach 95 score.
            if score >= 95.0 {
            if i / 100 == 1000 {
                // if score >= 95.0 {
                return;
            }
        }