Skip to content

Commit

Permalink
add adadelta for torch (#534)
Browse files Browse the repository at this point in the history
Co-authored-by: Haifeng Jin <haifeng-jin@users.noreply.github.com>
  • Loading branch information
haifeng-jin and haifeng-jin authored Jul 19, 2023
1 parent 4fcd567 commit 9986ffa
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 8 deletions.
56 changes: 56 additions & 0 deletions keras_core/backend/torch/optimizers/torch_adadelta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import torch

from keras_core import ops
from keras_core import optimizers
from keras_core.backend.torch.optimizers import torch_parallel_optimizer


class Adadelta(
    torch_parallel_optimizer.TorchParallelOptimizer, optimizers.Adadelta
):
    """Adadelta optimizer for the torch backend.

    The per-variable update is expressed with ``torch._foreach_*`` calls so
    that the whole list of variables is processed by each call.
    """

    def _parallel_update_step(
        self,
        grads,
        variables,
        learning_rate,
    ):
        """Apply one Adadelta step to all `variables`.

        Args:
            grads: List of gradients, used element-wise against `variables`
                (assumed to be in the same order -- confirm with caller).
            variables: List of variable objects; each exposes its tensor
                through `.value` and is used to look up its optimizer slots.
            learning_rate: Learning rate; cast to the dtype of the first
                variable before being applied.
        """
        # Keep the variable wrappers for slot lookup; work on raw tensors.
        wrappers = variables
        params = [wrapper.value for wrapper in wrappers]

        step_size = ops.cast(learning_rate, params[0].dtype)
        decay = self.rho

        # Running average of squared gradients, one tensor per variable.
        grad_sq_avg = []
        for wrapper in wrappers:
            slot = self._accumulated_grads[self._get_variable_index(wrapper)]
            grad_sq_avg.append(slot.value)

        # Running average of squared updates, one tensor per variable.
        delta_sq_avg = []
        for wrapper in wrappers:
            slot = self._accumulated_delta_vars[
                self._get_variable_index(wrapper)
            ]
            delta_sq_avg.append(slot.value)

        # grad_sq_avg <- rho * grad_sq_avg + (1 - rho) * grad^2  (in place)
        squared_grads = torch._foreach_mul(grads, grads)
        torch._foreach_mul_(grad_sq_avg, decay)
        torch._foreach_add_(grad_sq_avg, squared_grads, alpha=1 - decay)

        # delta <- -sqrt(delta_sq_avg + eps) * grad / sqrt(grad_sq_avg + eps)
        # The numerator uses delta_sq_avg *before* it is refreshed below.
        numerator = torch._foreach_mul(
            torch._foreach_sqrt(
                torch._foreach_add(delta_sq_avg, self.epsilon)
            ),
            grads,
        )
        denominator = torch._foreach_sqrt(
            torch._foreach_add(grad_sq_avg, self.epsilon)
        )
        deltas = torch._foreach_mul(
            torch._foreach_div(numerator, denominator), -1
        )

        # delta_sq_avg <- rho * delta_sq_avg + (1 - rho) * delta^2  (in place)
        squared_deltas = torch._foreach_mul(deltas, deltas)
        torch._foreach_mul_(delta_sq_avg, decay)
        torch._foreach_add_(delta_sq_avg, squared_deltas, alpha=1 - decay)

        # variable <- variable + lr * delta  (in place)
        torch._foreach_add_(params, deltas, alpha=step_size)
2 changes: 2 additions & 0 deletions keras_core/backend/torch/optimizers/torch_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
class TorchOptimizer(BaseOptimizer):
def __new__(cls, *args, **kwargs):
# Import locally to avoid circular imports.
from keras_core.backend.torch.optimizers import torch_adadelta
from keras_core.backend.torch.optimizers import torch_adam
from keras_core.backend.torch.optimizers import torch_adamw
from keras_core.backend.torch.optimizers import torch_rmsprop
from keras_core.backend.torch.optimizers import torch_sgd

OPTIMIZERS = {
optimizers.Adadelta: torch_adadelta.Adadelta,
optimizers.Adam: torch_adam.Adam,
optimizers.AdamW: torch_adamw.AdamW,
optimizers.RMSprop: torch_rmsprop.RMSprop,
Expand Down
5 changes: 2 additions & 3 deletions keras_core/backend/torch/optimizers/torch_rmsprop.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,8 @@ def _parallel_update_step(
self._momentums[self._get_variable_index(variable)].value
for variable in keras_variables
]
momentum_list = torch._foreach_add(
increments, momentum_list, alpha=self.momentum
)
torch._foreach_mul_(momentum_list, self.momentum)
torch._foreach_add_(momentum_list, increments)
torch._foreach_add_(variables, momentum_list, alpha=-1)
else:
torch._foreach_add_(variables, increments, alpha=-1)
2 changes: 1 addition & 1 deletion keras_core/backend/torch/optimizers/torch_sgd.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def _parallel_update_step(
variables = [v.value for v in variables]
if self.momentum != 0:
bufs = [
self.momentums[self._get_variable_index(variable.value)].value
self.momentums[self._get_variable_index(variable)].value
for variable in keras_variables
]

Expand Down
9 changes: 5 additions & 4 deletions keras_core/optimizers/adadelta_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np

from keras_core import backend
from keras_core import ops
from keras_core import testing
from keras_core.optimizers.adadelta import Adadelta

Expand All @@ -16,7 +17,7 @@ def test_config(self):

def test_single_step(self):
optimizer = Adadelta(learning_rate=0.5)
grads = np.array([1.0, 6.0, 7.0, 2.0])
grads = ops.array([1.0, 6.0, 7.0, 2.0])
vars = backend.Variable([1.0, 2.0, 3.0, 4.0])
optimizer.apply_gradients(zip([grads], [vars]))
self.assertAllClose(
Expand All @@ -25,7 +26,7 @@ def test_single_step(self):

def test_weight_decay(self):
grads, var1, var2, var3 = (
np.zeros(()),
ops.zeros(()),
backend.Variable(2.0),
backend.Variable(2.0, name="exclude"),
backend.Variable(2.0),
Expand All @@ -49,8 +50,8 @@ def test_correctness_with_golden(self):
optimizer = Adadelta(learning_rate=1.0, rho=0.8, epsilon=1e-6)

x = backend.Variable(np.ones([10]))
grads = np.arange(0.1, 1.1, 0.1)
first_grads = np.full((10,), 0.01)
grads = ops.arange(0.1, 1.1, 0.1)
first_grads = ops.full((10,), 0.01)

golden = np.tile(
[[0.9978], [0.9947], [0.9915], [0.9882], [0.9849]], (1, 10)
Expand Down

0 comments on commit 9986ffa

Please sign in to comment.