Diffstat (limited to 'tensorflow/contrib/optimizer_v2/adam.py')
-rw-r--r--  tensorflow/contrib/optimizer_v2/adam.py  |  138
1 file changed, 17 insertions(+), 121 deletions(-)
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index d538ad0fb0..363e020757 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -18,37 +18,35 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.util import deprecation
-class AdamOptimizer(optimizer_v2.OptimizerV2):
+class AdamOptimizer(adam.Adam):
"""Optimizer that implements the Adam algorithm.
See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
"""
+ @deprecation.deprecated_args(
+ "2018-10-01",
+ "`use_locking = True` is no longer supported and will be ignored.",
+ ("use_locking", [False]))
def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
use_locking=False, name="Adam"):
"""Construct a new Adam optimizer.
Initialization:
- $$m_0 := 0 (Initialize initial 1st moment vector)$$
- $$v_0 := 0 (Initialize initial 2nd moment vector)$$
- $$t := 0 (Initialize timestep)$$
-
+ $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+ $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+ $$t := 0 \text{(Initialize timestep)}$$
The update rule for `variable` with gradient `g` uses an optimization
described at the end of section 2 of the paper:
$$t := t + 1$$
- $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+ $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
@@ -88,111 +86,9 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
name: Optional name for the operations created when applying gradients.
Defaults to "Adam".
"""
- super(AdamOptimizer, self).__init__(use_locking, name)
-
- self._set_hyper("learning_rate", learning_rate)
- self._set_hyper("beta1", beta1)
- self._set_hyper("beta2", beta2)
- self._set_hyper("epsilon", epsilon)
-
- def _get_beta_accumulators(self, state=None):
- if state is None:
- state = self._get_per_graph_state()
- return (state.get_non_slot("beta1_power"),
- state.get_non_slot("beta2_power"))
-
- def _create_vars(self, var_list, state):
- # Non-slot variables end up on the same device(s).
- state.create_non_slot(initial_value=state.get_hyper("beta1"),
- name="beta1_power")
- state.create_non_slot(initial_value=state.get_hyper("beta2"),
- name="beta2_power")
-
- # Create slots for the first and second moments.
- for v in var_list:
- state.zeros_slot(v, "m")
- state.zeros_slot(v, "v")
-
- def _apply_dense(self, grad, var, state):
- m = state.get_slot(var, "m")
- v = state.get_slot(var, "v")
- beta1_power, beta2_power = self._get_beta_accumulators(state)
- return training_ops.apply_adam(
- var, m, v,
- math_ops.cast(beta1_power, var.dtype.base_dtype),
- math_ops.cast(beta2_power, var.dtype.base_dtype),
- state.get_hyper("learning_rate", var.dtype.base_dtype),
- state.get_hyper("beta1", var.dtype.base_dtype),
- state.get_hyper("beta2", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
- grad, use_locking=self._use_locking).op
-
- def _resource_apply_dense(self, grad, var, state):
- m = state.get_slot(var, "m")
- v = state.get_slot(var, "v")
- beta1_power, beta2_power = self._get_beta_accumulators(state)
- return training_ops.resource_apply_adam(
- var.handle, m.handle, v.handle,
- math_ops.cast(beta1_power, grad.dtype.base_dtype),
- math_ops.cast(beta2_power, grad.dtype.base_dtype),
- state.get_hyper("learning_rate", grad.dtype.base_dtype),
- state.get_hyper("beta1", grad.dtype.base_dtype),
- state.get_hyper("beta2", grad.dtype.base_dtype),
- state.get_hyper("epsilon", grad.dtype.base_dtype),
- grad, use_locking=self._use_locking)
-
- def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
- beta1_power, beta2_power = self._get_beta_accumulators(state)
- beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
- beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
- lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
- beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
- beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
- epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
- lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
- # m_t = beta1 * m + (1 - beta1) * g_t
- m = state.get_slot(var, "m")
- m_scaled_g_values = grad * (1 - beta1_t)
- m_t = state_ops.assign(m, m * beta1_t,
- use_locking=self._use_locking)
- with ops.control_dependencies([m_t]):
- m_t = scatter_add(m, indices, m_scaled_g_values)
- # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
- v = state.get_slot(var, "v")
- v_scaled_g_values = (grad * grad) * (1 - beta2_t)
- v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
- with ops.control_dependencies([v_t]):
- v_t = scatter_add(v, indices, v_scaled_g_values)
- v_sqrt = math_ops.sqrt(v_t)
- var_update = state_ops.assign_sub(var,
- lr * m_t / (v_sqrt + epsilon_t),
- use_locking=self._use_locking)
- return control_flow_ops.group(*[var_update, m_t, v_t])
-
- def _apply_sparse(self, grad, var, state):
- return self._apply_sparse_shared(
- grad.values, var, grad.indices,
- lambda x, i, v: state_ops.scatter_add( # pylint: disable=g-long-lambda
- x, i, v, use_locking=self._use_locking),
- state)
-
- def _resource_scatter_add(self, x, i, v):
- with ops.control_dependencies(
- [resource_variable_ops.resource_scatter_add(
- x.handle, i, v)]):
- return x.value()
-
- def _resource_apply_sparse(self, grad, var, indices, state):
- return self._apply_sparse_shared(
- grad, var, indices, self._resource_scatter_add, state)
-
- def _finish(self, state):
- # Update the power accumulators.
- beta1_power, beta2_power = self._get_beta_accumulators(state)
- update_beta1 = beta1_power.assign(
- beta1_power * state.get_hyper("beta1"),
- use_locking=self._use_locking)
- update_beta2 = beta2_power.assign(
- beta2_power * state.get_hyper("beta2"),
- use_locking=self._use_locking)
- return control_flow_ops.group(update_beta1, update_beta2)
+ super(AdamOptimizer, self).__init__(
+ learning_rate=learning_rate,
+ beta_1=beta1,
+ beta_2=beta2,
+ epsilon=epsilon,
+ name=name)
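
For readers tracing the math, the update rule documented in the docstring above can be reproduced with a short, self-contained NumPy sketch. This is illustrative only and not part of the commit; adam_step is a hypothetical helper, and the epsilon placement (outside the square root) matches the formula in the docstring:

import numpy as np

def adam_step(var, grad, m, v, t,
              learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
    # t := t + 1
    t += 1
    # lr_t := learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
    lr_t = learning_rate * np.sqrt(1.0 - beta2**t) / (1.0 - beta1**t)
    # m_t := beta1 * m_{t-1} + (1 - beta1) * g
    m = beta1 * m + (1.0 - beta1) * grad
    # v_t := beta2 * v_{t-1} + (1 - beta2) * g * g
    v = beta2 * v + (1.0 - beta2) * grad * grad
    # variable := variable - lr_t * m_t / (sqrt(v_t) + epsilon)
    var = var - lr_t * m / (np.sqrt(v) + epsilon)
    return var, m, v, t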
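
After this change, AdamOptimizer is a thin shim over the Keras implementation: the constructor's only job is to rename beta1/beta2 to the Keras beta_1/beta_2 keywords, and use_locking=True is ignored apart from the deprecation warning declared above. A minimal usage sketch, assuming a TF 1.x build where the contrib module is importable:

from tensorflow.contrib.optimizer_v2.adam import AdamOptimizer

# Old contrib-style keyword names still work; the shim forwards them
# to the Keras Adam as beta_1/beta_2.
opt = AdamOptimizer(learning_rate=0.001, beta1=0.9, beta2=0.999)

# use_locking=True is now a no-op; it only emits a deprecation warning.
opt_noop = AdamOptimizer(use_locking=True)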