Diffstat (limited to 'tensorflow/contrib/optimizer_v2/adam.py')
-rw-r--r-- | tensorflow/contrib/optimizer_v2/adam.py | 138
1 file changed, 17 insertions(+), 121 deletions(-)
diff --git a/tensorflow/contrib/optimizer_v2/adam.py b/tensorflow/contrib/optimizer_v2/adam.py
index d538ad0fb0..363e020757 100644
--- a/tensorflow/contrib/optimizer_v2/adam.py
+++ b/tensorflow/contrib/optimizer_v2/adam.py
@@ -18,37 +18,35 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.optimizer_v2 import optimizer_v2
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.training import training_ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.util import deprecation
 
 
-class AdamOptimizer(optimizer_v2.OptimizerV2):
+class AdamOptimizer(adam.Adam):
   """Optimizer that implements the Adam algorithm.
 
   See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
   ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
   """
 
+  @deprecation.deprecated_args(
+      "2018-10-01",
+      "`use_locking = True` is no longer supported and will be ignored.",
+      ("use_locking", [False]))
   def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                use_locking=False, name="Adam"):
     """Construct a new Adam optimizer.
 
     Initialization:
 
-    $$m_0 := 0 (Initialize initial 1st moment vector)$$
-    $$v_0 := 0 (Initialize initial 2nd moment vector)$$
-    $$t := 0 (Initialize timestep)$$
-
+    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+    $$t := 0 \text{(Initialize timestep)}$$
     The update rule for `variable` with gradient `g` uses an optimization
     described at the end of section2 of the paper:
 
     $$t := t + 1$$
-    $$lr_t := \text{learning_rate} * \sqrt{(1 - beta_2^t) / (1 - beta_1^t)}$$
+    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
 
     $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
     $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
@@ -88,111 +86,9 @@ class AdamOptimizer(optimizer_v2.OptimizerV2):
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam".
     """
-    super(AdamOptimizer, self).__init__(use_locking, name)
-
-    self._set_hyper("learning_rate", learning_rate)
-    self._set_hyper("beta1", beta1)
-    self._set_hyper("beta2", beta2)
-    self._set_hyper("epsilon", epsilon)
-
-  def _get_beta_accumulators(self, state=None):
-    if state is None:
-      state = self._get_per_graph_state()
-    return (state.get_non_slot("beta1_power"),
-            state.get_non_slot("beta2_power"))
-
-  def _create_vars(self, var_list, state):
-    # Non-slot variables end up on the same device(s).
-    state.create_non_slot(initial_value=state.get_hyper("beta1"),
-                          name="beta1_power")
-    state.create_non_slot(initial_value=state.get_hyper("beta2"),
-                          name="beta2_power")
-
-    # Create slots for the first and second moments.
-    for v in var_list:
-      state.zeros_slot(v, "m")
-      state.zeros_slot(v, "v")
-
-  def _apply_dense(self, grad, var, state):
-    m = state.get_slot(var, "m")
-    v = state.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    return training_ops.apply_adam(
-        var, m, v,
-        math_ops.cast(beta1_power, var.dtype.base_dtype),
-        math_ops.cast(beta2_power, var.dtype.base_dtype),
-        state.get_hyper("learning_rate", var.dtype.base_dtype),
-        state.get_hyper("beta1", var.dtype.base_dtype),
-        state.get_hyper("beta2", var.dtype.base_dtype),
-        state.get_hyper("epsilon", var.dtype.base_dtype),
-        grad, use_locking=self._use_locking).op
-
-  def _resource_apply_dense(self, grad, var, state):
-    m = state.get_slot(var, "m")
-    v = state.get_slot(var, "v")
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    return training_ops.resource_apply_adam(
-        var.handle, m.handle, v.handle,
-        math_ops.cast(beta1_power, grad.dtype.base_dtype),
-        math_ops.cast(beta2_power, grad.dtype.base_dtype),
-        state.get_hyper("learning_rate", grad.dtype.base_dtype),
-        state.get_hyper("beta1", grad.dtype.base_dtype),
-        state.get_hyper("beta2", grad.dtype.base_dtype),
-        state.get_hyper("epsilon", grad.dtype.base_dtype),
-        grad, use_locking=self._use_locking)
-
-  def _apply_sparse_shared(self, grad, var, indices, scatter_add, state):
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
-    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
-    lr_t = state.get_hyper("learning_rate", var.dtype.base_dtype)
-    beta1_t = state.get_hyper("beta1", var.dtype.base_dtype)
-    beta2_t = state.get_hyper("beta2", var.dtype.base_dtype)
-    epsilon_t = state.get_hyper("epsilon", var.dtype.base_dtype)
-    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
-    # m_t = beta1 * m + (1 - beta1) * g_t
-    m = state.get_slot(var, "m")
-    m_scaled_g_values = grad * (1 - beta1_t)
-    m_t = state_ops.assign(m, m * beta1_t,
-                           use_locking=self._use_locking)
-    with ops.control_dependencies([m_t]):
-      m_t = scatter_add(m, indices, m_scaled_g_values)
-    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
-    v = state.get_slot(var, "v")
-    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
-    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
-    with ops.control_dependencies([v_t]):
-      v_t = scatter_add(v, indices, v_scaled_g_values)
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(var,
-                                      lr * m_t / (v_sqrt + epsilon_t),
-                                      use_locking=self._use_locking)
-    return control_flow_ops.group(*[var_update, m_t, v_t])
-
-  def _apply_sparse(self, grad, var, state):
-    return self._apply_sparse_shared(
-        grad.values, var, grad.indices,
-        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
-            x, i, v, use_locking=self._use_locking),
-        state)
-
-  def _resource_scatter_add(self, x, i, v):
-    with ops.control_dependencies(
-        [resource_variable_ops.resource_scatter_add(
-            x.handle, i, v)]):
-      return x.value()
-
-  def _resource_apply_sparse(self, grad, var, indices, state):
-    return self._apply_sparse_shared(
-        grad, var, indices, self._resource_scatter_add, state)
-
-  def _finish(self, state):
-    # Update the power accumulators.
-    beta1_power, beta2_power = self._get_beta_accumulators(state)
-    update_beta1 = beta1_power.assign(
-        beta1_power * state.get_hyper("beta1"),
-        use_locking=self._use_locking)
-    update_beta2 = beta2_power.assign(
-        beta2_power * state.get_hyper("beta2"),
-        use_locking=self._use_locking)
-    return control_flow_ops.group(update_beta1, update_beta2)
+    super(AdamOptimizer, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta1,
+        beta_2=beta2,
+        epsilon=epsilon,
+        name=name)
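For reference, the update rule spelled out in the docstring above can be written as a minimal NumPy sketch. This is illustrative only: it is not the TensorFlow kernel being removed or the Keras `Adam` class being delegated to, and the function and variable names here are invented for the example.

```python
import numpy as np

def adam_step(var, grad, m, v, t, learning_rate=0.001,
              beta1=0.9, beta2=0.999, epsilon=1e-8):
  """One Adam update following the docstring's equations (illustrative sketch).

  `var`, `grad`, `m`, `v` are NumPy arrays of the same shape; `t` is the
  previous timestep. Returns the updated (var, m, v, t).
  """
  t += 1
  # lr_t := learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
  lr_t = learning_rate * np.sqrt(1 - beta2**t) / (1 - beta1**t)
  # m_t := beta1 * m_{t-1} + (1 - beta1) * g
  m = beta1 * m + (1 - beta1) * grad
  # v_t := beta2 * v_{t-1} + (1 - beta2) * g * g
  v = beta2 * v + (1 - beta2) * grad * grad
  # variable := variable - lr_t * m_t / (sqrt(v_t) + epsilon)
  var = var - lr_t * m / (np.sqrt(v) + epsilon)
  return var, m, v, t
```

After this change, the contrib `AdamOptimizer` no longer carries this logic itself: it simply forwards `learning_rate`, `beta1`/`beta2` (as `beta_1`/`beta_2`), and `epsilon` to the Keras `Adam` implementation, and `use_locking` is deprecated and ignored.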