-rw-r--r--  tensorflow/python/BUILD                                     |   1
-rw-r--r--  tensorflow/python/training/learning_rate_decay.py           | 432
-rw-r--r--  tensorflow/python/training/learning_rate_decay_v2.py        | 898
-rw-r--r--  tensorflow/python/training/learning_rate_decay_v2_test.py   | 497
-rw-r--r--  tensorflow/tools/compatibility/tf_upgrade_v2.py             |  24
-rw-r--r--  tensorflow/tools/compatibility/tf_upgrade_v2_test.py        |  13
6 files changed, 1547 insertions, 318 deletions
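The change follows one pattern for every decay schedule: the implementation moves into a new `learning_rate_decay_v2` module whose functions validate their arguments up front and return a no-arg callable built with `functools.partial`, while the v1 functions in `learning_rate_decay.py` become thin wrappers that keep the old contract by calling that callable once when not executing eagerly. A minimal pure-Python sketch of the pattern (the `_v1`/`_v2` names and the `executing_eagerly` flag are illustrative stand-ins, not part of the patch):

```python
# Illustrative sketch of the v1 -> v2 delegation pattern used throughout this
# patch; plain Python numbers stand in for Tensors.
import functools
import math


def exponential_decay_v2(learning_rate, global_step, decay_steps, decay_rate,
                         staircase=False, name=None):
  """Validates eagerly, then returns a no-arg callable that recomputes the rate."""
  if global_step is None:
    raise ValueError("global_step is required for exponential_decay.")

  def decayed_lr(learning_rate, global_step, decay_steps, decay_rate,
                 staircase, name):
    p = global_step / decay_steps
    if staircase:
      p = math.floor(p)
    return learning_rate * decay_rate ** p

  return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
                           decay_rate, staircase, name)


def exponential_decay_v1(learning_rate, global_step, decay_steps, decay_rate,
                         staircase=False, name=None, executing_eagerly=False):
  """v1 wrapper: a value in graph mode, the callable itself in eager mode."""
  decayed_lr = exponential_decay_v2(learning_rate, global_step, decay_steps,
                                    decay_rate, staircase=staircase, name=name)
  if not executing_eagerly:
    decayed_lr = decayed_lr()  # evaluate once, as the old API did
  return decayed_lr


assert abs(exponential_decay_v1(0.05, 5, 10, 0.96) - 0.05 * 0.96 ** 0.5) < 1e-12
```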
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index e6169e9e80..ba9c6a2320 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4393,6 +4393,7 @@ cuda_py_tests(
"training/ftrl_test.py",
"training/gradient_descent_test.py",
"training/learning_rate_decay_test.py",
+ "training/learning_rate_decay_v2_test.py",
"training/momentum_test.py",
"training/optimizer_test.py",
"training/proximal_adagrad_test.py",
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index fd195a7965..29b5465321 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -17,19 +17,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
-import math
-
from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
+from tensorflow.python.training import learning_rate_decay_v2
from tensorflow.python.util.tf_export import tf_export
-@tf_export("train.exponential_decay")
+@tf_export(v1=["train.exponential_decay"])
def exponential_decay(learning_rate,
global_step,
decay_steps,
@@ -95,32 +88,19 @@ def exponential_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("global_step is required for exponential_decay.")
- with ops.name_scope(
- name, "ExponentialDecay",
- [learning_rate, global_step, decay_steps, decay_rate]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
- decay_rate = math_ops.cast(decay_rate, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- p = global_step_recomp / decay_steps
- if staircase:
- p = math_ops.floor(p)
- return math_ops.multiply(
- learning_rate, math_ops.pow(decay_rate, p), name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.piecewise_constant")
+ decayed_lr = learning_rate_decay_v2.exponential_decay(learning_rate,
+ global_step,
+ decay_steps,
+ decay_rate,
+ staircase=staircase,
+ name=name)
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.piecewise_constant"])
def piecewise_constant(x, boundaries, values, name=None):
"""Piecewise constant from boundaries and interval values.
@@ -163,58 +143,15 @@ def piecewise_constant(x, boundaries, values, name=None):
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if len(boundaries) != len(values) - 1:
- raise ValueError(
- "The length of boundaries should be 1 less than the length of values")
- with ops.name_scope(name, "PiecewiseConstant",
- [x, boundaries, values, name]) as name:
- boundaries = ops.convert_n_to_tensor(boundaries)
- values = ops.convert_n_to_tensor(values)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- x_recomp = ops.convert_to_tensor(x)
- # Avoid explicit conversion to x's dtype. This could result in faulty
- # comparisons, for example if floats are converted to integers.
- for i, b in enumerate(boundaries):
- if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
- # We can promote int32 boundaries to int64 without loss of precision.
- # This covers the most common case where the user passes in boundaries
- # as an array of Python integers.
- if (b.dtype.base_dtype == dtypes.int32 and
- x_recomp.dtype.base_dtype == dtypes.int64):
- b = math_ops.cast(b, x_recomp.dtype.base_dtype)
- boundaries[i] = b
- else:
- raise ValueError(
- "Boundaries (%s) must have the same dtype as x (%s)." %
- (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
- # TODO(rdipietro): Ensure that boundaries' elements strictly increases.
- for v in values[1:]:
- if v.dtype.base_dtype != values[0].dtype.base_dtype:
- raise ValueError(
- "Values must have elements all with the same dtype (%s vs %s)." %
- (values[0].dtype.base_dtype, v.dtype.base_dtype))
- pred_fn_pairs = []
- pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
- pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
- for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
- # Need to bind v here; can do this with lambda v=v: ...
- pred = (x_recomp > low) & (x_recomp <= high)
- pred_fn_pairs.append((pred, lambda v=v: v))
-
- # The default isn't needed here because our conditions are mutually
- # exclusive and exhaustive, but tf.case requires it.
- default = lambda: values[0]
- return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.polynomial_decay")
+ decayed_lr = learning_rate_decay_v2.piecewise_constant(x, boundaries, values,
+ name=name)
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.polynomial_decay"])
def polynomial_decay(learning_rate,
global_step,
decay_steps,
@@ -299,46 +236,22 @@ def polynomial_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("global_step is required for polynomial_decay.")
- with ops.name_scope(
- name, "PolynomialDecay",
- [learning_rate, global_step, decay_steps, end_learning_rate, power
- ]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- end_learning_rate = math_ops.cast(end_learning_rate, dtype)
- power = math_ops.cast(power, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- decay_steps_recomp = math_ops.cast(decay_steps, dtype)
- if cycle:
- # Find the first multiple of decay_steps that is bigger than
- # global_step. If global_step is zero set the multiplier to 1
- multiplier = control_flow_ops.cond(
- math_ops.equal(global_step_recomp, 0), lambda: 1.0,
- lambda: math_ops.ceil(global_step_recomp / decay_steps))
- decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
- else:
- # Make sure that the global_step used is not bigger than decay_steps.
- global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
-
- p = math_ops.div(global_step_recomp, decay_steps_recomp)
- return math_ops.add(
- math_ops.multiply(learning_rate - end_learning_rate,
- math_ops.pow(1 - p, power)),
- end_learning_rate,
- name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.natural_exp_decay")
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ learning_rate,
+ global_step,
+ decay_steps,
+ end_learning_rate=end_learning_rate,
+ power=power,
+ cycle=cycle,
+ name=name)
+
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.natural_exp_decay"])
def natural_exp_decay(learning_rate,
global_step,
decay_steps,
@@ -410,32 +323,17 @@ def natural_exp_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("global_step is required for natural_exp_decay.")
- with ops.name_scope(name, "NaturalExpDecay",
- [learning_rate, global_step, decay_rate]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
- decay_rate = math_ops.cast(decay_rate, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- p = global_step_recomp / decay_steps
- if staircase:
- p = math_ops.floor(p)
- exponent = math_ops.exp(
- math_ops.multiply(math_ops.negative(decay_rate), p))
- return math_ops.multiply(learning_rate, exponent, name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.inverse_time_decay")
+ decayed_lr = learning_rate_decay_v2.natural_exp_decay(
+ learning_rate, global_step, decay_steps, decay_rate, staircase=staircase,
+ name=name)
+
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.inverse_time_decay"])
def inverse_time_decay(learning_rate,
global_step,
decay_steps,
@@ -507,32 +405,21 @@ def inverse_time_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("global_step is required for inverse_time_decay.")
- with ops.name_scope(name, "InverseTimeDecay",
- [learning_rate, global_step, decay_rate]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
- decay_rate = math_ops.cast(decay_rate, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- p = global_step_recomp / decay_steps
- if staircase:
- p = math_ops.floor(p)
- const = math_ops.cast(constant_op.constant(1), dtype)
- denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
- return math_ops.div(learning_rate, denom, name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.cosine_decay")
+ decayed_lr = learning_rate_decay_v2.inverse_time_decay(
+ learning_rate,
+ global_step,
+ decay_steps,
+ decay_rate,
+ staircase=staircase,
+ name=name)
+
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.cosine_decay"])
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
"""Applies cosine decay to the learning rate.
@@ -581,32 +468,16 @@ def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("cosine decay requires global_step")
- with ops.name_scope(name, "CosineDecay",
- [learning_rate, global_step]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
- completed_fraction = global_step_recomp / decay_steps
- cosine_decayed = 0.5 * (1.0 + math_ops.cos(
- constant_op.constant(math.pi) * completed_fraction))
-
- decayed = (1 - alpha) * cosine_decayed + alpha
- return math_ops.multiply(learning_rate, decayed)
+ decayed_lr = learning_rate_decay_v2.cosine_decay(
+ learning_rate, global_step, decay_steps, alpha=alpha, name=name)
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
- return decayed_lr
+ return decayed_lr
-@tf_export("train.cosine_decay_restarts")
+@tf_export(v1=["train.cosine_decay_restarts"])
def cosine_decay_restarts(learning_rate,
global_step,
first_decay_steps,
@@ -664,57 +535,22 @@ def cosine_decay_restarts(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("cosine decay restarts requires global_step")
- with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
- learning_rate = ops.convert_to_tensor(
- learning_rate, name="initial_learning_rate")
- dtype = learning_rate.dtype
- first_decay_steps = math_ops.cast(first_decay_steps, dtype)
- alpha = math_ops.cast(alpha, dtype)
- t_mul = math_ops.cast(t_mul, dtype)
- m_mul = math_ops.cast(m_mul, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- completed_fraction = global_step_recomp / first_decay_steps
-
- def compute_step(completed_fraction, geometric=False):
- """Helper for `cond` operation."""
- if geometric:
- i_restart = math_ops.floor(
- math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
- math_ops.log(t_mul))
-
- sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
- completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
-
- else:
- i_restart = math_ops.floor(completed_fraction)
- completed_fraction -= i_restart
+ decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+ learning_rate,
+ global_step,
+ first_decay_steps,
+ t_mul=t_mul,
+ m_mul=m_mul,
+ alpha=alpha,
+ name=name)
- return i_restart, completed_fraction
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
- i_restart, completed_fraction = control_flow_ops.cond(
- math_ops.equal(t_mul, 1.0),
- lambda: compute_step(completed_fraction, geometric=False),
- lambda: compute_step(completed_fraction, geometric=True))
+ return decayed_lr
- m_fac = m_mul**i_restart
- cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
- constant_op.constant(math.pi) * completed_fraction))
- decayed = (1 - alpha) * cosine_decayed + alpha
- return math_ops.multiply(learning_rate, decayed, name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.linear_cosine_decay")
+@tf_export(v1=["train.linear_cosine_decay"])
def linear_cosine_decay(learning_rate,
global_step,
decay_steps,
@@ -781,37 +617,22 @@ def linear_cosine_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("linear cosine decay requires global_step")
- with ops.name_scope(name, "LinearCosineDecay",
- [learning_rate, global_step]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
- num_periods = math_ops.cast(num_periods, dtype)
- alpha = math_ops.cast(alpha, dtype)
- beta = math_ops.cast(beta, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
- linear_decayed = (decay_steps - global_step_recomp) / decay_steps
- completed_fraction = global_step_recomp / decay_steps
- fraction = 2.0 * num_periods * completed_fraction
- cosine_decayed = 0.5 * (
- 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
-
- linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
- return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
-
-
-@tf_export("train.noisy_linear_cosine_decay")
+ decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
+ learning_rate,
+ global_step,
+ decay_steps,
+ num_periods=num_periods,
+ alpha=alpha,
+ beta=beta,
+ name=name)
+
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
+
+
+@tf_export(v1=["train.noisy_linear_cosine_decay"])
def noisy_linear_cosine_decay(learning_rate,
global_step,
decay_steps,
@@ -886,42 +707,17 @@ def noisy_linear_cosine_decay(learning_rate,
the learning rate value across different invocations of optimizer functions.
@end_compatibility
"""
- if global_step is None:
- raise ValueError("noisy linear cosine decay requires global_step")
- with ops.name_scope(name, "NoisyLinearCosineDecay",
- [learning_rate, global_step]) as name:
- learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
- dtype = learning_rate.dtype
- decay_steps = math_ops.cast(decay_steps, dtype)
- initial_variance = math_ops.cast(initial_variance, dtype)
- variance_decay = math_ops.cast(variance_decay, dtype)
- num_periods = math_ops.cast(num_periods, dtype)
- alpha = math_ops.cast(alpha, dtype)
- beta = math_ops.cast(beta, dtype)
-
- def decayed_lr():
- """Helper to recompute learning rate; most helpful in eager-mode."""
- global_step_recomp = math_ops.cast(global_step, dtype)
- global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
- linear_decayed = (decay_steps - global_step_recomp) / decay_steps
- variance = initial_variance / (
- math_ops.pow(1.0 + global_step_recomp, variance_decay))
- std = math_ops.sqrt(variance)
- noisy_linear_decayed = (
- linear_decayed + random_ops.random_normal(
- linear_decayed.shape, stddev=std))
-
- completed_fraction = global_step_recomp / decay_steps
- fraction = 2.0 * num_periods * completed_fraction
- cosine_decayed = 0.5 * (
- 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
- noisy_linear_cosine_decayed = (
- (alpha + noisy_linear_decayed) * cosine_decayed + beta)
-
- return math_ops.multiply(
- learning_rate, noisy_linear_cosine_decayed, name=name)
-
- if not context.executing_eagerly():
- decayed_lr = decayed_lr()
-
- return decayed_lr
+ decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
+ learning_rate, global_step,
+ decay_steps,
+ initial_variance=initial_variance,
+ variance_decay=variance_decay,
+ num_periods=num_periods,
+ alpha=alpha,
+ beta=beta,
+ name=name)
+
+ if not context.executing_eagerly():
+ decayed_lr = decayed_lr()
+
+ return decayed_lr
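For existing callers the v1 wrappers above keep their graph-mode contract (a scalar Tensor comes back), while under eager execution they now return the no-arg callable produced by the v2 implementation. A hedged sketch of what a call site sees in each mode, assuming the TF 1.x public names already used in the docstrings:

```python
# Hedged usage sketch (not part of the patch): the v1 wrapper from a caller's
# point of view after this change.
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
lr = tf.train.exponential_decay(0.1, global_step, 100000, 0.96, staircase=True)

if tf.executing_eagerly():
  # Eager: `lr` is the no-arg callable built by learning_rate_decay_v2; call it
  # (or hand it to an optimizer, which accepts callables) for the current value.
  current_lr = lr()
else:
  # Graph: `lr` is a scalar Tensor, exactly as before this change.
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    current_lr = sess.run(lr)
```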
diff --git a/tensorflow/python/training/learning_rate_decay_v2.py b/tensorflow/python/training/learning_rate_decay_v2.py
new file mode 100644
index 0000000000..9c5e144be6
--- /dev/null
+++ b/tensorflow/python/training/learning_rate_decay_v2.py
@@ -0,0 +1,898 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Various learning rate decay functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import math
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("train.exponential_decay", v1=[])
+def exponential_decay(learning_rate,
+ global_step,
+ decay_steps,
+ decay_rate,
+ staircase=False,
+ name=None):
+ """Applies exponential decay to the learning rate.
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies an exponential decay function
+ to a provided initial learning rate. It requires a `global_step` value to
+ compute the decayed learning rate. You can just pass a TensorFlow variable
+ that you increment at each training step.
+
+ The function returns a no-arg function that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions.
+ It is computed as:
+
+ ```python
+ decayed_learning_rate = learning_rate *
+ decay_rate ^ (global_step / decay_steps)
+ ```
+
+ If the argument `staircase` is `True`, then `global_step / decay_steps` is an
+ integer division and the decayed learning rate follows a staircase function.
+
+ Example: decay every 100000 steps with a base of 0.96:
+
+ ```python
+ ...
+ global_step = tf.Variable(0, trainable=False)
+ starter_learning_rate = 0.1
+ learning_rate_fn = tf.train.exponential_decay(starter_learning_rate,
+ global_step, 100000, 0.96,
+ staircase=True)
+ # Passing global_step to minimize() will increment it at each step.
+ learning_step = (
+ tf.train.GradientDescentOptimizer(learning_rate_fn)
+ .minimize(...my loss..., global_step=global_step)
+ )
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation. Must not be negative.
+ decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Must be positive. See the decay computation above.
+ decay_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The decay rate.
+ staircase: Boolean. If `True` decay the learning rate at discrete intervals.
+ name: String. Optional name of the operation. Defaults to
+ 'ExponentialDecay'.
+
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("global_step is required for exponential_decay.")
+ def decayed_lr(learning_rate, global_step, decay_steps, decay_rate,
+ staircase, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(
+ name, "ExponentialDecay",
+ [learning_rate, global_step, decay_steps, decay_rate]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+ decay_rate = math_ops.cast(decay_rate, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ p = global_step_recomp / decay_steps
+ if staircase:
+ p = math_ops.floor(p)
+ return math_ops.multiply(
+ learning_rate, math_ops.pow(decay_rate, p), name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ decay_rate, staircase, name)
+
+
+@tf_export("train.piecewise_constant", v1=[])
+def piecewise_constant(x, boundaries, values, name=None):
+ """Piecewise constant from boundaries and interval values.
+
+ This function returns a no-arg callable to compute the piecewise constant.
+ This can be useful for changing the learning rate value across
+ different invocations of optimizer functions.
+
+ Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
+ for the next 10000 steps, and 0.1 for any additional steps.
+
+ ```python
+ global_step = tf.Variable(0, trainable=False)
+ boundaries = [100000, 110000]
+ values = [1.0, 0.5, 0.1]
+ learning_rate_fn = tf.train.piecewise_constant(global_step, boundaries,
+ values)
+ learning_rate = learning_rate_fn()
+
+ # Later, whenever we perform an optimization step, we increment global_step.
+ ```
+
+ Args:
+ x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
+ `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
+ boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
+ increasing entries, and with all elements having the same type as `x`.
+ values: A list of `Tensor`s or `float`s or `int`s that specifies the values
+ for the intervals defined by `boundaries`. It should have one more element
+ than `boundaries`, and all elements should have the same type.
+ name: A string. Optional name of the operation. Defaults to
+ 'PiecewiseConstant'.
+
+ Returns:
+ A no-arg function that outputs a 0-D Tensor. The output of the no-arg
+ function is `values[0]` when `x <= boundaries[0]`,
+ `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
+ and `values[-1]` when `x > boundaries[-1]`.
+
+ Raises:
+ ValueError: if types of `x` and `boundaries` do not match, or types of all
+ `values` do not match or
+ the number of elements in the lists does not match.
+ """
+ if len(boundaries) != len(values) - 1:
+ raise ValueError(
+ "The length of boundaries should be 1 less than the length of values")
+ def decayed_lr(x, boundaries, values, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "PiecewiseConstant",
+ [x, boundaries, values, name]) as name:
+ boundaries = ops.convert_n_to_tensor(boundaries)
+ values = ops.convert_n_to_tensor(values)
+ x_recomp = ops.convert_to_tensor(x)
+ # Avoid explicit conversion to x's dtype. This could result in faulty
+ # comparisons, for example if floats are converted to integers.
+ for i, b in enumerate(boundaries):
+ if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
+ # We can promote int32 boundaries to int64 without loss of precision.
+ # This covers the most common case where the user passes in boundaries
+ # as an array of Python integers.
+ if (b.dtype.base_dtype == dtypes.int32 and
+ x_recomp.dtype.base_dtype == dtypes.int64):
+ b = math_ops.cast(b, x_recomp.dtype.base_dtype)
+ boundaries[i] = b
+ else:
+ raise ValueError(
+ "Boundaries (%s) must have the same dtype as x (%s)." %
+ (b.dtype.base_dtype, x_recomp.dtype.base_dtype))
+ # TODO(rdipietro): Ensure that boundaries' elements strictly increase.
+ for v in values[1:]:
+ if v.dtype.base_dtype != values[0].dtype.base_dtype:
+ raise ValueError(
+ "Values must have elements all with the same dtype (%s vs %s)." %
+ (values[0].dtype.base_dtype, v.dtype.base_dtype))
+ pred_fn_pairs = []
+ pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
+ pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
+ for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
+ # Need to bind v here; can do this with lambda v=v: ...
+ pred = (x_recomp > low) & (x_recomp <= high)
+ pred_fn_pairs.append((pred, lambda v=v: v))
+
+ # The default isn't needed here because our conditions are mutually
+ # exclusive and exhaustive, but tf.case requires it.
+ default = lambda: values[0]
+ return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)
+
+ return functools.partial(decayed_lr, x, boundaries, values, name)
+
+
+@tf_export("train.polynomial_decay", v1=[])
+def polynomial_decay(learning_rate,
+ global_step,
+ decay_steps,
+ end_learning_rate=0.0001,
+ power=1.0,
+ cycle=False,
+ name=None):
+ """Applies a polynomial decay to the learning rate.
+
+ It is commonly observed that a monotonically decreasing learning rate, whose
+ degree of change is carefully chosen, results in a better performing model.
+ This function applies a polynomial decay function to a provided initial
+ `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
+
+ It requires a `global_step` value to compute the decayed learning rate. You
+ can just pass a TensorFlow variable that you increment at each training step.
+
+ The function returns a no-arg callable that outputs the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ global_step = min(global_step, decay_steps)
+ decayed_learning_rate = (learning_rate - end_learning_rate) *
+ (1 - global_step / decay_steps) ^ (power) +
+ end_learning_rate
+
+ ```
+
+ If `cycle` is True then a multiple of `decay_steps` is used, the first one
+ that is bigger than `global_step`.
+
+ ```python
+ decay_steps = decay_steps * ceil(global_step / decay_steps)
+ decayed_learning_rate_fn = (learning_rate - end_learning_rate) *
+ (1 - global_step / decay_steps) ^ (power) +
+ end_learning_rate
+ decayed_learning_rate = decayed_learning_rate_fn()
+
+ ```
+
+ Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
+
+ ```python
+ ...
+ global_step = tf.Variable(0, trainable=False)
+ starter_learning_rate = 0.1
+ end_learning_rate = 0.01
+ decay_steps = 10000
+ learning_rate_fn = tf.train.polynomial_decay(starter_learning_rate,
+ global_step, decay_steps,
+ end_learning_rate,
+ power=0.5)
+ # Passing global_step to minimize() will increment it at each step.
+ learning_step = (
+ tf.train.GradientDescentOptimizer(learning_rate_fn)
+ .minimize(...my loss..., global_step=global_step)
+ )
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation. Must not be negative.
+ decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Must be positive. See the decay computation above.
+ end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The minimal end learning rate.
+ power: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The power of the polynomial. Defaults to linear, 1.0.
+ cycle: A boolean, whether or not it should cycle beyond decay_steps.
+ name: String. Optional name of the operation. Defaults to
+ 'PolynomialDecay'.
+
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("global_step is required for polynomial_decay.")
+ def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
+ power, cycle, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(
+ name, "PolynomialDecay",
+ [learning_rate, global_step, decay_steps, end_learning_rate, power]
+ ) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ end_learning_rate = math_ops.cast(end_learning_rate, dtype)
+ power = math_ops.cast(power, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ decay_steps_recomp = math_ops.cast(decay_steps, dtype)
+ if cycle:
+ # Find the first multiple of decay_steps that is bigger than
+ # global_step. If global_step is zero set the multiplier to 1
+ multiplier = control_flow_ops.cond(
+ math_ops.equal(global_step_recomp, 0), lambda: 1.0,
+ lambda: math_ops.ceil(global_step_recomp / decay_steps))
+ decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
+ else:
+ # Make sure that the global_step used is not bigger than decay_steps.
+ global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+
+ p = math_ops.div(global_step_recomp, decay_steps_recomp)
+ return math_ops.add(
+ math_ops.multiply(learning_rate - end_learning_rate,
+ math_ops.pow(1 - p, power)),
+ end_learning_rate,
+ name=name)
+
+ return functools.partial(
+ decayed_lr, learning_rate, global_step, decay_steps, end_learning_rate,
+ power, cycle, name)
+
+
+@tf_export("train.natural_exp_decay", v1=[])
+def natural_exp_decay(learning_rate,
+ global_step,
+ decay_steps,
+ decay_rate,
+ staircase=False,
+ name=None):
+ """Applies natural exponential decay to the initial learning rate.
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies an exponential decay function
+ to a provided initial learning rate. It requires a `global_step` value to
+ compute the decayed learning rate. You can just pass a TensorFlow variable
+ that you increment at each training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
+ decay_step)
+ ```
+
+ or, if `staircase` is `True`, as:
+
+ ```python
+ decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
+ decay_step))
+ ```
+
+ Example: decay exponentially with a base of 0.96:
+
+ ```python
+ ...
+ global_step = tf.Variable(0, trainable=False)
+ learning_rate = 0.1
+ decay_steps = 5
+ k = 0.5
+ learning_rate_fn = tf.train.natural_exp_decay(learning_rate, global_step,
+ decay_steps, k)
+
+ # Passing global_step to minimize() will increment it at each step.
+ learning_step = (
+ tf.train.GradientDescentOptimizer(learning_rate_fn)
+ .minimize(...my loss..., global_step=global_step)
+ )
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The initial learning rate.
+ global_step: A Python number.
+ Global step to use for the decay computation. Must not be negative.
+ decay_steps: How often to apply decay.
+ decay_rate: A Python number. The decay rate.
+ staircase: Whether to apply decay in a discrete staircase, as opposed to
+ continuous, fashion.
+ name: String. Optional name of the operation. Defaults to
+ 'NaturalExpDecay'.
+
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("global_step is required for natural_exp_decay.")
+ def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
+ name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "NaturalExpDecay",
+ [learning_rate, global_step, decay_rate]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+ decay_rate = math_ops.cast(decay_rate, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ p = global_step_recomp / decay_steps
+ if staircase:
+ p = math_ops.floor(p)
+ exponent = math_ops.exp(
+ math_ops.multiply(math_ops.negative(decay_rate), p))
+ return math_ops.multiply(learning_rate, exponent, name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ decay_rate, staircase, name)
+
+
+@tf_export("train.inverse_time_decay", v1=[])
+def inverse_time_decay(learning_rate,
+ global_step,
+ decay_steps,
+ decay_rate,
+ staircase=False,
+ name=None):
+ """Applies inverse time decay to the initial learning rate.
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies an inverse decay function
+ to a provided initial learning rate. It requires a `global_step` value to
+ compute the decayed learning rate. You can just pass a TensorFlow variable
+ that you increment at each training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
+ decay_step)
+ ```
+
+ or, if `staircase` is `True`, as:
+
+ ```python
+ decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
+ decay_step))
+ ```
+
+ Example: decay 1/t with a rate of 0.5:
+
+ ```python
+ ...
+ global_step = tf.Variable(0, trainable=False)
+ learning_rate = 0.1
+ decay_steps = 1.0
+ decay_rate = 0.5
+ learning_rate_fn = tf.train.inverse_time_decay(learning_rate, global_step,
+ decay_steps, decay_rate)
+
+ # Passing global_step to minimize() will increment it at each step.
+ learning_step = (
+ tf.train.GradientDescentOptimizer(learning_rate_fn)
+ .minimize(...my loss..., global_step=global_step)
+ )
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` `Tensor` or a
+ Python number. The initial learning rate.
+ global_step: A Python number.
+ Global step to use for the decay computation. Must not be negative.
+ decay_steps: How often to apply decay.
+ decay_rate: A Python number. The decay rate.
+ staircase: Whether to apply decay in a discrete staircase, as opposed to
+ continuous, fashion.
+ name: String. Optional name of the operation. Defaults to
+ 'InverseTimeDecay'.
+
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("global_step is required for inverse_time_decay.")
+ def decayed_lr(learning_rate, global_step, decay_steps, decay_rate, staircase,
+ name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "InverseTimeDecay",
+ [learning_rate, global_step, decay_rate]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+ decay_rate = math_ops.cast(decay_rate, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ p = global_step_recomp / decay_steps
+ if staircase:
+ p = math_ops.floor(p)
+ const = math_ops.cast(constant_op.constant(1), dtype)
+ denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
+ return math_ops.div(learning_rate, denom, name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ decay_rate, staircase, name)
+
+
+@tf_export("train.cosine_decay", v1=[])
+def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0,
+ name=None):
+ """Applies cosine decay to the learning rate.
+
+ See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+ with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies a cosine decay function
+ to a provided initial learning rate. It requires a `global_step` value to
+ compute the decayed learning rate. You can just pass a TensorFlow variable
+ that you increment at each training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ global_step = min(global_step, decay_steps)
+ cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
+ decayed = (1 - alpha) * cosine_decay + alpha
+ decayed_learning_rate = learning_rate * decayed
+ ```
+
+ Example usage:
+ ```python
+ decay_steps = 1000
+ lr_decayed_fn = tf.train.cosine_decay(learning_rate, global_step, decay_steps)
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
+ The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation.
+ decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Number of steps to decay over.
+ alpha: A scalar `float32` or `float64` Tensor or a Python number.
+ Minimum learning rate value as a fraction of learning_rate.
+ name: String. Optional name of the operation. Defaults to 'CosineDecay'.
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("cosine decay requires global_step")
+ def decayed_lr(learning_rate, global_step, decay_steps, alpha, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "CosineDecay",
+ [learning_rate, global_step]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+ completed_fraction = global_step_recomp / decay_steps
+ cosine_decayed = 0.5 * (1.0 + math_ops.cos(
+ constant_op.constant(math.pi) * completed_fraction))
+
+ decayed = (1 - alpha) * cosine_decayed + alpha
+ return math_ops.multiply(learning_rate, decayed)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ alpha, name)
+
+
+@tf_export("train.cosine_decay_restarts", v1=[])
+def cosine_decay_restarts(learning_rate,
+ global_step,
+ first_decay_steps,
+ t_mul=2.0,
+ m_mul=1.0,
+ alpha=0.0,
+ name=None):
+ """Applies cosine decay with restarts to the learning rate.
+
+ See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
+ with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies a cosine decay function with
+ restarts to a provided initial learning rate. It requires a `global_step`
+ value to compute the decayed learning rate. You can just pass a TensorFlow
+ variable that you increment at each training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate while taking into account possible warm restarts. This can be useful for
+ changing the learning rate value across different invocations of optimizer
+ functions.
+
+ The learning rate multiplier first decays
+ from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
+ restart is performed. Each new warm restart runs for `t_mul` times more steps
+ and with `m_mul` times smaller initial learning rate.
+
+ Example usage:
+ ```python
+ first_decay_steps = 1000
+ lr_decayed_fn = tf.train.cosine_decay_restarts(learning_rate, global_step,
+ first_decay_steps)
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
+ The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation.
+ first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Number of steps to decay over.
+ t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+ Used to derive the number of iterations in the i-th period.
+ m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
+ Used to derive the initial learning rate of the i-th period.
+ alpha: A scalar `float32` or `float64` Tensor or a Python number.
+ Minimum learning rate value as a fraction of the learning_rate.
+ name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("cosine decay restarts requires global_step")
+ def decayed_lr(learning_rate, global_step, first_decay_steps, t_mul, m_mul,
+ alpha, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]
+ ) as name:
+ learning_rate = ops.convert_to_tensor(
+ learning_rate, name="initial_learning_rate")
+ dtype = learning_rate.dtype
+ first_decay_steps = math_ops.cast(first_decay_steps, dtype)
+ alpha = math_ops.cast(alpha, dtype)
+ t_mul = math_ops.cast(t_mul, dtype)
+ m_mul = math_ops.cast(m_mul, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ completed_fraction = global_step_recomp / first_decay_steps
+
+ def compute_step(completed_fraction, geometric=False):
+ """Helper for `cond` operation."""
+ if geometric:
+ i_restart = math_ops.floor(
+ math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
+ math_ops.log(t_mul))
+
+ sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
+ completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
+
+ else:
+ i_restart = math_ops.floor(completed_fraction)
+ completed_fraction -= i_restart
+
+ return i_restart, completed_fraction
+
+ i_restart, completed_fraction = control_flow_ops.cond(
+ math_ops.equal(t_mul, 1.0),
+ lambda: compute_step(completed_fraction, geometric=False),
+ lambda: compute_step(completed_fraction, geometric=True))
+
+ m_fac = m_mul**i_restart
+ cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
+ constant_op.constant(math.pi) * completed_fraction))
+ decayed = (1 - alpha) * cosine_decayed + alpha
+
+ return math_ops.multiply(learning_rate, decayed, name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step,
+ first_decay_steps, t_mul, m_mul, alpha, name)
+
+
+@tf_export("train.linear_cosine_decay", v1=[])
+def linear_cosine_decay(learning_rate,
+ global_step,
+ decay_steps,
+ num_periods=0.5,
+ alpha=0.0,
+ beta=0.001,
+ name=None):
+ """Applies linear cosine decay to the learning rate.
+
+ See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+ https://arxiv.org/abs/1709.07417
+
+ For the idea of warm starts here controlled by `num_periods`,
+ see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+ with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+ Note that linear cosine decay is more aggressive than cosine decay and
+ larger initial learning rates can typically be used.
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies a linear cosine decay function
+ to a provided initial learning rate. It requires a `global_step` value to
+ compute the decayed learning rate. You can just pass a TensorFlow variable
+ that you increment at each training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ global_step = min(global_step, decay_steps)
+ linear_decay = (decay_steps - global_step) / decay_steps
+ cosine_decay = 0.5 * (
+ 1 + cos(pi * 2 * num_periods * global_step / decay_steps))
+ decayed = (alpha + linear_decay) * cosine_decay + beta
+ decayed_learning_rate = learning_rate * decayed
+ ```
+
+ Example usage:
+ ```python
+ decay_steps = 1000
+ lr_decayed_fn = tf.train.linear_cosine_decay(learning_rate, global_step,
+ decay_steps)
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
+ The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation.
+ decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Number of steps to decay over.
+ num_periods: Number of periods in the cosine part of the decay.
+ See computation above.
+ alpha: See computation above.
+ beta: See computation above.
+ name: String. Optional name of the operation. Defaults to
+ 'LinearCosineDecay'.
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("linear cosine decay requires global_step")
+ def decayed_lr(learning_rate, global_step, decay_steps, num_periods, alpha,
+ beta, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "LinearCosineDecay",
+ [learning_rate, global_step]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+ num_periods = math_ops.cast(num_periods, dtype)
+ alpha = math_ops.cast(alpha, dtype)
+ beta = math_ops.cast(beta, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+ linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+ completed_fraction = global_step_recomp / decay_steps
+ fraction = 2.0 * num_periods * completed_fraction
+ cosine_decayed = 0.5 * (
+ 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+
+ linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
+ return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ num_periods, alpha, beta, name)
+
+
+@tf_export("train.noisy_linear_cosine_decay", v1=[])
+def noisy_linear_cosine_decay(learning_rate,
+ global_step,
+ decay_steps,
+ initial_variance=1.0,
+ variance_decay=0.55,
+ num_periods=0.5,
+ alpha=0.0,
+ beta=0.001,
+ name=None):
+ """Applies noisy linear cosine decay to the learning rate.
+
+ See [Bello et al., ICML2017] Neural Optimizer Search with RL.
+ https://arxiv.org/abs/1709.07417
+
+ For the idea of warm starts here controlled by `num_periods`,
+ see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
+ with Warm Restarts. https://arxiv.org/abs/1608.03983
+
+ Note that linear cosine decay is more aggressive than cosine decay and
+ larger initial learning rates can typically be used.
+
+ When training a model, it is often recommended to lower the learning rate as
+ the training progresses. This function applies a noisy linear
+ cosine decay function to a provided initial learning rate.
+ It requires a `global_step` value to compute the decayed learning rate.
+ You can just pass a TensorFlow variable that you increment at each
+ training step.
+
+ The function returns a no-arg callable that produces the decayed learning
+ rate. This can be useful for changing the learning rate value across
+ different invocations of optimizer functions. It is computed as:
+
+ ```python
+ global_step = min(global_step, decay_steps)
+ linear_decay = (decay_steps - global_step) / decay_steps
+ cosine_decay = 0.5 * (
+ 1 + cos(pi * 2 * num_periods * global_step / decay_steps))
+ decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
+ decayed_learning_rate = learning_rate * decayed
+ ```
+ where eps_t is 0-centered gaussian noise with variance
+ initial_variance / (1 + global_step) ** variance_decay
+
+ Example usage:
+ ```python
+ decay_steps = 1000
+ lr_decayed_fn = tf.train.noisy_linear_cosine_decay(learning_rate, global_step,
+ decay_steps)
+ ```
+
+ Args:
+ learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
+ The initial learning rate.
+ global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Global step to use for the decay computation.
+ decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
+ Number of steps to decay over.
+ initial_variance: initial variance for the noise. See computation above.
+ variance_decay: decay for the noise's variance. See computation above.
+ num_periods: Number of periods in the cosine part of the decay.
+ See computation above.
+ alpha: See computation above.
+ beta: See computation above.
+ name: String. Optional name of the operation. Defaults to
+ 'NoisyLinearCosineDecay'.
+ Returns:
+ A no-arg function that outputs the decayed learning rate, a scalar `Tensor`
+ of the same type as `learning_rate`.
+ Raises:
+ ValueError: if `global_step` is not supplied.
+ """
+ if global_step is None:
+ raise ValueError("noisy linear cosine decay requires global_step")
+ def decayed_lr(learning_rate, global_step, decay_steps, initial_variance,
+ variance_decay, num_periods, alpha, beta, name):
+ """Helper to recompute learning rate; most helpful in eager-mode."""
+ with ops.name_scope(name, "NoisyLinearCosineDecay",
+ [learning_rate, global_step]) as name:
+ learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
+ dtype = learning_rate.dtype
+ decay_steps = math_ops.cast(decay_steps, dtype)
+ initial_variance = math_ops.cast(initial_variance, dtype)
+ variance_decay = math_ops.cast(variance_decay, dtype)
+ num_periods = math_ops.cast(num_periods, dtype)
+ alpha = math_ops.cast(alpha, dtype)
+ beta = math_ops.cast(beta, dtype)
+
+ global_step_recomp = math_ops.cast(global_step, dtype)
+ global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
+ linear_decayed = (decay_steps - global_step_recomp) / decay_steps
+ variance = initial_variance / (
+ math_ops.pow(1.0 + global_step_recomp, variance_decay))
+ std = math_ops.sqrt(variance)
+ noisy_linear_decayed = (
+ linear_decayed + random_ops.random_normal(
+ linear_decayed.shape, stddev=std))
+
+ completed_fraction = global_step_recomp / decay_steps
+ fraction = 2.0 * num_periods * completed_fraction
+ cosine_decayed = 0.5 * (
+ 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
+ noisy_linear_cosine_decayed = (
+ (alpha + noisy_linear_decayed) * cosine_decayed + beta)
+
+ return math_ops.multiply(
+ learning_rate, noisy_linear_cosine_decayed, name=name)
+
+ return functools.partial(decayed_lr, learning_rate, global_step, decay_steps,
+ initial_variance, variance_decay, num_periods, alpha,
+ beta, name)
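The test file below checks these schedules against closed-form values computed with plain Python arithmetic. A small editorial cross-check of two of those closed forms (not part of the patch; `staircase_exponential` and `piecewise` are illustrative helpers mirroring the formulas above):

```python
# Editorial cross-check of the closed forms the tests below compare against,
# using plain Python numbers in place of Tensors.
import math


def staircase_exponential(lr, step, decay_steps, decay_rate):
  # learning_rate * decay_rate ** floor(global_step / decay_steps)
  return lr * decay_rate ** math.floor(step / decay_steps)


def piecewise(x, boundaries, values):
  # values[0] while x <= boundaries[0], values[i] on (boundaries[i-1],
  # boundaries[i]], and values[-1] once x > boundaries[-1].
  for b, v in zip(boundaries, values):
    if x <= b:
      return v
  return values[-1]


# Matches LRDecayTestV2.testStaircase: 0.1 * 0.96 ** (100 // 3).
assert abs(staircase_exponential(0.1, 100, 3, 0.96)
           - 0.1 * 0.96 ** (100 // 3)) < 1e-12
# Matches LRDecayTestV2.testPiecewiseConstant at x = 105.
assert piecewise(105, [100, 110, 120], [1.0, 0.1, 0.01, 0.001]) == 0.1
```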
diff --git a/tensorflow/python/training/learning_rate_decay_v2_test.py b/tensorflow/python/training/learning_rate_decay_v2_test.py
new file mode 100644
index 0000000000..0f2d60dafc
--- /dev/null
+++ b/tensorflow/python/training/learning_rate_decay_v2_test.py
@@ -0,0 +1,497 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Functional test for learning rate decay."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+# Import resource_variable_ops for the variables-to-tensor implicit conversion.
+from tensorflow.python.ops import resource_variable_ops # pylint: disable=unused-import
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import googletest
+from tensorflow.python.training import learning_rate_decay_v2
+
+
+class LRDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testContinuous(self):
+ self.evaluate(variables.global_variables_initializer())
+ step = 5
+ decayed_lr = learning_rate_decay_v2.exponential_decay(0.05, step, 10, 0.96)
+ expected = .05 * 0.96**(5.0 / 10.0)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testStaircase(self):
+ if context.executing_eagerly():
+ step = resource_variable_ops.ResourceVariable(0)
+ self.evaluate(variables.global_variables_initializer())
+ decayed_lr = learning_rate_decay_v2.exponential_decay(
+ .1, step, 3, 0.96, staircase=True)
+
+ # No change to learning rate due to staircase
+ expected = .1
+ self.evaluate(step.assign(1))
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ expected = .1
+ self.evaluate(step.assign(2))
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ # Decayed learning rate
+ expected = .1 * 0.96 ** (100 // 3)
+ self.evaluate(step.assign(100))
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ def testVariables(self):
+ with self.test_session():
+ step = variables.Variable(1)
+ assign_1 = step.assign(1)
+ assign_2 = step.assign(2)
+ assign_100 = step.assign(100)
+ decayed_lr = learning_rate_decay_v2.exponential_decay(.1, step, 3, 0.96,
+ staircase=True)
+ variables.global_variables_initializer().run()
+ # No change to learning rate
+ assign_1.op.run()
+ self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
+ assign_2.op.run()
+ self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
+ # Decayed learning rate
+ assign_100.op.run()
+ expected = .1 * 0.96 ** (100 // 3)
+ self.assertAllClose(decayed_lr().eval(), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testPiecewiseConstant(self):
+ x = resource_variable_ops.ResourceVariable(-999)
+ decayed_lr = learning_rate_decay_v2.piecewise_constant(
+ x, [100, 110, 120], [1.0, 0.1, 0.01, 0.001])
+
+ self.evaluate(variables.global_variables_initializer())
+
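+    # Each boundary belongs to the lower interval: x <= 100 -> 1.0,
+    # 100 < x <= 110 -> 0.1, 110 < x <= 120 -> 0.01, and x > 120 -> 0.001.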
+ self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
+ self.evaluate(x.assign(100))
+ self.assertAllClose(self.evaluate(decayed_lr()), 1.0, 1e-6)
+ self.evaluate(x.assign(105))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
+ self.evaluate(x.assign(110))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.1, 1e-6)
+ self.evaluate(x.assign(120))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.01, 1e-6)
+ self.evaluate(x.assign(999))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.001, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testPiecewiseConstantEdgeCases(self):
+ x_int = resource_variable_ops.ResourceVariable(
+ 0, dtype=variables.dtypes.int32)
+ boundaries, values = [-1.0, 1.0], [1, 2, 3]
+ with self.assertRaises(ValueError):
+ decayed_lr = learning_rate_decay_v2.piecewise_constant(
+ x_int, boundaries, values)
+ decayed_lr()
+
+ x = resource_variable_ops.ResourceVariable(0.0)
+ boundaries, values = [-1.0, 1.0], [1.0, 2, 3]
+ with self.assertRaises(ValueError):
+      decayed_lr = learning_rate_decay_v2.piecewise_constant(
+          x, boundaries, values)
+      decayed_lr()
+
+ # Test that ref types are valid.
+ if not context.executing_eagerly():
+ x = variables.Variable(0.0)
+ x_ref = x.op.outputs[0] # float32_ref tensor should be accepted
+ boundaries, values = [1.0, 2.0], [1, 2, 3]
+ learning_rate_decay_v2.piecewise_constant(x_ref, boundaries, values)
+
+ # Test casting boundaries from int32 to int64.
+ x_int64 = resource_variable_ops.ResourceVariable(
+ 0, dtype=variables.dtypes.int64)
+ boundaries, values = [1, 2, 3], [0.4, 0.5, 0.6, 0.7]
+ decayed_lr = learning_rate_decay_v2.piecewise_constant(
+ x_int64, boundaries, values)
+
+ self.evaluate(variables.global_variables_initializer())
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
+ self.evaluate(x_int64.assign(1))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.4, 1e-6)
+ self.evaluate(x_int64.assign(2))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.5, 1e-6)
+ self.evaluate(x_int64.assign(3))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.6, 1e-6)
+ self.evaluate(x_int64.assign(4))
+ self.assertAllClose(self.evaluate(decayed_lr()), 0.7, 1e-6)
+
+
+class LinearDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testHalfWay(self):
+ step = 5
+ lr = 0.05
+ end_lr = 0.0
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
+ expected = lr * 0.5
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testEnd(self):
+ step = 10
+ lr = 0.05
+ end_lr = 0.001
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
+ expected = end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testHalfWayWithEnd(self):
+ step = 5
+ lr = 0.05
+ end_lr = 0.001
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
+ expected = (lr + end_lr) * 0.5
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testBeyondEnd(self):
+ step = 15
+ lr = 0.05
+ end_lr = 0.001
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(lr, step, 10, end_lr)
+ expected = end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testBeyondEndWithCycle(self):
+ step = 15
+ lr = 0.05
+ end_lr = 0.001
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, cycle=True)
+ expected = (lr - end_lr) * 0.25 + end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class SqrtDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testHalfWay(self):
+ step = 5
+ lr = 0.05
+ end_lr = 0.0
+ power = 0.5
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, power=power)
+ expected = lr * 0.5**power
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testEnd(self):
+ step = 10
+ lr = 0.05
+ end_lr = 0.001
+ power = 0.5
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, power=power)
+ expected = end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testHalfWayWithEnd(self):
+ step = 5
+ lr = 0.05
+ end_lr = 0.001
+ power = 0.5
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, power=power)
+ expected = (lr - end_lr) * 0.5**power + end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testBeyondEnd(self):
+ step = 15
+ lr = 0.05
+ end_lr = 0.001
+ power = 0.5
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, power=power)
+ expected = end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testBeyondEndWithCycle(self):
+ step = 15
+ lr = 0.05
+ end_lr = 0.001
+ power = 0.5
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, 10, end_lr, power=power, cycle=True)
+ expected = (lr - end_lr) * 0.25**power + end_lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class PolynomialDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testBeginWithCycle(self):
+ lr = 0.001
+ decay_steps = 10
+ step = 0
+ decayed_lr = learning_rate_decay_v2.polynomial_decay(
+ lr, step, decay_steps, cycle=True)
+ expected = lr
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class ExponentialDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDecay(self):
+ initial_lr = 0.1
+ k = 10
+ decay_rate = 0.96
+ step = resource_variable_ops.ResourceVariable(0)
+ decayed_lr = learning_rate_decay_v2.natural_exp_decay(initial_lr, step, k,
+ decay_rate)
+
+ self.evaluate(variables.global_variables_initializer())
+ for i in range(k + 1):
+ expected = initial_lr * math.exp(-i / k * decay_rate)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+ self.evaluate(step.assign_add(1))
+
+ @test_util.run_in_graph_and_eager_modes
+ def testStaircase(self):
+ initial_lr = 0.1
+ k = 10
+ decay_rate = 0.96
+ step = resource_variable_ops.ResourceVariable(0)
+ decayed_lr = learning_rate_decay_v2.natural_exp_decay(
+ initial_lr, step, k, decay_rate, staircase=True)
+
+ self.evaluate(variables.global_variables_initializer())
+ for i in range(k + 1):
+ expected = initial_lr * math.exp(-decay_rate * (i // k))
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+ self.evaluate(step.assign_add(1))
+
+
+class InverseDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDecay(self):
+ initial_lr = 0.1
+ k = 10
+ decay_rate = 0.96
+ step = resource_variable_ops.ResourceVariable(0)
+ decayed_lr = learning_rate_decay_v2.inverse_time_decay(initial_lr, step, k,
+ decay_rate)
+
+ self.evaluate(variables.global_variables_initializer())
+ for i in range(k + 1):
+ expected = initial_lr / (1 + i / k * decay_rate)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+ self.evaluate(step.assign_add(1))
+
+ @test_util.run_in_graph_and_eager_modes
+ def testStaircase(self):
+ initial_lr = 0.1
+ k = 10
+ decay_rate = 0.96
+ step = resource_variable_ops.ResourceVariable(0)
+ decayed_lr = learning_rate_decay_v2.inverse_time_decay(
+ initial_lr, step, k, decay_rate, staircase=True)
+
+ self.evaluate(variables.global_variables_initializer())
+ for i in range(k + 1):
+ expected = initial_lr / (1 + decay_rate * (i // k))
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+ self.evaluate(step.assign_add(1))
+
+
+class CosineDecayTestV2(test_util.TensorFlowTestCase):
+
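+  # NumPy reference used to check cosine_decay:
+  # (1 - alpha) * 0.5 * (1 + cos(pi * min(step, decay_steps) / decay_steps))
+  # + alpha.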
+ def np_cosine_decay(self, step, decay_steps, alpha=0.0):
+ step = min(step, decay_steps)
+ completed_fraction = step / decay_steps
+ decay = 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+ return (1.0 - alpha) * decay + alpha
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDecay(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
+ num_training_steps)
+ expected = self.np_cosine_decay(step, num_training_steps)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testAlpha(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ alpha = 0.1
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay(initial_lr, step,
+ num_training_steps,
+ alpha)
+ expected = self.np_cosine_decay(step, num_training_steps, alpha)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class CosineDecayRestartsTestV2(test_util.TensorFlowTestCase):
+
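+  # NumPy reference for cosine_decay_restarts: every completed period scales
+  # the period length by t_mul and the magnitude by m_mul before restarting.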
+ def np_cosine_decay_restarts(self, step, decay_steps, t_mul=2.0, m_mul=1.0,
+ alpha=0.0):
+ fac = 1.0
+ while step >= decay_steps:
+ step -= decay_steps
+ decay_steps *= t_mul
+ fac *= m_mul
+
+ completed_fraction = step / decay_steps
+ decay = fac * 0.5 * (1.0 + math.cos(math.pi * completed_fraction))
+ return (1.0 - alpha) * decay + alpha
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDecay(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+ initial_lr, step, num_training_steps)
+ expected = self.np_cosine_decay_restarts(step, num_training_steps)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testAlpha(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ alpha = 0.1
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+ initial_lr, step, num_training_steps, alpha=alpha)
+ expected = self.np_cosine_decay_restarts(
+ step, num_training_steps, alpha=alpha)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testMMul(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ m_mul = 0.9
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+ initial_lr, step, num_training_steps, m_mul=m_mul)
+ expected = self.np_cosine_decay_restarts(
+ step, num_training_steps, m_mul=m_mul)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testTMul(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ t_mul = 1.0
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.cosine_decay_restarts(
+ initial_lr, step, num_training_steps, t_mul=t_mul)
+ expected = self.np_cosine_decay_restarts(
+ step, num_training_steps, t_mul=t_mul)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class LinearCosineDecayTestV2(test_util.TensorFlowTestCase):
+
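+  # NumPy reference for linear_cosine_decay:
+  # (alpha + linear_decay(step)) * cosine_decay(step) + beta.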
+ def np_linear_cosine_decay(self,
+ step,
+ decay_steps,
+ alpha=0.0,
+ beta=0.001,
+ num_periods=0.5):
+ step = min(step, decay_steps)
+ linear_decayed = float(decay_steps - step) / decay_steps
+ fraction = 2.0 * num_periods * step / float(decay_steps)
+ cosine_decayed = 0.5 * (1.0 + math.cos(math.pi * fraction))
+ return (alpha + linear_decayed) * cosine_decayed + beta
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDefaultDecay(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
+ initial_lr, step, num_training_steps)
+ expected = self.np_linear_cosine_decay(step, num_training_steps)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+ @test_util.run_in_graph_and_eager_modes
+ def testNonDefaultDecay(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ decayed_lr = learning_rate_decay_v2.linear_cosine_decay(
+ initial_lr,
+ step,
+ num_training_steps,
+ alpha=0.1,
+ beta=1e-4,
+ num_periods=5)
+ expected = self.np_linear_cosine_decay(
+ step, num_training_steps, alpha=0.1, beta=1e-4, num_periods=5)
+ self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
+
+
+class NoisyLinearCosineDecayTestV2(test_util.TensorFlowTestCase):
+
+ @test_util.run_in_graph_and_eager_modes
+ def testDefaultNoisyLinearCosine(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ # No numerical check because of noise
+ decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
+ initial_lr, step, num_training_steps)
+ self.evaluate(decayed_lr())
+
+ @test_util.run_in_graph_and_eager_modes
+ def testNonDefaultNoisyLinearCosine(self):
+ num_training_steps = 1000
+ initial_lr = 1.0
+ for step in range(0, 1500, 250):
+ # No numerical check because of noise
+ decayed_lr = learning_rate_decay_v2.noisy_linear_cosine_decay(
+ initial_lr,
+ step,
+ num_training_steps,
+ initial_variance=0.5,
+ variance_decay=0.1,
+ alpha=0.1,
+ beta=1e-4,
+ num_periods=5)
+ self.evaluate(decayed_lr())
+
+
+if __name__ == "__main__":
+ googletest.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 9702430a12..38216ce9b1 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -19,6 +19,7 @@ from __future__ import division
from __future__ import print_function
import argparse
+import functools
from tensorflow.tools.compatibility import ast_edits
from tensorflow.tools.compatibility import renames_v2
@@ -45,6 +46,29 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
# Specially handled functions.
self.function_handle = {}
+ for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
+ "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
+ "tf.train.inverse_time_decay", "tf.train.cosine_decay",
+ "tf.train.cosine_decay_restarts",
+ "tf.train.linear_cosine_decay",
+ "tf.train.noisy_linear_cosine_decay"]:
+ self.function_handle[decay] = functools.partial(
+ self._learning_rate_decay_handler, decay_name=decay)
+
+ @staticmethod
+ def _learning_rate_decay_handler(file_edit_recorder, node, decay_name):
+    comment = ("ERROR: %s now returns a callable instead of a tensor when "
+               "graph building; its behavior during eager execution is "
+               "unchanged (it already returned a callable). The converter "
+               "cannot detect and fix this reliably, so this usage needs to "
+               "be inspected manually.\n") % decay_name
+ file_edit_recorder.add(
+ comment,
+ node.lineno,
+ node.col_offset,
+ decay_name,
+ decay_name,
+ error="%s requires manual check." % decay_name)
if __name__ == "__main__":
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 57ac04de06..3886c1e8b9 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -63,6 +63,19 @@ class TestUpgrade(test_util.TensorFlowTestCase):
_, unused_report, unused_errors, new_text = self._upgrade(text)
self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log(3.8))\n")
+ def testLearningRateDecay(self):
+ for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
+ "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
+ "tf.train.inverse_time_decay", "tf.train.cosine_decay",
+ "tf.train.cosine_decay_restarts",
+ "tf.train.linear_cosine_decay",
+ "tf.train.noisy_linear_cosine_decay"]:
+
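+      # The call text itself is left untouched; the script only reports an
+      # error asking for a manual check.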
+ text = "%s(a, b)\n" % decay
+ _, unused_report, errors, new_text = self._upgrade(text)
+ self.assertEqual(text, new_text)
+ self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
+
class TestUpgradeFiles(test_util.TensorFlowTestCase):