about | summary | refs | log | tree | commit | diff | homepage
path: root/tensorflow/contrib/optimizer_v2
diff options
context:
space:
mode:
author: A. Unique TensorFlower <gardener@tensorflow.org>, 2018-08-14 23:25:58 -0700
committer: TensorFlower Gardener <gardener@tensorflow.org>, 2018-08-14 23:29:23 -0700
commit: 63af3e170e0022c3e40d53dbc25c9d6d54f7c7be (patch)
tree: 3586ef2e3a6ad933a3c7ea1648321199929de2d6 /tensorflow/contrib/optimizer_v2
parent: d623bc479174079134fc5341ca99b2491d0afce7 (diff)
Change the initialization of the mean squared gradient variable of the optimizer_v2 version of RMSProp.
PiperOrigin-RevId: 208774314
Diffstat (limited to 'tensorflow/contrib/optimizer_v2')
-rw-r--r-- tensorflow/contrib/optimizer_v2/rmsprop.py 32
-rw-r--r-- tensorflow/contrib/optimizer_v2/rmsprop_test.py 128
2 files changed, 80 insertions, 80 deletions
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 164ff0ea06..3de53405ec 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -22,7 +22,7 @@ A detailed description of rmsprop.
- divide gradient by the root of this average
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
-mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
delta = - mom
This implementation of RMSProp uses plain momentum, not Nesterov momentum.
@@ -33,7 +33,7 @@ gradients, and uses that average to estimate the variance:
mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
mom = momentum * mom{t-1} + learning_rate * g_t /
- sqrt(mean_square - mean_grad**2 + epsilon)
+ sqrt(mean_square - mean_grad**2)
delta = - mom
"""
@@ -43,7 +43,6 @@ from __future__ import print_function
from tensorflow.contrib.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
from tensorflow.python.training import training_ops
@@ -87,7 +86,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
decay: A float hyperparameter. Discounting factor for the history/coming
gradient.
momentum: A float hyperparameter.
- epsilon: A float hyperparameter. Small value to avoid zero denominator.
+ epsilon: A float hyperparameter. Small value to initialize the average
+ square gradient variable and avoid zero denominator.
use_locking: If True use locks for update operation.
centered: If True, gradients are normalized by the estimated variance of
the gradient; if False, by the uncentered second moment. Setting this to
@@ -106,10 +106,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
def _create_vars(self, var_list, state):
for v in var_list:
- if v.get_shape().is_fully_defined():
- init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
- else:
- init_rms = array_ops.ones_like(v)
+ init_rms = state.get_hyper(
+ "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
state.create_slot_with_initializer(v, init_rms, v.get_shape(),
v.dtype.base_dtype, "rms")
if self._centered:
@@ -129,7 +127,9 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ # epsilon is now the rms initial value and is not added to the
+ # denominator anymore, hence calling the kernel op with epsilon=0.
+ 0,
grad,
use_locking=self._use_locking).op
else:
@@ -140,7 +140,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad,
use_locking=self._use_locking).op
@@ -157,7 +157,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad,
use_locking=self._use_locking)
else:
@@ -168,7 +168,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad,
use_locking=self._use_locking)
@@ -185,7 +185,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad.values,
grad.indices,
use_locking=self._use_locking)
@@ -197,7 +197,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad.values,
grad.indices,
use_locking=self._use_locking)
@@ -215,7 +215,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad,
indices,
use_locking=self._use_locking)
@@ -227,7 +227,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
state.get_hyper("learning_rate", var.dtype.base_dtype),
state.get_hyper("decay", var.dtype.base_dtype),
state.get_hyper("momentum", var.dtype.base_dtype),
- state.get_hyper("epsilon", var.dtype.base_dtype),
+ 0,
grad,
indices,
use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index dc23ef241a..628d0418dd 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -39,34 +39,34 @@ _DATA_TYPES = [dtypes.half, dtypes.float32]
_TEST_PARAM_VALUES = [
# learning_rate, decay, momentum, epsilon, centered, use_resource
- [0.5, 0.9, 0.0, 1e-3, True, False],
- [0.5, 0.9, 0.0, 1e-3, False, False],
- [0.5, 0.9, 0.0, 1e-3, True, True],
- [0.5, 0.9, 0.0, 1e-3, False, True],
- [0.1, 0.9, 0.0, 1e-3, True, False],
- [0.5, 0.95, 0.0, 1e-3, False, False],
- [0.5, 0.95, 0.0, 1e-5, True, False],
- [0.5, 0.95, 0.9, 1e-5, True, False],
+ [0.5, 0.9, 0.0, 1.0, True, False],
+ [0.5, 0.9, 0.0, 1.0, False, False],
+ [0.5, 0.9, 0.0, 1.0, True, True],
+ [0.5, 0.9, 0.0, 1.0, False, True],
+ [0.1, 0.9, 0.0, 1.0, True, False],
+ [0.5, 0.95, 0.0, 1.0, False, False],
+ [0.5, 0.8, 0.0, 1e-3, True, False],
+ [0.5, 0.8, 0.9, 1e-3, True, False],
]
class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
- epsilon, centered):
+ centered):
rms_t = rms * decay + (1 - decay) * g * g
- denom_t = rms_t + epsilon
if centered:
mg_t = mg * decay + (1 - decay) * g
- denom_t -= mg_t * mg_t
+ denom_t = rms_t - mg_t * mg_t
else:
mg_t = mg
+ denom_t = rms_t
mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
var_t = var - mom_t
return var_t, mg_t, rms_t, mom_t
def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
- lr, decay, momentum, epsilon, centered):
+ lr, decay, momentum, centered):
mg_t = copy.deepcopy(mg)
rms_t = copy.deepcopy(rms)
mom_t = copy.deepcopy(mom)
@@ -75,7 +75,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
gindex = gindexs[i]
gvalue = gvalues[i]
rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
- denom_t = rms_t[gindex] + epsilon
+ denom_t = rms_t[gindex]
if centered:
mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
denom_t -= mg_t[gindex] * mg_t[gindex]
@@ -129,8 +129,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+ rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -144,10 +144,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
- decay, momentum, epsilon, centered)
+ decay, momentum, centered)
var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
- decay, momentum, epsilon, centered)
+ decay, momentum, centered)
# Validate updated params
if centered:
@@ -191,7 +191,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
loss = pred * pred
sgd_op = rmsprop.RMSPropOptimizer(
learning_rate=1.0,
- decay=0.0,
+ decay=0.1,
momentum=0.0,
epsilon=1.0,
centered=True).minimize(loss)
@@ -202,7 +202,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
sgd_op.run()
# Validate updated params
self.assertAllCloseAccordingToType(
- [[-111, -138]], var0.eval(), atol=0.01)
+ [[-7/3.0, -4/3.0]], var0.eval(), atol=0.01)
@parameterized.named_parameters(
*test_util.generate_combinations_with_testcase_name(
@@ -251,8 +251,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
- rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
- rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+ rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+ rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
@@ -266,10 +266,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
- learning_rate, decay, momentum, epsilon, centered)
+ learning_rate, decay, momentum, centered)
var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
- learning_rate, decay, momentum, epsilon, centered)
+ learning_rate, decay, momentum, centered)
# Validate updated params
if centered:
@@ -317,13 +317,13 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
# Check the parameters.
self.assertAllCloseAccordingToType(
np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901))
]), var0.eval())
self.assertAllCloseAccordingToType(
np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
]), var1.eval())
# Step 2: the root mean square accumulators contain the previous update.
update.run()
@@ -335,17 +335,17 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
# Check the parameters.
self.assertAllCloseAccordingToType(
np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
]), var0.eval())
self.assertAllCloseAccordingToType(
np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
]), var1.eval())
@parameterized.parameters(_DATA_TYPES)
@@ -357,7 +357,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
opt = rmsprop.RMSPropOptimizer(
- learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+ learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1.0)
update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
variables.global_variables_initializer().run()
@@ -383,22 +383,22 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
np.array([0.90001, 0.90001]), rms1.eval())
# Check the momentum accumulators
self.assertAllCloseAccordingToType(
- np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
- (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+ np.array([(0.1 * 2.0 / math.sqrt(0.901)),
+ (0.1 * 2.0 / math.sqrt(0.901))]), mom0.eval())
self.assertAllCloseAccordingToType(
- np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
- (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+ np.array([(0.01 * 2.0 / math.sqrt(0.90001)),
+ (0.01 * 2.0 / math.sqrt(0.90001))]), mom1.eval())
# Check that the parameters.
self.assertAllCloseAccordingToType(
np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901))
]), var0.eval())
self.assertAllCloseAccordingToType(
np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
]), var1.eval())
# Step 2: the root mean square accumulators contain the previous update.
@@ -410,38 +410,38 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
self.assertAllCloseAccordingToType(
np.array([
- 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
- 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+ 0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+ 0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
]), mom0.eval())
self.assertAllCloseAccordingToType(
np.array([
- 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
- 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+ 0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+ 0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
]), mom1.eval())
# Check the parameters.
self.assertAllCloseAccordingToType(
np.array([
- 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
- (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
- 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
- (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
- (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+ 1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+ (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))),
+ 2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+ (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+ (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)))
]), var0.eval())
self.assertAllCloseAccordingToType(
np.array([
- 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
- (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
- 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
- (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
- (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+ 3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+ (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))),
+ 4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+ (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+ (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)))
]), var1.eval())