author     A. Unique TensorFlower <gardener@tensorflow.org>  2018-08-14 23:25:58 -0700
committer  TensorFlower Gardener <gardener@tensorflow.org>   2018-08-14 23:29:23 -0700
commit     63af3e170e0022c3e40d53dbc25c9d6d54f7c7be (patch)
tree       3586ef2e3a6ad933a3c7ea1648321199929de2d6 /tensorflow/contrib/optimizer_v2
parent     d623bc479174079134fc5341ca99b2491d0afce7 (diff)
Change the initialization of the mean squared gradient variable of the optimizer_v2 version of RMSProp.
PiperOrigin-RevId: 208774314
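
In short: the "rms" slot used to be initialized to ones, with epsilon added under the square root on every step; it is now initialized to epsilon, and the kernel ops are called with epsilon=0. A minimal NumPy sketch of the two update rules (hypothetical helper names; plain RMSProp, no momentum or centering):

import numpy as np

def rmsprop_step_old(var, g, ms, lr, decay, epsilon):
  # Old behavior: ms starts at 1.0; epsilon is added under the sqrt each step.
  ms = decay * ms + (1 - decay) * g * g
  return var - lr * g / np.sqrt(ms + epsilon), ms

def rmsprop_step_new(var, g, ms, lr, decay):
  # New behavior: ms starts at epsilon; nothing extra goes under the sqrt.
  ms = decay * ms + (1 - decay) * g * g
  return var - lr * g / np.sqrt(ms), ms

Note that the two rules are not numerically identical: in the new one the epsilon contribution decays geometrically with decay rather than staying constant. That is why the expected values in rmsprop_test.py below change from sqrt(ms + epsilon) to sqrt(ms), with ms seeded at epsilon.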
Diffstat (limited to 'tensorflow/contrib/optimizer_v2')
 -rw-r--r--  tensorflow/contrib/optimizer_v2/rmsprop.py       |  32
 -rw-r--r--  tensorflow/contrib/optimizer_v2/rmsprop_test.py  | 128
 2 files changed, 80 insertions, 80 deletions
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop.py b/tensorflow/contrib/optimizer_v2/rmsprop.py
index 164ff0ea06..3de53405ec 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop.py
@@ -22,7 +22,7 @@ A detailed description of rmsprop.
 - divide gradient by the root of this average
 
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
-mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square + epsilon)
+mom = momentum * mom{t-1} + learning_rate * g_t / sqrt(mean_square)
 delta = - mom
 
 This implementation of RMSProp uses plain momentum, not Nesterov momentum.
@@ -33,7 +33,7 @@ gradients, and uses that average to estimate the variance:
 mean_grad = decay * mean_square{t-1} + (1-decay) * gradient
 mean_square = decay * mean_square{t-1} + (1-decay) * gradient ** 2
 mom = momentum * mom{t-1} + learning_rate * g_t /
-    sqrt(mean_square - mean_grad**2 + epsilon)
+    sqrt(mean_square - mean_grad**2)
 delta = - mom
 
 """
@@ -43,7 +43,6 @@ from __future__ import print_function
 
 from tensorflow.contrib.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.training import training_ops
 
 
@@ -87,7 +86,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
       decay: A float hyperparameter. Discounting factor for the history/coming
         gradient.
       momentum: A float hyperparameter.
-      epsilon: A float hyperparameter. Small value to avoid zero denominator.
+      epsilon: A float hyperparameter. Small value to initialize the average
+        square gradient variable and avoid zero denominator.
       use_locking: If True use locks for update operation.
       centered: If True, gradients are normalized by the estimated variance of
         the gradient; if False, by the uncentered second moment. Setting this to
@@ -106,10 +106,8 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
 
   def _create_vars(self, var_list, state):
     for v in var_list:
-      if v.get_shape().is_fully_defined():
-        init_rms = init_ops.ones_initializer(dtype=v.dtype.base_dtype)
-      else:
-        init_rms = array_ops.ones_like(v)
+      init_rms = state.get_hyper(
+          "epsilon", v.dtype.base_dtype) * array_ops.ones_like(v)
       state.create_slot_with_initializer(v, init_rms, v.get_shape(),
                                          v.dtype.base_dtype, "rms")
       if self._centered:
@@ -129,7 +127,9 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          # epsilon is now the rms initial value and is not added to the
+          # denominator anymore, hence calling the kernel op with epsilon=0.
+          0,
           grad,
           use_locking=self._use_locking).op
     else:
@@ -140,7 +140,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking).op
 
@@ -157,7 +157,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
     else:
@@ -168,7 +168,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           use_locking=self._use_locking)
 
@@ -185,7 +185,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -197,7 +197,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad.values,
           grad.indices,
           use_locking=self._use_locking)
@@ -215,7 +215,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
@@ -227,7 +227,7 @@ class RMSPropOptimizer(optimizer_v2.OptimizerV2):
           state.get_hyper("learning_rate", var.dtype.base_dtype),
           state.get_hyper("decay", var.dtype.base_dtype),
           state.get_hyper("momentum", var.dtype.base_dtype),
-          state.get_hyper("epsilon", var.dtype.base_dtype),
+          0,
           grad,
           indices,
           use_locking=self._use_locking)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index dc23ef241a..628d0418dd 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -39,34 +39,34 @@ _DATA_TYPES = [dtypes.half, dtypes.float32]
 
 _TEST_PARAM_VALUES = [
     # learning_rate, decay, momentum, epsilon, centered, use_resource
-    [0.5, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.9, 0.0, 1e-3, False, False],
-    [0.5, 0.9, 0.0, 1e-3, True, True],
-    [0.5, 0.9, 0.0, 1e-3, False, True],
-    [0.1, 0.9, 0.0, 1e-3, True, False],
-    [0.5, 0.95, 0.0, 1e-3, False, False],
-    [0.5, 0.95, 0.0, 1e-5, True, False],
-    [0.5, 0.95, 0.9, 1e-5, True, False],
+    [0.5, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.9, 0.0, 1.0, False, False],
+    [0.5, 0.9, 0.0, 1.0, True, True],
+    [0.5, 0.9, 0.0, 1.0, False, True],
+    [0.1, 0.9, 0.0, 1.0, True, False],
+    [0.5, 0.95, 0.0, 1.0, False, False],
+    [0.5, 0.8, 0.0, 1e-3, True, False],
+    [0.5, 0.8, 0.9, 1e-3, True, False],
 ]
 
 
 class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
-                            epsilon, centered):
+                            centered):
     rms_t = rms * decay + (1 - decay) * g * g
-    denom_t = rms_t + epsilon
     if centered:
       mg_t = mg * decay + (1 - decay) * g
-      denom_t -= mg_t * mg_t
+      denom_t = rms_t - mg_t * mg_t
     else:
       mg_t = mg
+      denom_t = rms_t
     mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
     var_t = var - mom_t
     return var_t, mg_t, rms_t, mom_t
 
   def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
-                                   lr, decay, momentum, epsilon, centered):
+                                   lr, decay, momentum, centered):
     mg_t = copy.deepcopy(mg)
     rms_t = copy.deepcopy(rms)
     mom_t = copy.deepcopy(mom)
@@ -75,7 +75,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       gindex = gindexs[i]
       gvalue = gvalues[i]
       rms_t[gindex] = rms[gindex] * decay + (1 - decay) * gvalue * gvalue
-      denom_t = rms_t[gindex] + epsilon
+      denom_t = rms_t[gindex]
       if centered:
         mg_t[gindex] = mg_t[gindex] * decay + (1 - decay) * gvalue
         denom_t -= mg_t[gindex] * mg_t[gindex]
@@ -129,8 +129,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
         mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -144,10 +144,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
               var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
-              decay, momentum, epsilon, centered)
+              decay, momentum, centered)
           var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
               var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
-              decay, momentum, epsilon, centered)
+              decay, momentum, centered)
 
           # Validate updated params
           if centered:
@@ -191,7 +191,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
         loss = pred * pred
         sgd_op = rmsprop.RMSPropOptimizer(
             learning_rate=1.0,
-            decay=0.0,
+            decay=0.1,
            momentum=0.0,
            epsilon=1.0,
            centered=True).minimize(loss)
@@ -202,7 +202,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+            [[-7/3.0, -4/3.0]], var0.eval(), atol=0.01)
 
   @parameterized.named_parameters(
       *test_util.generate_combinations_with_testcase_name(
@@ -251,8 +251,8 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
         mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([epsilon, epsilon], dtype=dtype.as_numpy_dtype)
         mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
@@ -266,10 +266,10 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
-              learning_rate, decay, momentum, epsilon, centered)
+              learning_rate, decay, momentum, centered)
          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
-              learning_rate, decay, momentum, epsilon, centered)
+              learning_rate, decay, momentum, centered)
 
           # Validate updated params
           if centered:
@@ -317,13 +317,13 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       # Check the parameters.
       self.assertAllCloseAccordingToType(
           np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
           np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
          ]), var1.eval())
       # Step 2: the root mean square accumulators contain the previous update.
       update.run()
@@ -335,17 +335,17 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       # Check the parameters.
       self.assertAllCloseAccordingToType(
          np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
           ]), var0.eval())
       self.assertAllCloseAccordingToType(
          np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
          ]), var1.eval())
 
   @parameterized.parameters(_DATA_TYPES)
@@ -357,7 +357,7 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
       grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
 
       opt = rmsprop.RMSPropOptimizer(
-          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1.0)
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
@@ -383,22 +383,22 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
           np.array([0.90001, 0.90001]), rms1.eval())
       # Check the momentum accumulators
       self.assertAllCloseAccordingToType(
-          np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                    (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+          np.array([(0.1 * 2.0 / math.sqrt(0.901)),
+                    (0.1 * 2.0 / math.sqrt(0.901))]), mom0.eval())
       self.assertAllCloseAccordingToType(
-          np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                    (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+          np.array([(0.01 * 2.0 / math.sqrt(0.90001)),
+                    (0.01 * 2.0 / math.sqrt(0.90001))]), mom1.eval())
       # Check that the parameters.
      self.assertAllCloseAccordingToType(
          np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901))
          ]), var0.eval())
       self.assertAllCloseAccordingToType(
          np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001))
          ]), var1.eval())
 
       # Step 2: the root mean square accumulators contain the previous update.
@@ -410,38 +410,38 @@ class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
           np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
       self.assertAllCloseAccordingToType(
          np.array([
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
-              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)),
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))
          ]), mom0.eval())
       self.assertAllCloseAccordingToType(
          np.array([
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
-              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)),
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))
          ]), mom1.eval())
       # Check the parameters.
       self.assertAllCloseAccordingToType(
          np.array([
-              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
-              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001))),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001)))
          ]), var0.eval())
       self.assertAllCloseAccordingToType(
          np.array([
-              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
-              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5))),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5)))
          ]), var1.eval())
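
A sanity check on the updated expectation in the centered minimize test above. With the old settings (learning_rate=1.0, decay=0.0, epsilon=1.0, centered=True) the denominator collapses to sqrt(g*g - g*g + 1) = 1, so the old expected value [[-111, -138]] implies a first-step gradient of g = [112, 140]. Replaying that step under the new rule (rms seeded at epsilon=1.0, decay=0.1, kernel epsilon of 0), a sketch:

import numpy as np

def centered_first_step(var, g, lr=1.0, decay=0.1, epsilon=1.0):
  # First update with the new initialization: rms starts at epsilon.
  rms = decay * epsilon + (1 - decay) * g * g  # 0.1 + 0.9 * g**2
  mg = (1 - decay) * g                         # 0.9 * g
  denom = rms - mg * mg                        # 0.1 + 0.09 * g**2
  return var - lr * g / np.sqrt(denom)

print(centered_first_step(np.array([1.0, 2.0]), np.array([112.0, 140.0])))
# -> approximately [-2.3332, -1.3332]

For any large g, g / sqrt(0.1 + 0.09 * g**2) approaches 1 / 0.3 = 10/3, so the updated parameters tend to [1 - 10/3, 2 - 10/3] = [-7/3, -4/3], matching the test's new expectation within its atol of 0.01.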