| author | A. Unique TensorFlower <gardener@tensorflow.org> | 2018-09-28 18:22:13 -0700 |
|---|---|---|
| committer | TensorFlower Gardener <gardener@tensorflow.org> | 2018-09-28 18:25:39 -0700 |
| commit | b34ddf043324e52ee0acdfe62cb18beab7fed08e (patch) | |
| tree | deafa347fa337350521f6784ad1e276dd2a4bd54 /tensorflow/contrib/tpu | |
| parent | 5f822d694af6e4aa57fe8a426032a91dc61e30d6 (diff) | |
Added a flag to enable the non-lazy Adam optimizer implementation for TPU
embeddings (the actual implementation is pending).
Added comments with pointers to the C++ implementations of the optimizers.
PiperOrigin-RevId: 215026002
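
As a usage sketch of the new field, assuming the generated Python bindings for this proto are importable from `tensorflow.contrib.tpu.proto` (the import path is an assumption; the field names come from the diff below):

```python
from google.protobuf import text_format
# Assumed import path for the generated bindings of optimization_parameters.proto.
from tensorflow.contrib.tpu.proto import optimization_parameters_pb2 as opt_pb2

adam = opt_pb2.AdamParameters()
adam.beta1 = 0.9
adam.beta2 = 0.999
adam.epsilon = 1e-8
# New flag from this change: request dense (non-lazy) updates. Per the diff
# comment, use_gradient_accumulation must also be enabled for correct results.
adam.use_non_lazy_adam = True

print(text_format.MessageToString(adam))
```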
Diffstat (limited to 'tensorflow/contrib/tpu')
| -rw-r--r-- | tensorflow/contrib/tpu/proto/optimization_parameters.proto | 33 |
1 file changed, 33 insertions(+), 0 deletions(-)
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index fc1320501b..a43f45554f 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -22,13 +22,22 @@ message LearningRate {
   }
 }
 
+// Each optimizer's parameter proto has a link to its documentation and CPU
+// implementation (if available) for user reference.
+
+// https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
 message AdagradParameters {
   float initial_accumulator = 1;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423
 message StochasticGradientDescentParameters {
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192
 message FtrlParameters {
   float l1 = 1;
   float l2 = 2;
@@ -41,21 +50,38 @@ message FtrlParameters {
 // learning rate feature instead, setting the learning rate to:
 // user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
 // Here, t is the current timestep.
+//
+// https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
 // https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
+//
+// Note that the code by default implements the lazy version of Adam
+// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer)
+// unless the use_non_lazy_adam parameter is set, in which case it implements
+// the normal version of Adam that updates all parameters in the embedding
+// table, even for entries that are not used in the current minibatch
+// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
+// use_non_lazy_adam is enabled, use_gradient_accumulation is also required in
+// order to get correct results; a warning will be printed otherwise (which may
+// change to an error in the future).
 message AdamParameters {
   float beta1 = 3;
   float beta2 = 4;
   float epsilon = 5;
   float initial_m = 6;
   float initial_v = 7;
+  bool use_non_lazy_adam = 8;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271
 message MomentumParameters {
   float momentum = 1;
   bool use_nesterov = 2;
   float initial_accum = 3;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356
 message RmsPropParameters {
   float rho = 1;
   float momentum = 2;
@@ -64,6 +90,8 @@ message RmsPropParameters {
   float initial_mom = 5;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372
 message CenteredRmsPropParameters {
   float rho = 1;
   float momentum = 2;
@@ -73,6 +101,7 @@ message CenteredRmsPropParameters {
   float initial_mg = 6;
 }
 
+// Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
 message MdlAdagradLightParameters {
   float l2 = 1;
   float lr_power = 2;
@@ -91,6 +120,8 @@ message MdlAdagradLightParameters {
   float initial_benefit = 15;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/AdadeltaOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68
 message AdadeltaParameters {
   float rho = 1;
   float epsilon = 2;
@@ -98,6 +129,8 @@ message AdadeltaParameters {
   float initial_update = 4;
 }
 
+// https://www.tensorflow.org/api_docs/python/tf/train/ProximalAdagradOptimizer
+// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164
 message ProximalAdagradParameters {
   float l1 = 1;
   float l2 = 2;
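
The dynamic learning rate quoted in the Adam comment above can be made concrete with a small sketch (an illustration of that formula only, not code from this commit):

```python
import math

def effective_adam_lr(user_lr, beta1, beta2, t):
    """user_lr * sqrt(1 - beta2^t) / (1 - beta1^t), per the proto comment."""
    return user_lr * math.sqrt(1.0 - beta2 ** t) / (1.0 - beta1 ** t)

# With the common defaults beta1=0.9, beta2=0.999 and user_lr=0.001:
print(effective_adam_lr(0.001, 0.9, 0.999, 1))     # ~3.16e-04 at t=1
print(effective_adam_lr(0.001, 0.9, 0.999, 1000))  # ~7.95e-04, approaching user_lr
```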
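Likewise, a minimal NumPy sketch of the lazy vs. non-lazy distinction that the new flag controls (purely illustrative and an assumption on my part; the TPU implementation is pending per the commit message, and bias correction is folded into the learning rate as described above):

```python
import numpy as np

def adam_embedding_step(table, m, v, grad_by_row, lr, beta1, beta2, eps,
                        use_non_lazy_adam):
    # Lazy Adam touches only the rows that received gradients this minibatch;
    # non-lazy Adam updates every row, since m keeps decaying (and hence the
    # parameters keep moving) even for rows whose gradient is zero.
    rows = range(table.shape[0]) if use_non_lazy_adam else grad_by_row.keys()
    for r in rows:
        g = grad_by_row.get(r, np.zeros(table.shape[1]))
        m[r] = beta1 * m[r] + (1 - beta1) * g
        v[r] = beta2 * v[r] + (1 - beta2) * g * g
        table[r] -= lr * m[r] / (np.sqrt(v[r]) + eps)

# Example: a 4-row embedding table where only rows 1 and 3 got gradients.
rng = np.random.default_rng(0)
table = rng.normal(size=(4, 2))
m = np.zeros((4, 2))
v = np.zeros((4, 2))
grads = {1: np.ones(2), 3: -np.ones(2)}
adam_embedding_step(table, m, v, grads, 0.01, 0.9, 0.999, 1e-8,
                    use_non_lazy_adam=True)
```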