path: root/tensorflow/contrib/bayesflow
author     Joshua V. Dillon <jvdillon@google.com>    2018-03-09 15:27:50 -0800
committer  TensorFlower Gardener <gardener@tensorflow.org>    2018-03-09 15:32:09 -0800
commit     be51a9fac97d1497f59ecfc3a9aec4b5f84c9b76 (patch)
tree       00bf8150736eca422ab559634bce3e7949b58713 /tensorflow/contrib/bayesflow
parent     88c75b081fe17f04c58c954a76d673abd15255cb (diff)
Migrate tf.contrib.bayesflow.optimizers to tfp.optimizers.
PiperOrigin-RevId: 188547477
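For downstream code the practical effect of this change is an import-path move. Below is a minimal before/after usage sketch; the "before" form mirrors the tests removed in this commit, while the module and class spelling under tensorflow_probability is an assumption based on the commit message and should be checked against the TFP release in use. `grads` and `var_list` are assumed to come from the existing training setup.

# Before this commit (usage as exercised by the tests removed below):
from tensorflow.contrib.bayesflow.python.ops.optimizers import SGLDOptimizer

opt = SGLDOptimizer(learning_rate=3.0, preconditioner_decay_rate=0.95)
train_op = opt.apply_gradients(zip(grads, var_list))

# After the migration (module name per the commit message; the exact class name
# under tensorflow_probability is an assumption, check the tfp documentation):
import tensorflow_probability as tfp

opt = tfp.optimizers.SGLDOptimizer(learning_rate=3.0, preconditioner_decay_rate=0.95)
train_op = opt.apply_gradients(zip(grads, var_list))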
Diffstat (limited to 'tensorflow/contrib/bayesflow')
-rw-r--r--  tensorflow/contrib/bayesflow/BUILD                                                44
-rw-r--r--  tensorflow/contrib/bayesflow/__init__.py                                           2
-rw-r--r--  tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py          212
-rw-r--r--  tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py  268
-rw-r--r--  tensorflow/contrib/bayesflow/python/ops/optimizers.py                              36
-rw-r--r--  tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py                         220
-rw-r--r--  tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py              279
7 files changed, 0 insertions, 1061 deletions
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD
index e1b34d6deb..88956f0512 100644
--- a/tensorflow/contrib/bayesflow/BUILD
+++ b/tensorflow/contrib/bayesflow/BUILD
@@ -119,50 +119,6 @@ cuda_py_test(
tags = ["nomsan"],
)
-cuda_py_test(
- name = "sgld_optimizer_test",
- size = "small",
- srcs = ["python/kernel_tests/sgld_optimizer_test.py"],
- additional_deps = [
- ":bayesflow_py",
- "//third_party/py/numpy",
- "//tensorflow/contrib/distributions:distributions_py",
- "//tensorflow/contrib/layers:layers_py",
- "//tensorflow/python/ops/distributions",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:framework",
- "//tensorflow/python:framework_for_generated_wrappers",
- "//tensorflow/python:framework_test_lib",
- "//tensorflow/python:gradients",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:platform_test",
- "//tensorflow/python:random_seed",
- ],
- tags = ["notsan"],
-)
-
-cuda_py_test(
- name = "variational_sgd_optimizer_test",
- size = "small",
- srcs = ["python/kernel_tests/variational_sgd_optimizer_test.py"],
- additional_deps = [
- ":bayesflow_py",
- "//third_party/py/numpy",
- "//tensorflow/contrib/distributions:distributions_py",
- "//tensorflow/contrib/layers:layers_py",
- "//tensorflow/python/ops/distributions",
- "//tensorflow/python:client_testlib",
- "//tensorflow/python:framework",
- "//tensorflow/python:framework_for_generated_wrappers",
- "//tensorflow/python:framework_test_lib",
- "//tensorflow/python:gradients",
- "//tensorflow/python:math_ops",
- "//tensorflow/python:platform_test",
- "//tensorflow/python:random_seed",
- ],
- tags = ["notsan"],
-)
-
filegroup(
name = "all_files",
srcs = glob(
diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py
index bff8ac2cf6..89dfa583a4 100644
--- a/tensorflow/contrib/bayesflow/__init__.py
+++ b/tensorflow/contrib/bayesflow/__init__.py
@@ -25,7 +25,6 @@ from tensorflow.contrib.bayesflow.python.ops import custom_grad
from tensorflow.contrib.bayesflow.python.ops import hmc
from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings
from tensorflow.contrib.bayesflow.python.ops import monte_carlo
-from tensorflow.contrib.bayesflow.python.ops import optimizers
# pylint: enable=unused-import,line-too-long
from tensorflow.python.util.all_util import remove_undocumented
@@ -37,7 +36,6 @@ _allowed_symbols = [
'hmc',
'metropolis_hastings',
'monte_carlo',
- 'optimizers',
'special_math',
'stochastic_variables',
'variational_inference',
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
deleted file mode 100644
index 756c25683b..0000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/sgld_optimizer_test.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional test for GradientDescent."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import math
-from tensorflow.contrib.bayesflow.python.ops.optimizers import SGLDOptimizer
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class SGLDOptimizerTest(test.TestCase):
-
- def testBasic(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.53
- sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
- sgd_op = sgd_optimizer.apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
- (1 - decay_rate) * 0.1**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
- grads_scaled = (0.5 * 0.01 / math.sqrt(
- decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
- self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
-
- def testBasicMultiInstance(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- vara = variables.Variable([1.1, 2.1], dtype=dtype)
- varb = variables.Variable([3.0, 4.0], dtype=dtype)
- gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
- gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.5
- sgd_optimizer = SGLDOptimizer(3.0, preconditioner_decay_rate=decay_rate)
- sgd_op = sgd_optimizer.apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- sgd_optimizer2 = SGLDOptimizer(
- 3.0, preconditioner_decay_rate=decay_rate)
- sgd_op2 = sgd_optimizer2.apply_gradients(
- zip([gradsa, gradsb], [vara, varb]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
-
- # Run 1 step of sgd
- sgd_op.run()
- sgd_op2.run()
- # Validate updated params
- grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
- (1 - decay_rate) * 0.1**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
- self.assertAllCloseAccordingToType(
- [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], vara.eval())
-
- grads_scaled = (0.5 * 0.01 / math.sqrt(
- decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
- self.assertAllCloseAccordingToType(
- [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], varb.eval())
- self.assertNotEqual(sgd_optimizer.variable_scope,
- sgd_optimizer2.variable_scope)
- self.assertNotEqual(sgd_optimizer.variable_scope.name,
- sgd_optimizer2.variable_scope.name)
- self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
- self.assertAllCloseAccordingToType(1, sgd_optimizer2._counter.eval())
-
- def testTensorLearningRate(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- lrate = constant_op.constant(3.0)
- decay_rate = 0.5
- sgd_op = SGLDOptimizer(
- lrate, preconditioner_decay_rate=constant_op.constant(
- decay_rate)).apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
- (1 - decay_rate) * 0.1**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
- grads_scaled = (0.5 * 0.01 / math.sqrt(
- decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
-
- def testGradWrtRef(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- opt = SGLDOptimizer(3.0)
- values = [1.0, 3.0]
- vars_ = [variables.Variable([v], dtype=dtype) for v in values]
- grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
- variables.global_variables_initializer().run()
- for grad, _ in grads_and_vars:
- self.assertAllCloseAccordingToType([1.0], grad.eval())
-
- def testWithGlobalStep(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- global_step = variables.Variable(0, trainable=False)
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.1
- sgd_op = SGLDOptimizer(
- 3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
- zip([grads0, grads1], [var0, var1]), global_step=global_step)
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
-
- # Validate updated params and global_step
- grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
- (1 - decay_rate) * 0.1**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [1.1 - 3.0 * grads_scaled, 2.1 - 3.0 * grads_scaled], var0.eval())
- grads_scaled = (0.5 * 0.01 / math.sqrt(
- decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [3.0 - 3.0 * grads_scaled, 4.0 - 3.0 * grads_scaled], var1.eval())
- self.assertAllCloseAccordingToType(1, global_step.eval())
-
- def testSparseBasic(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
- var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
- grads0 = ops.IndexedSlices(
- constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
- constant_op.constant([0]), constant_op.constant([2, 1]))
- grads1 = ops.IndexedSlices(
- constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
- constant_op.constant([1]), constant_op.constant([2, 1]))
- decay_rate = 0.9
- sgd_op = SGLDOptimizer(
- 3.0, preconditioner_decay_rate=decay_rate).apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
- self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- grads_scaled = (0.5 * 0.1 / math.sqrt(decay_rate +
- (1 - decay_rate) * 0.1**2 + 1e-8))
- self.assertAllCloseAccordingToType([[1.1 - 3.0 * grads_scaled], [2.1]],
- var0.eval())
- grads_scaled = (0.5 * 0.01 / math.sqrt(
- decay_rate + (1 - decay_rate) * 0.01**2 + 1e-8))
- self.assertAllCloseAccordingToType(
- [[3.0 - 3.0 * 0], [4.0 - 3.0 * grads_scaled]], var1.eval())
-
-
-if __name__ == "__main__":
- test.main()
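The expected values asserted in the deleted tests above follow directly from the preconditioned SGLD update during burn-in, when no noise is injected. A standalone arithmetic sketch reproducing the testBasic expectation for var0:

import math

lr, decay, bias = 3.0, 0.53, 1e-8     # values used in testBasic
grad, var0 = 0.1, 1.1

# The 'rms' slot starts at 1.0 and is updated as an EMA of squared gradients:
rms = decay * 1.0 + (1.0 - decay) * grad ** 2
# Preconditioned half-gradient; num_pseudo_batches defaults to 1 and the noise
# term is zero because the step counter is still within the burn-in period:
grads_scaled = 0.5 * grad / math.sqrt(rms + bias)
print(var0 - lr * grads_scaled)       # the value asserted for var0[0]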
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
deleted file mode 100644
index 83c64dbe0f..0000000000
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/variational_sgd_optimizer_test.py
+++ /dev/null
@@ -1,268 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Functional test for GradientDescent."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.contrib.bayesflow.python.ops.optimizers import VariationalSGDOptimizer
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class VariationalSGDOptimizerTest(test.TestCase):
-
- def testBasic(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.53
- sgd_op = VariationalSGDOptimizer(
- 1,
- 1,
- preconditioner_decay_rate=decay_rate,
- max_learning_rate=3.0,
- burnin_max_learning_rate=3.0,
- use_single_learning_rate=True).apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
- var1.eval())
-
- def testBasicMultiInstance(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- vara = variables.Variable([1.1, 2.1], dtype=dtype)
- varb = variables.Variable([3.0, 4.0], dtype=dtype)
- gradsa = constant_op.constant([0.1, 0.1], dtype=dtype)
- gradsb = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.5
- batch_size = 2
- total_num_examples = 10
- optimizer = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=1.0,
- burnin_max_learning_rate=3.0,
- preconditioner_decay_rate=decay_rate)
- sgd_op = optimizer.apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- optimizer2 = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=1.0,
- burnin_max_learning_rate=10.0,
- burnin=0,
- preconditioner_decay_rate=decay_rate)
- sgd_op2 = optimizer2.apply_gradients(
- zip([gradsa, gradsb], [vara, varb]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- self.assertAllCloseAccordingToType([1.1, 2.1], vara.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], varb.eval())
-
- # Run 1 step of sgd
- sgd_op.run()
- sgd_op2.run()
- # Validate updated params
- self.assertAllCloseAccordingToType([1.1 - 3. * 0.1, 2.1 - 3. * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([1.1 - 0.1, 2.1 - 0.1], vara.eval())
-
- self.assertAllCloseAccordingToType([3.0 - 3. * 0.01, 4.0 - 3. * 0.01],
- var1.eval())
- self.assertAllCloseAccordingToType([3.0 - 0.01, 4.0 - 0.01],
- varb.eval())
- self.assertNotEqual(optimizer.variable_scope,
- optimizer2.variable_scope)
- self.assertNotEqual(optimizer.variable_scope.name,
- optimizer2.variable_scope.name)
- self.assertAllCloseAccordingToType(1, optimizer._counter.eval())
- self.assertAllCloseAccordingToType(1, optimizer2._counter.eval())
-
- def testTensorLearningRate(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- lrate = constant_op.constant(3.0)
- decay_rate = 0.5
- batch_size = 2
- total_num_examples = 10
- sgd_op = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=lrate,
- burnin=0,
- preconditioner_decay_rate=decay_rate).apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
- var1.eval())
-
- def testTensorDecayLearningRate(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- lrate = variables.Variable(3.0)
- lrate_decay_op = lrate.assign_add(-3.)
- decay_rate = 0.5
- batch_size = 2
- total_num_examples = 10
- optimizer = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=lrate,
- burnin=0,
- preconditioner_decay_rate=decay_rate)
- sgd_op = optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
- var1.eval())
- # Update learning rate to 0
- lrate_decay_op.eval()
- sgd_op.run()
- # Validate params haven't changed
- self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
- var1.eval())
- lrate_decay_op.eval()
-
- with self.assertRaises(errors.InvalidArgumentError):
- sgd_op.run()
-
- def testGradWrtRef(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- opt = VariationalSGDOptimizer(1, 1, max_learning_rate=1.0)
- values = [1.0, 3.0]
- vars_ = [variables.Variable([v], dtype=dtype) for v in values]
- grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
- variables.global_variables_initializer().run()
- for grad, _ in grads_and_vars:
- self.assertAllCloseAccordingToType([1.0], grad.eval())
-
- def testWithGlobalStep(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- global_step = variables.Variable(0, trainable=False)
- var0 = variables.Variable([1.1, 2.1], dtype=dtype)
- var1 = variables.Variable([3.0, 4.0], dtype=dtype)
- grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
- grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
- decay_rate = 0.1
- batch_size = 2
- total_num_examples = 10
- sgd_optimizer = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=3.0,
- burnin=0,
- preconditioner_decay_rate=decay_rate)
- sgd_op = sgd_optimizer.apply_gradients(
- zip([grads0, grads1], [var0, var1]), global_step=global_step)
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([1.1, 2.1], var0.eval())
- self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
-
- # Validate updated params and global_step
- self.assertAllCloseAccordingToType([1.1 - 3.0 * 0.1, 2.1 - 3.0 * 0.1],
- var0.eval())
- self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
- var1.eval())
- self.assertAllCloseAccordingToType(1, global_step.eval())
- self.assertAllCloseAccordingToType(1, sgd_optimizer._counter.eval())
-
- def testSparseBasic(self):
- for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
- with self.test_session():
- var0 = variables.Variable([[1.1], [2.1]], dtype=dtype)
- var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
- grads0 = ops.IndexedSlices(
- constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
- constant_op.constant([0]), constant_op.constant([2, 1]))
- grads1 = ops.IndexedSlices(
- constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
- constant_op.constant([1]), constant_op.constant([2, 1]))
- decay_rate = 0.1
- batch_size = 2
- total_num_examples = 10
- sgd_op = VariationalSGDOptimizer(
- batch_size,
- total_num_examples,
- max_learning_rate=3.0,
- burnin=0,
- preconditioner_decay_rate=decay_rate).apply_gradients(
- zip([grads0, grads1], [var0, var1]))
- variables.global_variables_initializer().run()
- # Fetch params to validate initial values
- self.assertAllCloseAccordingToType([[1.1], [2.1]], var0.eval())
- self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
- # Run 1 step of sgd
- sgd_op.run()
- # Validate updated params
- self.assertAllCloseAccordingToType([[1.1 - 3.0 * 0.1], [2.1]],
- var0.eval())
- self.assertAllCloseAccordingToType(
- [[3.0 - 3.0 * 0], [4.0 - 3.0 * 0.01]], var1.eval())
-
-
-if __name__ == "__main__":
- test.main()
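During burn-in the deleted tests above expect a plain SGD step: the coordinate-wise rate derived from Mandt et al. is clipped to burnin_max_learning_rate, and for these tiny problems the clip binds. A short sketch of the arithmetic behind the testBasic assertion:

grad, var0 = 0.1, 1.1
max_lr = 3.0                          # burnin_max_learning_rate in testBasic
# The Mandt et al. rate 2*B / (N * v_hat) is huge here (B = N = 1, v_hat tiny
# after one update), so the clip at max_lr binds and the step reduces to plain
# SGD at rate 3.0:
print(var0 - max_lr * grad)           # the value asserted for var0[0]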
diff --git a/tensorflow/contrib/bayesflow/python/ops/optimizers.py b/tensorflow/contrib/bayesflow/python/ops/optimizers.py
deleted file mode 100644
index bff6bb7948..0000000000
--- a/tensorflow/contrib/bayesflow/python/ops/optimizers.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Probabilistic optimizer modules.
-
-See @{tf.contrib.bayesflow.optimizers}.
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# go/tf-wildcard-import
-# pylint: disable=wildcard-import
-from tensorflow.contrib.bayesflow.python.ops.sgld_optimizer import *
-from tensorflow.contrib.bayesflow.python.ops.variational_sgd_optimizer import *
-# pylint: enable=wildcard-import
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
- 'SGLDOptimizer',
- 'VariationalSGDOptimizer',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
deleted file mode 100644
index 7786656398..0000000000
--- a/tensorflow/contrib/bayesflow/python/ops/sgld_optimizer.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An optimizer module for stochastic gradient Langevin dynamics."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import training_ops
-
-
-class SGLDOptimizer(optimizer.Optimizer):
- """An optimizer module for stochastic gradient Langevin dynamics.
-
- This implements the preconditioned Stochastic Gradient Langevin Dynamics
- optimizer [1]. The optimization variable is regarded as a sample from the
- posterior under Stochastic Gradient Langevin Dynamics with noise rescaled in
- each dimension according to RMSProp [2].
-
- Note: If a prior is included in the loss, it should be scaled by
- `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
- in the data. I.e., it should be divided by the `num_pseudo_batches` term
- described below.
-
- [1]: "Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural
- Networks." Chunyuan Li, Changyou Chen, David Carlson, Lawrence Carin.
- ArXiv:1512.07666, 2015. https://arxiv.org/abs/1512.07666
- [2]: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-
- Args:
- learning_rate: Scalar `float`-like `Tensor`. The base learning rate for the
- optimizer. Must be tuned to the specific function being minimized.
- preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
- decay rate of the rescaling of the preconditioner (RMSprop). (This is
- "alpha" in [1]). Should be smaller than but nearly `1` to approximate
- sampling from the posterior. (Default: `0.95`)
- num_pseudo_batches: Scalar `int`-like `Tensor`. The effective number of
- minibatches in the data set. Trades off noise and prior with the SGD
- likelihood term. Note: Assumes the loss is taken as the mean over a
- minibatch. Otherwise if the sum was taken, divide this number by the
- batch size. (Default: `1`)
- burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
- gradient statistics to update the preconditioner before starting to draw
- noisy samples. (Default: `25`)
- diagonal_bias: Scalar `float`-like `Tensor`. Term added to the diagonal of
- the preconditioner to prevent the preconditioner from degenerating.
- (Default: `1e-8`)
- name: Python `str` describing ops managed by this function.
- (Default: `"SGLDOptimizer"`)
- variable_scope: Variable scope used for calls to `tf.get_variable`.
- If `None`, a new variable scope is created using name
- `ops.get_default_graph().unique_name(name or default_name)`.
-
- Raises:
- InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
- `(0,1]`.
- """
-
- def __init__(self,
- learning_rate,
- preconditioner_decay_rate=0.95,
- num_pseudo_batches=1,
- burnin=25,
- diagonal_bias=1e-8,
- name=None,
- variable_scope=None):
- default_name = 'SGLDOptimizer'
- with ops.name_scope(name, default_name, [
- learning_rate, preconditioner_decay_rate, num_pseudo_batches, burnin,
- diagonal_bias
- ]):
- if variable_scope is None:
- var_scope_name = ops.get_default_graph().unique_name(
- name or default_name)
- with varscope_ops.variable_scope(var_scope_name) as scope:
- self._variable_scope = scope
- else:
- self._variable_scope = variable_scope
-
- self._preconditioner_decay_rate = ops.convert_to_tensor(
- preconditioner_decay_rate, name='preconditioner_decay_rate')
- self._num_pseudo_batches = ops.convert_to_tensor(
- num_pseudo_batches, name='num_pseudo_batches')
- self._burnin = ops.convert_to_tensor(burnin, name='burnin')
- self._diagonal_bias = ops.convert_to_tensor(
- diagonal_bias, name='diagonal_bias')
- self._learning_rate = ops.convert_to_tensor(
- learning_rate, name='learning_rate')
-
- with varscope_ops.variable_scope(self._variable_scope):
- self._counter = varscope_ops.get_variable(
- 'counter', initializer=0, trainable=False)
-
- self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._preconditioner_decay_rate,
- message='`preconditioner_decay_rate` must be non-negative'),
- check_ops.assert_less_equal(
- self._preconditioner_decay_rate,
- 1.,
- message='`preconditioner_decay_rate` must be at most 1.'),
- ], self._preconditioner_decay_rate)
-
- self._num_pseudo_batches = control_flow_ops.with_dependencies([
- check_ops.assert_greater(
- self._num_pseudo_batches,
- 0,
- message='`num_pseudo_batches` must be greater than zero')
- ], self._num_pseudo_batches)
-
- self._burnin = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._burnin, message='`burnin` must be non-negative'),
- check_ops.assert_integer(
- self._burnin, message='`burnin` must be an integer')
- ], self._burnin)
-
- self._diagonal_bias = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._diagonal_bias,
- message='`diagonal_bias` must be non-negative')
- ], self._diagonal_bias)
-
- super(SGLDOptimizer, self).__init__(use_locking=False,
- name=name or default_name)
-
- def _create_slots(self, var_list):
- for v in var_list:
- init_rms = init_ops.ones_initializer(dtype=v.dtype)
- self._get_or_make_slot_with_initializer(v, init_rms, v.get_shape(),
- v.dtype, 'rms', self._name)
-
- def _prepare(self):
- # We need to put the conversion and check here because a user will likely
- # want to decay the learning rate dynamically.
- self._learning_rate_tensor = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._learning_rate, message='`learning_rate` must be non-negative')
- ], ops.convert_to_tensor(self._learning_rate, name='learning_rate_tensor'))
- self._decay_tensor = ops.convert_to_tensor(
- self._preconditioner_decay_rate, name='preconditioner_decay_rate')
-
- super(SGLDOptimizer, self)._prepare()
-
- def _apply_dense(self, grad, var):
- rms = self.get_slot(var, 'rms')
-
- with ops.control_dependencies([
- self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
- var.dtype.base_dtype))]):
- new_grad = self._apply_noisy_update(rms, grad)
-
- return training_ops.apply_gradient_descent(
- var,
- math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
- new_grad,
- use_locking=self._use_locking).op
-
- def _apply_sparse(self, grad, var):
- rms = self.get_slot(var, 'rms')
-
- with ops.control_dependencies([
- self._update_momentum(rms, grad, math_ops.cast(self._decay_tensor,
- var.dtype.base_dtype))]):
- new_grad = self._apply_noisy_update(rms, grad)
-
- return training_ops.apply_gradient_descent(
- var,
- math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
- new_grad,
- use_locking=self._use_locking).op
-
- def _finish(self, update_ops, name_scope):
- update_ops.append([self._counter.assign_add(1)])
- return control_flow_ops.group(*update_ops, name=name_scope)
-
- @property
- def variable_scope(self):
- """Variable scope of all calls to `tf.get_variable`."""
- return self._variable_scope
-
- def _apply_noisy_update(self, mom, grad):
- # Compute and apply the gradient update following
- # preconditioned Langevin dynamics
- stddev = array_ops.where(
- array_ops.squeeze(self._counter > self._burnin),
- math_ops.cast(math_ops.rsqrt(self._learning_rate), grad.dtype),
- array_ops.zeros([], grad.dtype))
-
- preconditioner = math_ops.rsqrt(
- mom + math_ops.cast(self._diagonal_bias, grad.dtype))
- return (
- 0.5 * preconditioner * grad * math_ops.cast(self._num_pseudo_batches,
- grad.dtype) +
- random_ops.random_normal(array_ops.shape(grad), 1.0, dtype=grad.dtype) *
- stddev * math_ops.sqrt(preconditioner))
-
- def _update_momentum(self, mom, grad, decay):
- # Keep an exponentially weighted moving average of squared gradients.
- # Not thread safe
- return mom.assign_add((1.0 - decay) * (math_ops.square(grad) - mom))
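Stripped of slot bookkeeping, the dense update implemented by `_apply_noisy_update` and `_apply_dense` above fits in a few lines. A NumPy sketch of one post-burn-in pSGLD step, written with zero-mean Gaussian noise as in [1]; it is illustrative, not a line-for-line port:

import numpy as np

def psgld_step(var, grad, rms, lr, decay=0.95, num_pseudo_batches=1, bias=1e-8):
  # EMA of squared gradients; the 'rms' slot is initialized to ones.
  rms = rms + (1.0 - decay) * (grad ** 2 - rms)
  precond = 1.0 / np.sqrt(rms + bias)          # diagonal RMSProp preconditioner
  noise = np.random.normal(size=np.shape(grad))
  # Preconditioned half-gradient plus injected noise whose covariance works out
  # to lr * precond after the gradient-descent step below.
  update = (0.5 * precond * grad * num_pseudo_batches
            + noise * np.sqrt(precond) / np.sqrt(lr))
  return var - lr * update, rms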
diff --git a/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py b/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
deleted file mode 100644
index 4d5f0cfe97..0000000000
--- a/tensorflow/contrib/bayesflow/python/ops/variational_sgd_optimizer.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""An optimizer module for constant stochastic gradient descent."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import init_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
-from tensorflow.python.ops import variable_scope as varscope_ops
-from tensorflow.python.training import optimizer
-from tensorflow.python.training import training_ops
-
-
-class VariationalSGDOptimizer(optimizer.Optimizer):
- """An optimizer module for constant stochastic gradient descent.
-
- This implements an optimizer module for the constant stochastic gradient
- descent algorithm [1]. The optimization variable is regarded as an
- approximate sample from the posterior.
-
- Note: If a prior is included in the loss, it should be scaled by
- `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
- in the data (for this optimizer, `total_num_examples / batch_size`).
-
- [1]: "Stochastic Gradient Descent as Approximate Bayesian Inference
- Stephan Mandt, Matthew D. Hoffman, David M. Blei.
- ArXiv:1704.04289, 2017. https://arxiv.org/abs/1704.04289
-
- Args:
- batch_size: Scalar `int`-like `Tensor`. The number of examples in a
- minibatch in the data set. Note: Assumes the loss is taken as the mean
- over a minibatch. Otherwise, if the sum was taken, set this to 1.
- total_num_examples: Scalar `int`-like `Tensor`. The total number of examples
- in the data set.
- max_learning_rate: Scalar `float`-like `Tensor`. A maximum allowable
- effective coordinate-wise learning rate. The algorithm scales down any
- effective learning rate (i.e. after preconditioning) that is larger than
- this. (Default: `1`)
- preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
- decay rate of the rescaling of the preconditioner (RMSprop). (This is
- "alpha" in [1]). Should be smaller than but nearly `1` to approximate
- sampling from the posterior. (Default: `0.95`)
- burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
- gradient statistics to update the preconditioner before starting to draw
- noisy samples. (Default: `25`)
- burnin_max_learning_rate: Scalar `float`-like `Tensor`. Maximum learning
- rate to use during the burnin period.
- (Default: `1e-6`)
- use_single_learning_rate: Boolean. Indicates whether one single learning
- rate is used or coordinate-wise learning rates are used.
- (Default: `False`)
- name: Python `str` describing ops managed by this function.
- (Default: `"VariationalSGDOptimizer"`)
- variable_scope: Variable scope used for calls to `tf.get_variable`.
- If `None`, a new variable scope is created using name
- `ops.get_default_graph().unique_name(name or default_name)`.
-
- Raises:
- InvalidArgumentError: If preconditioner_decay_rate is a `Tensor` not in
- `(0,1]`.
- """
-
- def __init__(self,
- batch_size,
- total_num_examples,
- max_learning_rate=1.0,
- preconditioner_decay_rate=0.95,
- burnin=25,
- burnin_max_learning_rate=1e-6,
- use_single_learning_rate=False,
- name=None,
- variable_scope=None):
- default_name = 'VariationalSGDOptimizer'
- with ops.name_scope(name, default_name, [
- max_learning_rate, preconditioner_decay_rate, batch_size, burnin,
- burnin_max_learning_rate
- ]):
- if variable_scope is None:
- var_scope_name = ops.get_default_graph().unique_name(
- name or default_name)
- with varscope_ops.variable_scope(var_scope_name) as scope:
- self._variable_scope = scope
- else:
- self._variable_scope = variable_scope
-
- self._preconditioner_decay_rate = ops.convert_to_tensor(
- preconditioner_decay_rate, name='preconditioner_decay_rate')
- self._batch_size = ops.convert_to_tensor(batch_size, name='batch_size')
- self._total_num_examples = ops.convert_to_tensor(
- total_num_examples, name='total_num_examples')
- self._burnin = ops.convert_to_tensor(burnin, name='burnin')
- self._burnin_max_learning_rate = ops.convert_to_tensor(
- burnin_max_learning_rate, name='burnin_max_learning_rate')
- self._max_learning_rate = ops.convert_to_tensor(
- max_learning_rate, name='max_learning_rate')
- self._use_single_learning_rate = use_single_learning_rate
-
- with varscope_ops.variable_scope(self._variable_scope):
- self._counter = varscope_ops.get_variable(
- 'counter', initializer=0, trainable=False)
-
- self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._preconditioner_decay_rate,
- message='`preconditioner_decay_rate` must be non-negative'),
- check_ops.assert_less_equal(
- self._preconditioner_decay_rate,
- 1.,
- message='`preconditioner_decay_rate` must be at most 1.'),
- ], self._preconditioner_decay_rate)
-
- self._batch_size = control_flow_ops.with_dependencies([
- check_ops.assert_greater(
- self._batch_size,
- 0,
- message='`batch_size` must be greater than zero')
- ], self._batch_size)
-
- self._total_num_examples = control_flow_ops.with_dependencies([
- check_ops.assert_greater(
- self._total_num_examples,
- 0,
- message='`total_num_examples` must be greater than zero')
- ], self._total_num_examples)
-
- self._burnin = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._burnin, message='`burnin` must be non-negative'),
- check_ops.assert_integer(
- self._burnin, message='`burnin` must be an integer')
- ], self._burnin)
-
- self._burnin_max_learning_rate = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._burnin_max_learning_rate,
- message='`burnin_max_learning_rate` must be non-negative')
- ], self._burnin_max_learning_rate)
-
- self._max_learning_rate = control_flow_ops.with_dependencies([
- check_ops.assert_non_negative(
- self._max_learning_rate,
- message='`max_learning_rate` must be non-negative')
- ], self._max_learning_rate)
-
- super(VariationalSGDOptimizer, self).__init__(
- use_locking=False, name=name or default_name)
-
- def _create_slots(self, var_list):
- for v in var_list:
- init_moment = init_ops.zeros_initializer(dtype=v.dtype)
- self._get_or_make_slot_with_initializer(
- v, init_moment, v.get_shape(), v.dtype, 'first_moment', self._name)
- self._get_or_make_slot_with_initializer(
- v, init_moment, v.get_shape(), v.dtype, 'second_moment', self._name)
-
- def _prepare(self):
- self._decay_tensor = ops.convert_to_tensor(
- self._preconditioner_decay_rate, name='preconditioner_decay_rate')
- self._batch_size_tensor = ops.convert_to_tensor(
- self._batch_size, name='batch_size_tensor')
-
- super(VariationalSGDOptimizer, self)._prepare()
-
- def _get_coordinatewise_learning_rate(self, grad, var):
- # Compute the learning rate using a moving average for the diagonal of BB^T
- avg_first = self.get_slot(var, 'first_moment')
- avg_second = self.get_slot(var, 'second_moment')
- decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
- batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)
-
- # Create an estimator for the moving average of gradient mean and variance
- # via Welford's algorithm
- if isinstance(grad, ops.Tensor):
- delta = grad - avg_first
- first_moment_update = avg_first.assign_add(
- array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
- 1. - decay_tensor) * delta)
-
- with ops.control_dependencies([first_moment_update]):
- second_moment_update = avg_second.assign_add(
- math_ops.cast(self._counter < 1, var.dtype) *
- -(1. - decay_tensor) * (
- avg_second - decay_tensor * math_ops.square(delta)))
- diag_preconditioner = control_flow_ops.with_dependencies(
- [second_moment_update],
- clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
- elif isinstance(grad, ops.IndexedSlices):
- delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
- first_moment_update = state_ops.scatter_add(
- avg_first,
- grad.indices,
- array_ops.where(self._counter < 1,
- math_ops.cast(1., var.dtype),
- 1. - decay_tensor) * delta)
-
- with ops.control_dependencies([first_moment_update]):
- avg_second = state_ops.scatter_add(
- avg_second,
- grad.indices,
- math_ops.cast(self._counter < 1, var.dtype) *
- -(1. - decay_tensor) * (
- array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
- math_ops.square(delta)))
- avg_second = array_ops.gather_nd(avg_second, grad.indices)
- # TODO(b/70783772)
- diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
- else:
- raise errors.InvalidArgumentError(
- None, None, 'grad must be of type Tensor or IndexedSlices')
-
- diag_preconditioner *= batch_size
-
- if self._use_single_learning_rate:
- diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)
-
- # From Theorem 2 Corollary 1 of Mandt et al. 2017
- return 2. * batch_size / (
- math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
- diag_preconditioner)
-
- def _apply_dense(self, grad, var):
-
- max_learning_rate = array_ops.where(self._counter < self._burnin,
- self._burnin_max_learning_rate,
- self._max_learning_rate)
-
- learn_rates = clip_ops.clip_by_value(
- self._get_coordinatewise_learning_rate(grad, var), 0.0,
- math_ops.cast(max_learning_rate, var.dtype.base_dtype))
-
- newgrad = grad * learn_rates
- return training_ops.apply_gradient_descent(
- var,
- math_ops.cast(1.0, var.dtype),
- newgrad,
- use_locking=self._use_locking).op
-
- def _apply_sparse(self, grad, var):
-
- max_learning_rate = array_ops.where(self._counter < self._burnin,
- self._burnin_max_learning_rate,
- self._max_learning_rate)
-
- learn_rate = clip_ops.clip_by_value(
- self._get_coordinatewise_learning_rate(grad, var), 0.0,
- math_ops.cast(max_learning_rate, var.dtype))
- delta = grad.values * learn_rate
-
- return state_ops.scatter_sub(var, grad.indices, delta,
- use_locking=self._use_locking)
-
- def _finish(self, update_ops, name_scope):
- update_ops.append([self._counter.assign_add(1)])
- return control_flow_ops.group(*update_ops, name=name_scope)
-
- @property
- def variable_scope(self):
- """Variable scope of all calls to `tf.get_variable`."""
- return self._variable_scope
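The dense path above amounts to: track running estimates of the gradient mean and variance, derive a per-coordinate constant learning rate from Theorem 2, Corollary 1 of Mandt et al. (2017), clip it, and take an SGD step. A minimal NumPy sketch of that rule, where v_hat stands in for the running estimate kept in the 'second_moment' slot; the moment updates themselves are omitted:

import numpy as np

def constant_sgd_step(var, grad, v_hat, batch_size, total_num_examples, max_lr):
  # Optimal constant rate eps* = 2B / (N * diag(BB^T)) per coordinate, with the
  # gradient-noise proxy v_hat scaled by the batch size standing in for
  # diag(BB^T), clipped into [0, max_lr] before the SGD step.
  diag_precond = np.clip(v_hat, 1e-12, 1e12) * batch_size
  learn_rates = np.clip(
      2.0 * batch_size / (total_num_examples * diag_precond), 0.0, max_lr)
  return var - learn_rates * grad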