5 files changed, 1173 insertions(+), 5 deletions(-)
diff --git a/tensorflow/contrib/bayesflow/BUILD b/tensorflow/contrib/bayesflow/BUILD index 9f3650e8f9..a262d4aecd 100644 --- a/tensorflow/contrib/bayesflow/BUILD +++ b/tensorflow/contrib/bayesflow/BUILD @@ -19,6 +19,7 @@ py_library( srcs = ["__init__.py"] + glob(["python/ops/*.py"]), srcs_version = "PY2AND3", deps = [ + "//tensorflow/contrib/distributions:distributions_py", "//tensorflow/contrib/framework:framework_py", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -32,7 +33,6 @@ py_library( "//tensorflow/python:random_ops", "//tensorflow/python:state_ops", "//tensorflow/python:util", - "//tensorflow/python/ops/distributions", "//third_party/py/numpy", ], ) @@ -100,6 +100,25 @@ cuda_py_test( ) cuda_py_test( + name = "layers_dense_variational_test", + size = "small", + srcs = ["python/kernel_tests/layers_dense_variational_test.py"], + additional_deps = [ + ":bayesflow_py", + "//third_party/py/numpy", + "//tensorflow/contrib/distributions:distributions_py", + "//tensorflow/python/ops/distributions", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:gradients", + "//tensorflow/python:linalg_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn_ops", + ], +) + +cuda_py_test( name = "monte_carlo_test", size = "small", srcs = ["python/kernel_tests/monte_carlo_test.py"], diff --git a/tensorflow/contrib/bayesflow/__init__.py b/tensorflow/contrib/bayesflow/__init__.py index a638753f2f..95b9452b1a 100644 --- a/tensorflow/contrib/bayesflow/__init__.py +++ b/tensorflow/contrib/bayesflow/__init__.py @@ -25,6 +25,7 @@ from tensorflow.contrib.bayesflow.python.ops import csiszar_divergence from tensorflow.contrib.bayesflow.python.ops import custom_grad from tensorflow.contrib.bayesflow.python.ops import halton_sequence from tensorflow.contrib.bayesflow.python.ops import hmc +from tensorflow.contrib.bayesflow.python.ops import layers from tensorflow.contrib.bayesflow.python.ops import metropolis_hastings from tensorflow.contrib.bayesflow.python.ops import monte_carlo from tensorflow.contrib.bayesflow.python.ops import optimizers @@ -33,9 +34,19 @@ from tensorflow.contrib.bayesflow.python.ops import optimizers from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['csiszar_divergence', 'custom_grad', 'entropy', - 'metropolis_hastings', 'monte_carlo', 'halton_sequence', - 'hmc', 'optimizers', 'special_math', 'stochastic_variables', - 'variational_inference'] +_allowed_symbols = [ + 'csiszar_divergence', + 'custom_grad', + 'entropy', + 'halton_sequence', + 'hmc', + 'layers', + 'metropolis_hastings', + 'monte_carlo', + 'optimizers', + 'special_math', + 'stochastic_variables', + 'variational_inference', +] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py new file mode 100644 index 0000000000..50358fd1c2 --- /dev/null +++ b/tensorflow/contrib/bayesflow/python/kernel_tests/layers_dense_variational_test.py @@ -0,0 +1,304 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for dense Bayesian layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.bayesflow.python.ops import layers_dense_variational_impl as prob_layers_lib +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops +from tensorflow.python.ops.distributions import normal as normal_lib +from tensorflow.python.platform import test + + +class Counter(object): + """Helper class to manage incrementing a counting `int`.""" + + def __init__(self): + self._value = -1 + + @property + def value(self): + return self._value + + def __call__(self): + self._value += 1 + return self._value + + +class MockDistribution(normal_lib.Normal): + """Monitors DenseVariational calls to the underlying distribution.""" + + def __init__(self, result_sample, result_log_prob, loc=None, scale=None): + self.result_sample = result_sample + self.result_log_prob = result_log_prob + self.result_loc = loc + self.result_scale = scale + self.called_log_prob = Counter() + self.called_sample = Counter() + self.called_loc = Counter() + self.called_scale = Counter() + + def log_prob(self, *args, **kwargs): + self.called_log_prob() + return self.result_log_prob + + def sample(self, *args, **kwargs): + self.called_sample() + return self.result_sample + + @property + def loc(self): + self.called_loc() + return self.result_loc + + @property + def scale(self): + self.called_scale() + return self.result_scale + + +class MockKLDivergence(object): + """Monitors DenseVariational calls to the divergence implementation.""" + + def __init__(self, result): + self.result = result + self.args = [] + self.called = Counter() + + def __call__(self, *args, **kwargs): + self.called() + self.args.append(args) + return self.result + + +class DenseVariationalLocalReparametrization(test.TestCase): + + def testKLPenaltyKernel(self): + with self.test_session(): + dense_vi = prob_layers_lib.DenseVariational(units=2) + inputs = random_ops.random_uniform([2, 3], seed=1) + + # No keys. + loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 0) + self.assertListEqual(dense_vi.losses, loss_keys) + + _ = dense_vi(inputs) + + # Yes keys. + loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 1) + self.assertListEqual(dense_vi.losses, loss_keys) + + def testKLPenaltyBoth(self): + def _make_normal(dtype, *args): # pylint: disable=unused-argument + return normal_lib.Normal( + loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)) + with self.test_session(): + dense_vi = prob_layers_lib.DenseVariational( + units=2, + bias_posterior_fn=prob_layers_lib.default_mean_field_normal_fn(), + bias_prior_fn=_make_normal) + inputs = random_ops.random_uniform([2, 3], seed=1) + + # No keys. 
+ loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 0) + self.assertListEqual(dense_vi.losses, loss_keys) + + _ = dense_vi(inputs) + + # Yes keys. + loss_keys = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + self.assertEqual(len(loss_keys), 2) + self.assertListEqual(dense_vi.losses, loss_keys) + + def testVariationalNonLocal(self): + batch_size, in_size, out_size = 2, 3, 4 + with self.test_session() as sess: + seed = Counter() + inputs = random_ops.random_uniform([batch_size, in_size], seed=seed()) + + kernel_size = [in_size, out_size] + kernel_posterior = MockDistribution( + result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()), + result_sample=random_ops.random_uniform(kernel_size, seed=seed())) + kernel_prior = MockDistribution( + result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()), + result_sample=random_ops.random_uniform(kernel_size, seed=seed())) + kernel_divergence = MockKLDivergence( + result=random_ops.random_uniform(kernel_size, seed=seed())) + + bias_size = [out_size] + bias_posterior = MockDistribution( + result_log_prob=random_ops.random_uniform(bias_size, seed=seed()), + result_sample=random_ops.random_uniform(bias_size, seed=seed())) + bias_prior = MockDistribution( + result_log_prob=random_ops.random_uniform(bias_size, seed=seed()), + result_sample=random_ops.random_uniform(bias_size, seed=seed())) + bias_divergence = MockKLDivergence( + result=random_ops.random_uniform(bias_size, seed=seed())) + + expected_outputs = ( + math_ops.matmul(inputs, kernel_posterior.result_sample) + + bias_posterior.result_sample) + + dense_vi = prob_layers_lib.DenseVariational( + units=2, + kernel_use_local_reparameterization=False, + kernel_posterior_fn=lambda *args: kernel_posterior, + kernel_posterior_tensor_fn=lambda d: d.sample(seed=42), + kernel_prior_fn=lambda *args: kernel_prior, + kernel_divergence_fn=kernel_divergence, + bias_posterior_fn=lambda *args: bias_posterior, + bias_posterior_tensor_fn=lambda d: d.sample(seed=43), + bias_prior_fn=lambda *args: bias_prior, + bias_divergence_fn=bias_divergence) + + outputs = dense_vi(inputs) + + kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + + [ + expected_outputs_, actual_outputs_, + expected_kernel_, actual_kernel_, + expected_kernel_divergence_, actual_kernel_divergence_, + expected_bias_, actual_bias_, + expected_bias_divergence_, actual_bias_divergence_, + ] = sess.run([ + expected_outputs, outputs, + kernel_posterior.result_sample, dense_vi.kernel.posterior_tensor, + kernel_divergence.result, kl_penalty[0], + bias_posterior.result_sample, dense_vi.bias.posterior_tensor, + bias_divergence.result, kl_penalty[1], + ]) + + self.assertAllClose( + expected_kernel_, actual_kernel_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_bias_, actual_bias_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_outputs_, actual_outputs_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_kernel_divergence_, actual_kernel_divergence_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_bias_divergence_, actual_bias_divergence_, + rtol=1e-6, atol=0.) 
+ + self.assertAllEqual( + [[kernel_posterior, kernel_prior, kernel_posterior.result_sample]], + kernel_divergence.args) + + self.assertAllEqual( + [[bias_posterior, bias_prior, bias_posterior.result_sample]], + bias_divergence.args) + + def testVariationalLocal(self): + batch_size, in_size, out_size = 2, 3, 4 + with self.test_session() as sess: + seed = Counter() + inputs = random_ops.random_uniform([batch_size, in_size], seed=seed()) + + kernel_size = [in_size, out_size] + kernel_posterior = MockDistribution( + loc=random_ops.random_uniform(kernel_size, seed=seed()), + scale=random_ops.random_uniform(kernel_size, seed=seed()), + result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()), + result_sample=random_ops.random_uniform(kernel_size, seed=seed())) + kernel_prior = MockDistribution( + result_log_prob=random_ops.random_uniform(kernel_size, seed=seed()), + result_sample=random_ops.random_uniform(kernel_size, seed=seed())) + kernel_divergence = MockKLDivergence( + result=random_ops.random_uniform(kernel_size, seed=seed())) + + bias_size = [out_size] + bias_posterior = MockDistribution( + result_log_prob=random_ops.random_uniform(bias_size, seed=seed()), + result_sample=random_ops.random_uniform(bias_size, seed=seed())) + bias_prior = MockDistribution( + result_log_prob=random_ops.random_uniform(bias_size, seed=seed()), + result_sample=random_ops.random_uniform(bias_size, seed=seed())) + bias_divergence = MockKLDivergence( + result=random_ops.random_uniform(bias_size, seed=seed())) + + expected_kernel_posterior_affine = normal_lib.Normal( + loc=math_ops.matmul(inputs, kernel_posterior.result_loc), + scale=math_ops.matmul( + inputs**2., kernel_posterior.result_scale**2)**0.5) + expected_kernel_posterior_affine_tensor = ( + expected_kernel_posterior_affine.sample(seed=42)) + expected_outputs = (expected_kernel_posterior_affine_tensor + + bias_posterior.result_sample) + + dense_vi = prob_layers_lib.DenseVariational( + units=2, + kernel_use_local_reparameterization=True, + kernel_posterior_fn=lambda *args: kernel_posterior, + kernel_posterior_tensor_fn=lambda d: d.sample(seed=42), + kernel_prior_fn=lambda *args: kernel_prior, + kernel_divergence_fn=kernel_divergence, + bias_posterior_fn=lambda *args: bias_posterior, + bias_posterior_tensor_fn=lambda d: d.sample(seed=43), + bias_prior_fn=lambda *args: bias_prior, + bias_divergence_fn=bias_divergence) + + outputs = dense_vi(inputs) + + kl_penalty = ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES) + + [ + expected_outputs_, actual_outputs_, + expected_kernel_divergence_, actual_kernel_divergence_, + expected_bias_, actual_bias_, + expected_bias_divergence_, actual_bias_divergence_, + ] = sess.run([ + expected_outputs, outputs, + kernel_divergence.result, kl_penalty[0], + bias_posterior.result_sample, dense_vi.bias.posterior_tensor, + bias_divergence.result, kl_penalty[1], + ]) + + self.assertAllClose( + expected_bias_, actual_bias_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_outputs_, actual_outputs_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_kernel_divergence_, actual_kernel_divergence_, + rtol=1e-6, atol=0.) + self.assertAllClose( + expected_bias_divergence_, actual_bias_divergence_, + rtol=1e-6, atol=0.) 
+ + self.assertAllEqual( + [[kernel_posterior, kernel_prior, None]], + kernel_divergence.args) + + self.assertAllEqual( + [[bias_posterior, bias_prior, bias_posterior.result_sample]], + bias_divergence.args) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/bayesflow/python/ops/layers.py b/tensorflow/contrib/bayesflow/python/ops/layers.py new file mode 100644 index 0000000000..dcead38af8 --- /dev/null +++ b/tensorflow/contrib/bayesflow/python/ops/layers.py @@ -0,0 +1,37 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Probabilistic neural layers. + +See ${python/contrib.bayesflow.layers}. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# go/tf-wildcard-import +# pylint: disable=wildcard-import +from tensorflow.contrib.bayesflow.python.ops.layers_dense_variational_impl import * +# pylint: enable=wildcard-import +from tensorflow.python.util.all_util import remove_undocumented + +_allowed_symbols = [ + 'DenseVariational', + 'dense_variational', + 'default_loc_scale_fn', + 'default_mean_field_normal_fn', +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py new file mode 100644 index 0000000000..b05ce0ffc1 --- /dev/null +++ b/tensorflow/contrib/bayesflow/python/ops/layers_dense_variational_impl.py @@ -0,0 +1,797 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Dense Bayesian layer using KL-divergence based variational inference. 
+ +@@DenseVariational +@@dense_variational + +@@default_loc_scale_fn +@@default_mean_field_normal_fn +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.layers import base as layers_lib +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import standard_ops +from tensorflow.python.ops.distributions import kullback_leibler as kl_lib +from tensorflow.python.ops.distributions import normal as normal_lib + + +__all__ = [ + "DenseVariational", + "dense_variational", + "default_loc_scale_fn", + "default_mean_field_normal_fn", +] + + +def default_loc_scale_fn( + is_singular=False, + loc_initializer=init_ops.random_normal_initializer(stddev=0.1), + untransformed_scale_initializer=init_ops.random_normal_initializer( + mean=-3., stddev=0.1), + loc_regularizer=None, + untransformed_scale_regularizer=None, + loc_constraint=None, + untransformed_scale_constraint=None): + """Makes closure which creates `loc`, `scale` params from `tf.get_variable`. + + This function produces a closure which produces `loc`, `scale` using + `tf.get_variable`. The closure accepts the following arguments: + + dtype: Type of parameter's event. + shape: Python `list`-like representing the parameter's event shape. + name: Python `str` name prepended to any created (or existing) + `tf.Variable`s. + trainable: Python `bool` indicating all created `tf.Variable`s should be + added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. + add_variable_fn: `tf.get_variable`-like `callable` used to create (or + access existing) `tf.Variable`s. + + Args: + is_singular: Python `bool` indicating if `scale is None`. Default: `False`. + loc_initializer: Initializer function for the `loc` parameters. + The default is `tf.random_normal_initializer(mean=0., stddev=0.1)`. + untransformed_scale_initializer: Initializer function for the `scale` + parameters. Default value: `tf.random_normal_initializer(mean=-3., + stddev=0.1)`. This implies the softplus transformed result has mean + approximately `0.05` and std. deviation approximately `0.005`. + loc_regularizer: Regularizer function for the `loc` parameters. + The default (`None`) is to use the `tf.get_variable` default. + untransformed_scale_regularizer: Regularizer function for the `scale` + parameters. The default (`None`) is to use the `tf.get_variable` default. + loc_constraint: An optional projection function to be applied to the + loc after being updated by an `Optimizer`. The function must take as input + the unprojected variable and must return the projected variable (which + must have the same shape). Constraints are not safe to use when doing + asynchronous distributed training. + The default (`None`) is to use the `tf.get_variable` default. + untransformed_scale_constraint: An optional projection function to be + applied to the `scale` parameters after being updated by an `Optimizer` + (e.g. used to implement norm constraints or value constraints). The + function must take as input the unprojected variable and must return the + projected variable (which must have the same shape). 
Constraints are not + safe to use when doing asynchronous distributed training. The default + (`None`) is to use the `tf.get_variable` default. + + Returns: + default_loc_scale_fn: Python `callable` which instantiates `loc`, `scale` + parameters from args: `dtype, shape, name, trainable, add_variable_fn`. + """ + def _fn(dtype, shape, name, trainable, add_variable_fn): + """Creates `loc`, `scale` parameters.""" + loc = add_variable_fn( + name=name + "_loc", + shape=shape, + initializer=loc_initializer, + regularizer=loc_regularizer, + constraint=loc_constraint, + dtype=dtype, + trainable=trainable) + if is_singular: + return loc, None + untransformed_scale = add_variable_fn( + name=name + "_untransformed_scale", + shape=shape, + initializer=untransformed_scale_initializer, + regularizer=untransformed_scale_regularizer, + constraint=untransformed_scale_constraint, + dtype=dtype, + trainable=trainable) + scale = (np.finfo(dtype.as_numpy_dtype).eps + + nn_ops.softplus(untransformed_scale)) + return loc, scale + return _fn + + +def default_mean_field_normal_fn( + is_singular=False, + loc_initializer=None, + untransformed_scale_initializer=None, + loc_regularizer=None, + untransformed_scale_regularizer=None, + loc_constraint=None, + untransformed_scale_constraint=None): + """Creates a function to build Normal distributions with trainable params. + + This function produces a closure which produces `tf.distributions.Normal` + parameterized by a loc` and `scale` each created using `tf.get_variable`. The + produced closure accepts the following arguments: + + name: Python `str` name prepended to any created (or existing) + `tf.Variable`s. + shape: Python `list`-like representing the parameter's event shape. + dtype: Type of parameter's event. + trainable: Python `bool` indicating all created `tf.Variable`s should be + added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. + add_variable_fn: `tf.get_variable`-like `callable` used to create (or + access existing) `tf.Variable`s. + + Args: + is_singular: Python `bool` if `True`, forces the special case limit of + `scale->0`, i.e., a `Deterministic` distribution. + loc_initializer: Initializer function for the `loc` parameters. + If `None` (default), values are initialized using the default + initializer used by `tf.get_variable`. + untransformed_scale_initializer: Initializer function for the `scale` + parameters. If `None` (default), values are initialized using the default + initializer used by `tf.get_variable`. + loc_regularizer: Regularizer function for the `loc` parameters. + untransformed_scale_regularizer: Regularizer function for the `scale` + parameters. + loc_constraint: An optional projection function to be applied to the + loc after being updated by an `Optimizer`. The function must take as input + the unprojected variable and must return the projected variable (which + must have the same shape). Constraints are not safe to use when doing + asynchronous distributed training. + untransformed_scale_constraint: An optional projection function to be + applied to the `scale` parameters after being updated by an `Optimizer` + (e.g. used to implement norm constraints or value constraints). The + function must take as input the unprojected variable and must return the + projected variable (which must have the same shape). Constraints are not + safe to use when doing asynchronous distributed training. 
+ + Returns: + make_normal_fn: Python `callable` which creates a `tf.distributions.Normal` + from args: `dtype, shape, name, trainable, add_variable_fn`. + """ + loc_scale_fn_ = default_loc_scale_fn( + is_singular, + loc_initializer, + untransformed_scale_initializer, + loc_regularizer, + untransformed_scale_regularizer, + loc_constraint, + untransformed_scale_constraint) + def _fn(dtype, shape, name, trainable, add_variable_fn): + """Creates a batch of `Deterministic` or `Normal` distributions.""" + loc, scale = loc_scale_fn_(dtype, shape, name, trainable, add_variable_fn) + if scale is None: + return deterministic_lib.Deterministic(loc=loc) + return normal_lib.Normal(loc=loc, scale=scale) + return _fn + + class DenseVariational(layers_lib.Layer): + """Densely-connected variational layer. + + This layer implements the Bayesian variational inference analogue to: + `outputs = activation(matmul(inputs, kernel) + bias)` + by assuming the `kernel` and/or the `bias` are random variables. + + The layer implements a stochastic dense calculation by making a Monte Carlo + approximation of a [variational Bayesian method based on KL divergence]( + https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e., + + ```none + -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw + = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw + <= E_q(W|x)[-log p(y,W|x) + log q(W|x)] # Jensen's + = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)] + ~= m**-1 sum{ -log p(y|x,w[j]) : w[j] ~ q(W|x), j=1..m } + + KL[q(W|x), p(W)] + ``` + + where `W` denotes the (independent) `kernel` and `bias` random variables, `w` + is a random variate or outcome of `W`, `y` is the label, `x` is the evidence, + and `~=` denotes an approximation which becomes exact as `m->inf`. The above + bound is sometimes referred to as the negative Evidence Lower BOund or + negative [ELBO](https://arxiv.org/abs/1601.00670). In the context of a DNN, this + layer is appropriate to use when the final loss is a negative log-likelihood. + + The Monte Carlo sum portion is used for the feed-forward calculation of the + DNN. The KL divergence portion can be added to the final loss via: + `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`. + + The arguments permit separate specification of the surrogate posterior + (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias` + random variables (which together comprise `W`). + + Args: + units: Integer or Long, dimensionality of the output space. + activation: Activation function (`callable`). Set it to None to maintain a + linear activation. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + kernel_use_local_reparameterization: Python `bool` indicating whether + `kernel` calculation should employ the Local Reparameterization Trick. + When `True`, `kernel_posterior_fn` must create an instance of + `tf.distributions.Normal`. + kernel_posterior_fn: Python `callable` which creates + `tf.distributions.Distribution` instance representing the surrogate + posterior of the `kernel` parameter. Default value: + `default_mean_field_normal_fn()`. + kernel_posterior_tensor_fn: Python `callable` which takes a + `tf.distributions.Distribution` instance and returns a representative + value. Default value: `lambda d: d.sample()`. + kernel_prior_fn: Python `callable` which creates `tf.distributions` + instance.
See `default_mean_field_normal_fn` docstring for required + parameter signature. + Default value: `tf.distributions.Normal(loc=0., scale=1.)`. + kernel_divergence_fn: Python `callable` which takes the surrogate posterior + distribution, prior distribution and random variate sample(s) from the + surrogate posterior and computes or approximates the KL divergence. The + distributions are `tf.distributions.Distribution`-like instances and the + sample is a `Tensor`. + bias_posterior_fn: Python `callable` which creates + `tf.distributions.Distribution` instance representing the surrogate + posterior of the `bias` parameter. Default value: + `default_mean_field_normal_fn(is_singular=True)` (which creates an + instance of `tf.distributions.Deterministic`). + bias_posterior_tensor_fn: Python `callable` which takes a + `tf.distributions.Distribution` instance and returns a representative + value. Default value: `lambda d: d.sample()`. + bias_prior_fn: Python `callable` which creates `tf.distributions` instance. + See `default_mean_field_normal_fn` docstring for required parameter + signature. Default value: `None` (no prior, no variational inference). + bias_divergence_fn: Python `callable` which takes the surrogate posterior + distribution, prior distribution and random variate sample(s) from the + surrogate posterior and computes or approximates the KL divergence. The + distributions are `tf.distributions.Distribution`-like instances and the + sample is a `Tensor`. + name: Python `str`, the name of the layer. Layers with the same name will + share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in + such cases. + reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous + layer by the same name. + + Properties: + units: Python integer, dimensionality of the output space. + activation: Activation function (`callable`). + activity_regularizer: Regularizer function for the output. + kernel_use_local_reparameterization: Python `bool` indicating whether + `kernel` calculation should employ the Local Reparameterization Trick. + kernel: `VariationalKernelParameter` instance containing all `kernel` + related properties and `callable`s. + bias: `VariationalParameter` instance containing all `bias` + related properties and `callable`s.
+ """ + + def __init__( + self, + units, + activation=None, + activity_regularizer=None, + trainable=True, + kernel_use_local_reparameterization=True, + kernel_posterior_fn=default_mean_field_normal_fn(), + kernel_posterior_tensor_fn=lambda d: d.sample(), + kernel_prior_fn=lambda dtype, *args: normal_lib.Normal( # pylint: disable=g-long-lambda + loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)), + kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), + bias_posterior_fn=default_mean_field_normal_fn(is_singular=True), + bias_posterior_tensor_fn=lambda d: d.sample(), + bias_prior_fn=None, + bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), + name=None, + **kwargs): + super(DenseVariational, self).__init__( + trainable=trainable, + name=name, + activity_regularizer=activity_regularizer, + **kwargs) + self._units = units + self._activation = activation + self._input_spec = layers_lib.InputSpec(min_ndim=2) + self._kernel_use_local_reparameterization = ( + kernel_use_local_reparameterization) + self._kernel = VariationalKernelParameter( + kernel_posterior_fn, + kernel_posterior_tensor_fn, + kernel_prior_fn, + kernel_divergence_fn) + self._bias = VariationalParameter( + bias_posterior_fn, + bias_posterior_tensor_fn, + bias_prior_fn, + bias_divergence_fn) + + @property + def units(self): + return self._units + + @property + def activation(self): + return self._activation + + @property + def input_spec(self): + return self._input_spec + + @input_spec.setter + def input_spec(self, value): + self._input_spec = value + + @property + def kernel_use_local_reparameterization(self): + return self._kernel_use_local_reparameterization + + @property + def kernel(self): + return self._kernel + + @property + def bias(self): + return self._bias + + def build(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape) + in_size = input_shape.with_rank_at_least(2)[-1].value + if in_size is None: + raise ValueError("The last dimension of the inputs to `Dense` " + "should be defined. Found `None`.") + self._input_spec = layers_lib.InputSpec(min_ndim=2, axes={-1: in_size}) + dtype = dtypes.as_dtype(self.dtype) + + # Must have a posterior kernel. 
+ self.kernel.posterior = self.kernel.posterior_fn( + dtype, [in_size, self.units], "kernel_posterior", + self.trainable, self.add_variable) + + if self.kernel.prior_fn is None: + self.kernel.prior = None + else: + self.kernel.prior = self.kernel.prior_fn( + dtype, [in_size, self.units], "kernel_prior", + self.trainable, self.add_variable) + self._built_kernel_divergence = False + + if self.bias.posterior_fn is None: + self.bias.posterior = None + else: + self.bias.posterior = self.bias.posterior_fn( + dtype, [self.units], "bias_posterior", + self.trainable, self.add_variable) + + if self.bias.prior_fn is None: + self.bias.prior = None + else: + self.bias.prior = self.bias.prior_fn( + dtype, [self.units], "bias_prior", + self.trainable, self.add_variable) + self._built_bias_divergence = False + + self.built = True + + def call(self, inputs): + inputs = ops.convert_to_tensor(inputs, dtype=self.dtype) + + outputs = self._apply_variational_kernel(inputs) + outputs = self._apply_variational_bias(outputs) + if self.activation is not None: + outputs = self.activation(outputs) # pylint: disable=not-callable + if not self._built_kernel_divergence: + self._apply_divergence(self.kernel, name="divergence_kernel") + self._built_kernel_divergence = True + if not self._built_bias_divergence: + self._apply_divergence(self.bias, name="divergence_bias") + self._built_bias_divergence = True + return outputs + + def _apply_variational_kernel(self, inputs): + if not self.kernel_use_local_reparameterization: + self.kernel.posterior_tensor = self.kernel.posterior_tensor_fn( + self.kernel.posterior) + self.kernel.posterior_affine = None + self.kernel.posterior_affine_tensor = None + return self._matmul(inputs, self.kernel.posterior_tensor) + if not isinstance(self.kernel.posterior, normal_lib.Normal): + raise TypeError("`kernel_use_local_reparameterization=True` requires " + "that `kernel_posterior_fn` produce an instance of " + "`tf.distributions.Normal` (saw: \"{}\").".format( + type(self.kernel.posterior).__name__)) + self.kernel.posterior_affine = normal_lib.Normal( + loc=self._matmul(inputs, self.kernel.posterior.loc), + scale=standard_ops.sqrt(self._matmul( + standard_ops.square(inputs), + standard_ops.square(self.kernel.posterior.scale)))) + self.kernel.posterior_affine_tensor = ( + self.kernel.posterior_tensor_fn(self.kernel.posterior_affine)) + self.kernel.posterior_tensor = None + return self.kernel.posterior_affine_tensor + + def _apply_variational_bias(self, inputs): + if self.bias.posterior is None: + self.bias.posterior_tensor = None + return inputs + self.bias.posterior_tensor = self.bias.posterior_tensor_fn( + self.bias.posterior) + return nn.bias_add(inputs, self.bias.posterior_tensor) + + def _apply_divergence(self, param, name): + if (param.divergence_fn is None or + param.posterior is None or + param.prior is None): + param.divergence = None + return + param.divergence = standard_ops.identity( + param.divergence_fn( + param.posterior, param.prior, param.posterior_tensor), + name=name) + self.add_loss(param.divergence) + + def _matmul(self, inputs, kernel): + if inputs.shape.ndims <= 2: + return standard_ops.matmul(inputs, kernel) + # To handle broadcasting, we must use `tensordot`.
+ return standard_ops.tensordot(inputs, kernel, axes=[[-1], [0]]) + + def _compute_output_shape(self, input_shape): + input_shape = tensor_shape.TensorShape(input_shape).with_rank_at_least(2) + if input_shape[-1].value is None: + raise ValueError( + "The innermost dimension of input_shape must be defined, " + "but saw: {}".format(input_shape)) + return input_shape[:-1].concatenate(self.units) + + def dense_variational( + inputs, + units, + activation=None, + activity_regularizer=None, + trainable=True, + kernel_use_local_reparameterization=True, + kernel_posterior_fn=default_mean_field_normal_fn(), + kernel_posterior_tensor_fn=lambda d: d.sample(), + kernel_prior_fn=lambda dtype, *args: normal_lib.Normal( # pylint: disable=g-long-lambda + loc=dtype.as_numpy_dtype(0.), scale=dtype.as_numpy_dtype(1.)), + kernel_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), + bias_posterior_fn=default_mean_field_normal_fn(is_singular=True), + bias_posterior_tensor_fn=lambda d: d.sample(), + bias_prior_fn=None, + bias_divergence_fn=lambda q, p, ignore: kl_lib.kl_divergence(q, p), + name=None, + reuse=None): + """Densely-connected variational layer. + + This layer implements the Bayesian variational inference analogue to: + `outputs = activation(matmul(inputs, kernel) + bias)` + by assuming the `kernel` and/or the `bias` are random variables. + + The layer implements a stochastic dense calculation by making a Monte Carlo + approximation of a [variational Bayesian method based on KL divergence]( + https://en.wikipedia.org/wiki/Variational_Bayesian_methods), i.e., + + ```none + -log p(y|x) = -log int_{R**d} p(y|x,w) p(w) dw + = -log int_{R**d} p(y,w|x) q(w|x) / q(w|x) dw + <= E_q(W|x)[-log p(y,W|x) + log q(W|x)] # Jensen's + = E_q(W|x)[-log p(y|x,W)] + KL[q(W|x), p(W)] + ~= m**-1 sum{ -log p(y|x,w[j]) : w[j] ~ q(W|x), j=1..m } + + KL[q(W|x), p(W)] + ``` + + where `W` denotes the (independent) `kernel` and `bias` random variables, `w` + is a random variate or outcome of `W`, `y` is the label, `x` is the evidence, + and `~=` denotes an approximation which becomes exact as `m->inf`. The above + bound is sometimes referred to as the negative Evidence Lower BOund or + negative [ELBO](https://arxiv.org/abs/1601.00670). In the context of a DNN, this + layer is appropriate to use when the final loss is a negative log-likelihood. + + The Monte Carlo sum portion is used for the feed-forward calculation of the + DNN. The KL divergence portion can be added to the final loss via: + `loss += sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))`. + + The arguments permit separate specification of the surrogate posterior + (`q(W|x)`), prior (`p(W)`), and divergence for both the `kernel` and `bias` + random variables (which together comprise `W`). + + Args: + inputs: Tensor input. + units: Integer or Long, dimensionality of the output space. + activation: Activation function (`callable`). Set it to None to maintain a + linear activation. + activity_regularizer: Regularizer function for the output. + trainable: Boolean, if `True` also add variables to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). + kernel_use_local_reparameterization: Python `bool` indicating whether + `kernel` calculation should employ the Local Reparameterization Trick. + When `True`, `kernel_posterior_fn` must create an instance of + `tf.distributions.Normal`.
+ kernel_posterior_fn: Python `callable` which creates + `tf.distributions.Distribution` instance representing the surrogate + posterior of the `kernel` parameter. Default value: + `default_mean_field_normal_fn()`. + kernel_posterior_tensor_fn: Python `callable` which takes a + `tf.distributions.Distribution` instance and returns a representative + value. Default value: `lambda d: d.sample()`. + kernel_prior_fn: Python `callable` which creates `tf.distributions` + instance. See `default_mean_field_normal_fn` docstring for required + parameter signature. + Default value: `tf.distributions.Normal(loc=0., scale=1.)`. + kernel_divergence_fn: Python `callable` which takes the surrogate posterior + distribution, prior distribution and random variate sample(s) from the + surrogate posterior and computes or approximates the KL divergence. The + distributions are `tf.distributions.Distribution`-like instances and the + sample is a `Tensor`. + bias_posterior_fn: Python `callable` which creates + `tf.distributions.Distribution` instance representing the surrogate + posterior of the `bias` parameter. Default value: + `default_mean_field_normal_fn(is_singular=True)` (which creates an + instance of `tf.distributions.Deterministic`). + bias_posterior_tensor_fn: Python `callable` which takes a + `tf.distributions.Distribution` instance and returns a representative + value. Default value: `lambda d: d.sample()`. + bias_prior_fn: Python `callable` which creates `tf.distributions` instance. + See `default_mean_field_normal_fn` docstring for required parameter + signature. Default value: `None` (no prior, no variational inference). + bias_divergence_fn: Python `callable` which takes the surrogate posterior + distribution, prior distribution and random variate sample(s) from the + surrogate posterior and computes or approximates the KL divergence. The + distributions are `tf.distributions.Distribution`-like instances and the + sample is a `Tensor`. + name: Python `str`, the name of the layer. Layers with the same name will + share `tf.Variable`s, but to avoid mistakes we require `reuse=True` in + such cases. + reuse: Python `bool`, whether to reuse the `tf.Variable`s of a previous + layer by the same name. + + Returns: + output: `Tensor` representing the affine-transformed input under a random + draw from the surrogate posterior distribution. + """ + layer = DenseVariational( + units, + activation=activation, + activity_regularizer=activity_regularizer, + trainable=trainable, + kernel_use_local_reparameterization=( + kernel_use_local_reparameterization), + kernel_posterior_fn=kernel_posterior_fn, + kernel_posterior_tensor_fn=kernel_posterior_tensor_fn, + kernel_prior_fn=kernel_prior_fn, + kernel_divergence_fn=kernel_divergence_fn, + bias_posterior_fn=bias_posterior_fn, + bias_posterior_tensor_fn=bias_posterior_tensor_fn, + bias_prior_fn=bias_prior_fn, + bias_divergence_fn=bias_divergence_fn, + name=name, + dtype=inputs.dtype.base_dtype, + _scope=name, + _reuse=reuse) + return layer.apply(inputs) + + class NotSet(object): + """Helper to track whether a `VariationalParameter` value has been set.""" + pass + + class VariationalParameter(object): + """Struct-like container of variational parameter properties. + + A `VariationalParameter` is initialized with Python `callable`s which set the + value of correspondingly named members. Corresponding values have "set once" + semantics, i.e., once set to any value they are immutable.
+ """ + + def __init__( + self, + posterior_fn, + posterior_tensor_fn, + prior_fn, + divergence_fn): + """Creates the `VariationalParameter` struct-like object. + + Args: + posterior_fn: Python `callable` which creates a + `tf.distribution.Distribution` like object representing the posterior + distribution. See `VariationalParameter.posterior_fn` for `callable`'s + required parameters. + posterior_tensor_fn: Python `callable` which computes a `Tensor` + which represents the `posterior`. + prior_fn: Python `callable` which creates a + `tf.distribution.Distribution` like object representing the prior + distribution. See `VariationalParameter.prior_fn` for `callable`'s + required parameters. + divergence_fn: Python `callable` which computes the KL divergence from + `posterior` to `prior`. See `VariationalParameter.divergence_fn` for + required `callable`'s parameters. + """ + self._posterior_fn = posterior_fn + self._posterior = NotSet() + self._posterior_tensor_fn = posterior_tensor_fn + self._posterior_tensor = NotSet() + self._prior_fn = prior_fn + self._prior = NotSet() + self._divergence_fn = divergence_fn + self._divergence = NotSet() + self._init_helper() + + @property + def posterior_fn(self): + """`callable` which creates `tf.distributions.Distribution`-like posterior. + + The `callable` must accept the following parameters: + name: Python `str` name prepended to any created (or existing) + `tf.Variable`s. + shape: Python `list`-like representing the parameter's event shape. + dtype: Type of parameter's event. + trainable: Python `bool` indicating all created `tf.Variable`s should be + added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. + add_variable_fn: `tf.get_variable`-like `callable` used to create (or + access existing) `tf.Variable`s. + + Returns: + posterior_fn: The Python `callable` specified in `__init__`. + """ + return self._posterior_fn + + @property + def posterior(self): + """`tf.distributions.Distribution`-like instance representing posterior.""" + return self._posterior + + @posterior.setter + def posterior(self, value): + """One-time setter of the `posterior` distribution.""" + if not isinstance(self._posterior, NotSet): + raise ValueError("Cannot override already set attribute.") + self._posterior = value + + @property + def posterior_tensor_fn(self): + """Creates `Tensor` representing the `posterior` distribution. + + The `callable` must accept the following parameters: + posterior: `tf.distributions.Distribution`-like instance. + + Returns: + posterior_tensor_fn: The Python `callable` specified in + `__init__`. + """ + return self._posterior_tensor_fn + + @property + def posterior_tensor(self): + """`Tensor` representing the `posterior` distribution.""" + return self._posterior_tensor + + @posterior_tensor.setter + def posterior_tensor(self, value): + """One-time setter of the `posterior_tensor`.""" + if not isinstance(self._posterior_tensor, NotSet): + raise ValueError("Cannot override already set attribute.") + self._posterior_tensor = value + + @property + def prior_fn(self): + """`callable` which creates `tf.distributions.Distribution`-like prior. + + The `callable` must accept the following parameters: + name: Python `str` name prepended to any created (or existing) + `tf.Variable`s. + shape: Python `list`-like representing the parameter's event shape. + dtype: Type of parameter's event. + trainable: Python `bool` indicating all created `tf.Variable`s should be + added to the graph collection `GraphKeys.TRAINABLE_VARIABLES`. 
+ add_variable_fn: `tf.get_variable`-like `callable` used to create (or + access existing) `tf.Variable`s. + + Returns: + prior_fn: The Python `callable` specified in `__init__`. + """ + return self._prior_fn + + @property + def prior(self): + """`tf.distributions.Distribution`-like instance representing prior.""" + return self._prior + + @prior.setter + def prior(self, value): + """One-time setter of the `prior` distribution.""" + if not isinstance(self._prior, NotSet): + raise ValueError("Cannot override already set attribute.") + self._prior = value + + @property + def divergence_fn(self): + """`callable` which computes KL-divergence `Tensor` from posterior to prior. + + The `callable` must accept the following parameters: + posterior: `tf.distributions.Distribution`-like instance. + prior: `tf.distributions.Distribution`-like instance. + posterior_tensor: `Tensor` representing value of posterior. + + Returns: + divergence_fn: The Python `callable` specified in `__init__`. + """ + return self._divergence_fn + + @property + def divergence(self): + """`Tensor` representing KL-divergence from posterior to prior.""" + return self._divergence + + @divergence.setter + def divergence(self, value): + """One-time setter of the `divergence`.""" + if not isinstance(self._divergence, NotSet): + raise ValueError("Cannot override already set attribute.") + self._divergence = value + + def _init_helper(self): + pass + + class VariationalKernelParameter(VariationalParameter): + """Struct-like container of variational kernel properties. + + A `VariationalKernelParameter` is initialized with Python `callable`s which + set the value of correspondingly named members. Corresponding values have "set + once" semantics, i.e., once set to any value they are immutable. + """ + + @property + def posterior_affine(self): + """`tf.distributions.Distribution` affine transformed posterior.""" + return self._posterior_affine + + @posterior_affine.setter + def posterior_affine(self, value): + """One-time setter of `posterior_affine`.""" + if not isinstance(self._posterior_affine, NotSet): + raise ValueError("Cannot override already set attribute.") + self._posterior_affine = value + + @property + def posterior_affine_tensor(self): + """`Tensor` representing the `posterior_affine` distribution.""" + return self._posterior_affine_tensor + + @posterior_affine_tensor.setter + def posterior_affine_tensor(self, value): + """One-time setter of the `posterior_affine_tensor`.""" + if not isinstance(self._posterior_affine_tensor, NotSet): + raise ValueError("Cannot override already set attribute.") + self._posterior_affine_tensor = value + + def _init_helper(self): + self._posterior_affine = NotSet() + self._posterior_affine_tensor = NotSet()
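
A minimal usage sketch of the API this commit adds, assuming TensorFlow 1.x with the change applied. The `features`/`labels` placeholders, layer sizes, and the unit-variance Gaussian likelihood (squared error, up to an additive constant) are illustrative assumptions rather than part of the commit; the KL penalty is recovered from `REGULARIZATION_LOSSES` as the `DenseVariational` docstring prescribes.

import tensorflow as tf
from tensorflow.contrib.bayesflow import layers as bayes_layers

# Hypothetical input pipeline; any [batch, features] float input works.
features = tf.placeholder(tf.float32, shape=[None, 10])
labels = tf.placeholder(tf.float32, shape=[None, 1])

# Two variational dense layers. The default
# kernel_use_local_reparameterization=True requires a Normal kernel
# posterior, which default_mean_field_normal_fn() provides.
net = bayes_layers.dense_variational(
    features, units=32, activation=tf.nn.relu, name="dense1")
predictions = bayes_layers.dense_variational(net, units=1, name="dense2")

# Monte Carlo estimate of E_q[-log p(y|x,W)]; squared error corresponds
# to a unit-variance Gaussian likelihood up to an additive constant.
neg_log_likelihood = tf.reduce_mean(
    tf.squared_difference(predictions, labels))

# Each layer registered an elementwise KL[q(W|x), p(W)] tensor shaped
# like its kernel, so reduce each to a scalar before summing.
kl_penalty = tf.add_n(
    [tf.reduce_sum(kl)
     for kl in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)])

# Negative ELBO: negative log-likelihood plus the KL penalty.
loss = neg_log_likelihood + kl_penalty
train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)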