tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369

# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Monte Carlo integration and helpers.

See the @{$python/contrib.bayesflow.monte_carlo} guide.

@@expectation
@@expectation_importance_sampler
@@expectation_importance_sampler_logspace
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn

__all__ = [
    'expectation',
    'expectation_importance_sampler',
    'expectation_importance_sampler_logspace',
]


def expectation_importance_sampler(f,
                                   log_p,
                                   sampling_dist_q,
                                   z=None,
                                   n=None,
                                   seed=None,
                                   name='expectation_importance_sampler'):
  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).

  With \\(p(z) := exp^{log_p(z)}\\), this `Op` returns

  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
  \\(=       E_p[f(Z)]\\)

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  If `f >= 0`, it is up to 2x more efficient to exponentiate the result of
  `expectation_importance_sampler_logspace` applied to `Log[f]`.

  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`

  Args:
    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with shape
      broadcastable to `q.batch_shape`.
      For example, `f` works "just like" `q.log_prob`.
    log_p:  Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `sampling_dist_q.log_prob`.
    sampling_dist_q:  The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z:  `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
    seed:  Python integer to seed the random number generator.
    name:  A name to give this `Op`.

  Returns:
    The importance sampling estimate.  `Tensor` with `shape` equal
      to batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)

    log_p_z = log_p(z)
    q_log_prob_z = q.log_prob(z)

    def _importance_sampler_positive_f(log_f_z):
      # Same as expectation_importance_sampler_logspace, but using Tensors
      # rather than samples and functions.  Allows us to sample once.
      log_values = log_f_z + log_p_z - q_log_prob_z
      return _logspace_mean(log_values)

    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
    # Without incurring bias, 1 is added to each to prevent zeros in logspace.
    # The logarithm is approximately linear around 1 + epsilon, so this is good
    # for small values of 'z' as well.
    f_z = f(z)
    log_f_plus_z = math_ops.log(nn.relu(f_z) + 1.)
    log_f_minus_z = math_ops.log(nn.relu(-1. * f_z) + 1.)

    log_f_plus_integral = _importance_sampler_positive_f(log_f_plus_z)
    log_f_minus_integral = _importance_sampler_positive_f(log_f_minus_z)

  return math_ops.exp(log_f_plus_integral) - math_ops.exp(log_f_minus_integral)


def expectation_importance_sampler_logspace(
    log_f,
    log_p,
    sampling_dist_q,
    z=None,
    n=None,
    seed=None,
    name='expectation_importance_sampler_logspace'):
  r"""Importance sampling with a positive function, in log-space.

  With \\(p(z) := exp^{log_p(z)}\\), and \\(f(z) = exp{log_f(z)}\\),
  this `Op` returns

  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
  \\(=       Log[E_p[f(Z)]]\\)

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  In contrast to `expectation_importance_sampler`, this `Op` returns values in
  log-space.


  User supplies either `Tensor` of samples `z`, or number of samples to draw `n`

  Args:
    log_f: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_f` works "just like" `sampling_dist_q.log_prob`.
    log_p:  Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `q.log_prob`.
    sampling_dist_q:  The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z:  `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n:  Integer `Tensor`.  Number of samples to generate if `z` is not provided.
    seed:  Python integer to seed the random number generator.
    name:  A name to give this `Op`.

  Returns:
    Logarithm of the importance sampling estimate.  `Tensor` with `shape` equal
      to batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)
    log_values = log_f(z) + log_p(z) - q.log_prob(z)
    return _logspace_mean(log_values)


def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values:  `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
      `Log[Mean[values]]`.
  """
  # center = Max[Log[values]],  with stop-gradient
  # The center hopefully keep the exponentiated term small.  It is canceled
  # from the final result, so putting stop gradient on it will not change the
  # final result.  We put stop gradient on to eliminate unnecessary computation.
  center = array_ops.stop_gradient(_sample_max(log_values))

  # centered_values = exp{Log[values] - E[Log[values]]}
  centered_values = math_ops.exp(log_values - center)

  # log_mean_of_values = Log[ E[centered_values] ] + center
  #                    = Log[ E[exp{log_values - E[log_values]}] ] + center
  #                    = Log[E[values]] - E[log_values] + center
  #                    = Log[E[values]]
  log_mean_of_values = math_ops.log(_sample_mean(centered_values)) + center

  return log_mean_of_values


def expectation(f, samples, log_prob=None, use_reparametrization=True,
                axis=0, keep_dims=False, name=None):
  r"""Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).

  This function computes the Monte-Carlo approximation of an expectation, i.e.,

  \\(E_p[f(X)] \approx= m^{-1} sum_i^m f(x_j),  x_j\  ~iid\ p(X)\\)

  where:

  - `x_j = samples[j, ...]`,
  - `log(p(samples)) = log_prob(samples)` and
  - `m = prod(shape(samples)[axis])`.

  Tricks: Reparameterization and Score-Gradient

  When p is "reparameterized", i.e., a diffeomorphic transformation of a
  parameterless distribution (e.g.,
  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
  expectation, i.e.,
  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n } where
  S_n = Avg{\\(s_i\\)}` and `\\(s_i = f(x_i), x_i ~ p\\).

  However, if p is not reparameterized, TensorFlow's gradient will be incorrect
  since the chain-rule stops at samples of non-reparameterized distributions.
  (The non-differentiated result, `approx_expectation`, is the same regardless
  of `use_reparametrization`.) In this circumstance using the Score-Gradient
  trick results in an unbiased gradient, i.e.,

  ```none
  grad[ E_p[f(X)] ]
  = grad[ int dx p(x) f(x) ]
  = int dx grad[ p(x) f(x) ]
  = int dx [ p'(x) f(x) + p(x) f'(x) ]
  = int dx p(x) [p'(x) / p(x) f(x) + f'(x) ]
  = int dx p(x) grad[ f(x) p(x) / stop_grad[p(x)] ]
  = E_p[ grad[ f(x) p(x) / stop_grad[p(x)] ] ]
  ```

  Unless p is not reparametrized, it is usually preferable to
  `use_reparametrization = True`.

  Warning: users are responsible for verifying `p` is a "reparameterized"
  distribution.

  Example Use:

  ```python
  bf = tf.contrib.bayesflow
  ds = tf.contrib.distributions

  # Monte-Carlo approximation of a reparameterized distribution, e.g., Normal.

  num_draws = int(1e5)
  p = ds.Normal(loc=0., scale=1.)
  q = ds.Normal(loc=1., scale=2.)
  exact_kl_normal_normal = ds.kl_divergence(p, q)
  # ==> 0.44314718
  approx_kl_normal_normal = bf.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == distribution.FULLY_REPARAMETERIZED))
  # ==> 0.44632751
  # Relative Error: <1%

  # Monte-Carlo approximation of non-reparameterized distribution, e.g., Gamma.

  num_draws = int(1e5)
  p = ds.Gamma(concentration=1., rate=1.)
  q = ds.Gamma(concentration=2., rate=3.)
  exact_kl_gamma_gamma = ds.kl_divergence(p, q)
  # ==> 0.37999129
  approx_kl_gamma_gamma = bf.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == distribution.FULLY_REPARAMETERIZED))
  # ==> 0.37696719
  # Relative Error: <1%

  # For comparing the gradients, see `monte_carlo_test.py`.
  ```

  Note: The above example is for illustration only. To compute approximate
  KL-divergence, the following is preferred:

  ```python
  approx_kl_p_q = bf.monte_carlo_csiszar_f_divergence(
      f=bf.kl_reverse,
      p_log_prob=q.log_prob,
      q=p,
      num_draws=num_draws)
  ```

  Args:
    f: Python callable which can return `f(samples)`.
    samples: `Tensor` of samples used to form the Monte-Carlo approximation of
      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
      dimensions.
    log_prob: Python callable which can return `log_prob(samples)`. Must
      correspond to the natural-logarithm of the pdf/pmf of each sample. Only
      required/used if `use_reparametrization=False`.
      Default value: `None`.
    use_reparametrization: Python `bool` indicating that the approximation
      should use the fact that the gradient of samples is unbiased. Whether
      `True` or `False`, this arg only affects the gradient of the resulting
      `approx_expectation`.
      Default value: `True`.
    axis: The dimensions to average. If `None`, averages all
      dimensions.
      Default value: `0` (the left-most dimension).
    keep_dims: If True, retains averaged dimensions using size `1`.
      Default value: `False`.
    name: A `name_scope` for operations created by this function.
      Default value: `None` (which implies "expectation").

  Returns:
    approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
      of \\(E_p[f(X)]\\).

  Raises:
    ValueError: if `f` is not a Python `callable`.
    ValueError: if `use_reparametrization=False` and `log_prob` is not a Python
      `callable`.
  """

  with ops.name_scope(name, 'expectation', [samples]):
    if not callable(f):
      raise ValueError('`f` must be a callable function.')
    if use_reparametrization:
      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
    else:
      if not callable(log_prob):
        raise ValueError('`log_prob` must be a callable function.')
      stop = array_ops.stop_gradient  # For readability.
      x = stop(samples)
      logpx = log_prob(x)
      fx = f(x)  # Call `f` once in case it has side-effects.
      # We now rewrite f(x) so that:
      #   `grad[f(x)] := grad[f(x)] + f(x) * grad[logqx]`.
      # To achieve this, we use a trick that
      #   `h(x) - stop(h(x)) == zeros_like(h(x))`
      # but its gradient is grad[h(x)].
      # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
      # this trick loses no precision. For more discussion regarding the
      # relevant portions of the IEEE754 standard, see the StackOverflow
      # question,
      # "Is there a floating point value of x, for which x-x == 0 is false?"
      # http://stackoverflow.com/q/2686644
      fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)


def _sample_mean(values):
  """Mean over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_mean(values, reduction_indices=[0])


def _sample_max(values):
  """Max over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_max(values, reduction_indices=[0])


def _get_samples(dist, z, n, seed):
  """Check args and return samples."""
  with ops.name_scope('get_samples', values=[z, n]):
    if (n is None) == (z is None):
      raise ValueError(
          'Must specify exactly one of arguments "n" and "z".  Found: '
          'n = %s, z = %s' % (n, z))
    if n is not None:
      return dist.sample(n, seed=seed)
    else:
      return ops.convert_to_tensor(z, name='z')