blob: a3d178357b79b9d0d15c738603d5019321eff112 [file] [log] [blame]
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The VectorDiffeomixture distribution class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.contrib.distributions.python.ops import distribution_util
from tensorflow.contrib.distributions.python.ops.bijectors.affine_linear_operator import AffineLinearOperator
from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.framework import tensor_util
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops.distributions import categorical as categorical_lib
from tensorflow.python.ops.distributions import distribution as distribution_lib
from tensorflow.python.ops.distributions import normal as normal_lib
from tensorflow.python.ops.linalg import linear_operator_addition as linop_add_lib
from tensorflow.python.ops.linalg import linear_operator_diag as linop_diag_lib
from tensorflow.python.ops.linalg import linear_operator_full_matrix as linop_full_lib
from tensorflow.python.ops.linalg import linear_operator_identity as linop_identity_lib
from tensorflow.python.ops.linalg import linear_operator_lower_triangular as linop_tril_lib
from tensorflow.python.util import deprecation
__all__ = [
"VectorDiffeomixture",
"quadrature_scheme_softmaxnormal_gauss_hermite",
"quadrature_scheme_softmaxnormal_quantiles",
]
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def quadrature_scheme_softmaxnormal_gauss_hermite(
normal_loc, normal_scale, quadrature_size,
validate_args=False, name=None):
"""Use Gauss-Hermite quadrature to form quadrature on `K - 1` simplex.
A `SoftmaxNormal` random variable `Y` may be generated via
```
Y = SoftmaxCentered(X),
X = Normal(normal_loc, normal_scale)
```
Note: for a given `quadrature_size`, this method is generally less accurate
than `quadrature_scheme_softmaxnormal_quantiles`.
Args:
normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
The location parameter of the Normal used to construct the SoftmaxNormal.
normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
The scale parameter of the Normal used to construct the SoftmaxNormal.
quadrature_size: Python `int` scalar representing the number of quadrature
points.
validate_args: Python `bool`, default `False`. When `True` distribution
parameters are checked for validity despite possibly degrading runtime
performance. When `False` invalid inputs may silently render incorrect
outputs.
name: Python `str` name prefixed to Ops created by this class.
Returns:
grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
convex combination of affine parameters for `K` components.
`grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
probs: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
associated with each grid point.
"""
with ops.name_scope(name, "quadrature_scheme_softmaxnormal_gauss_hermite",
[normal_loc, normal_scale]):
normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
dt = normal_loc.dtype.base_dtype
normal_scale = ops.convert_to_tensor(
normal_scale, dtype=dt, name="normal_scale")
normal_scale = maybe_check_quadrature_param(
normal_scale, "normal_scale", validate_args)
grid, probs = np.polynomial.hermite.hermgauss(deg=quadrature_size)
grid = grid.astype(dt.dtype.as_numpy_dtype)
probs = probs.astype(dt.dtype.as_numpy_dtype)
probs /= np.linalg.norm(probs, ord=1, keepdims=True)
probs = ops.convert_to_tensor(probs, name="probs", dtype=dt)
grid = softmax(
-distribution_util.pad(
(normal_loc[..., array_ops.newaxis] +
np.sqrt(2.) * normal_scale[..., array_ops.newaxis] * grid),
axis=-2,
front=True),
axis=-2) # shape: [B, components, deg]
return grid, probs
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def quadrature_scheme_softmaxnormal_quantiles(
normal_loc, normal_scale, quadrature_size,
validate_args=False, name=None):
"""Use SoftmaxNormal quantiles to form quadrature on `K - 1` simplex.
A `SoftmaxNormal` random variable `Y` may be generated via
```
Y = SoftmaxCentered(X),
X = Normal(normal_loc, normal_scale)
```
Args:
normal_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`, B>=0.
The location parameter of the Normal used to construct the SoftmaxNormal.
normal_scale: `float`-like `Tensor`. Broadcastable with `normal_loc`.
The scale parameter of the Normal used to construct the SoftmaxNormal.
quadrature_size: Python `int` scalar representing the number of quadrature
points.
validate_args: Python `bool`, default `False`. When `True` distribution
parameters are checked for validity despite possibly degrading runtime
performance. When `False` invalid inputs may silently render incorrect
outputs.
name: Python `str` name prefixed to Ops created by this class.
Returns:
grid: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
convex combination of affine parameters for `K` components.
`grid[..., :, n]` is the `n`-th grid point, living in the `K - 1` simplex.
probs: Shape `[b1, ..., bB, K, quadrature_size]` `Tensor` representing the
associated with each grid point.
"""
with ops.name_scope(name, "softmax_normal_grid_and_probs",
[normal_loc, normal_scale]):
normal_loc = ops.convert_to_tensor(normal_loc, name="normal_loc")
dt = normal_loc.dtype.base_dtype
normal_scale = ops.convert_to_tensor(
normal_scale, dtype=dt, name="normal_scale")
normal_scale = maybe_check_quadrature_param(
normal_scale, "normal_scale", validate_args)
dist = normal_lib.Normal(loc=normal_loc, scale=normal_scale)
def _get_batch_ndims():
"""Helper to get dist.batch_shape.ndims, statically if possible."""
ndims = dist.batch_shape.ndims
if ndims is None:
ndims = array_ops.shape(dist.batch_shape_tensor())[0]
return ndims
batch_ndims = _get_batch_ndims()
def _get_final_shape(qs):
"""Helper to build `TensorShape`."""
bs = dist.batch_shape.with_rank_at_least(1)
num_components = bs[-1].value
if num_components is not None:
num_components += 1
tail = tensor_shape.TensorShape([num_components, qs])
return bs[:-1].concatenate(tail)
def _compute_quantiles():
"""Helper to build quantiles."""
# Omit {0, 1} since they might lead to Inf/NaN.
zero = array_ops.zeros([], dtype=dist.dtype)
edges = math_ops.linspace(zero, 1., quadrature_size + 3)[1:-1]
# Expand edges so its broadcast across batch dims.
edges = array_ops.reshape(edges, shape=array_ops.concat([
[-1], array_ops.ones([batch_ndims], dtype=dtypes.int32)], axis=0))
quantiles = dist.quantile(edges)
quantiles = SoftmaxCentered().forward(quantiles)
# Cyclically permute left by one.
perm = array_ops.concat([
math_ops.range(1, 1 + batch_ndims), [0]], axis=0)
quantiles = array_ops.transpose(quantiles, perm)
quantiles.set_shape(_get_final_shape(quadrature_size + 1))
return quantiles
quantiles = _compute_quantiles()
# Compute grid as quantile midpoints.
grid = (quantiles[..., :-1] + quantiles[..., 1:]) / 2.
# Set shape hints.
grid.set_shape(_get_final_shape(quadrature_size))
# By construction probs is constant, i.e., `1 / quadrature_size`. This is
# important, because non-constant probs leads to non-reparameterizable
# samples.
probs = array_ops.fill(
dims=[quadrature_size],
value=1. / math_ops.cast(quadrature_size, dist.dtype))
return grid, probs
class VectorDiffeomixture(distribution_lib.Distribution):
"""VectorDiffeomixture distribution.
A vector diffeomixture (VDM) is a distribution parameterized by a convex
combination of `K` component `loc` vectors, `loc[k], k = 0,...,K-1`, and `K`
`scale` matrices `scale[k], k = 0,..., K-1`. It approximates the following
[compound distribution]
(https://en.wikipedia.org/wiki/Compound_probability_distribution)
```none
p(x) = int p(x | z) p(z) dz,
where z is in the K-simplex, and
p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
```
The integral `int p(x | z) p(z) dz` is approximated with a quadrature scheme
adapted to the mixture density `p(z)`. The `N` quadrature points `z_{N, n}`
and weights `w_{N, n}` (which are non-negative and sum to 1) are chosen
such that
```q_N(x) := sum_{n=1}^N w_{n, N} p(x | z_{N, n}) --> p(x)```
as `N --> infinity`.
Since `q_N(x)` is in fact a mixture (of `N` points), we may sample from
`q_N` exactly. It is important to note that the VDM is *defined* as `q_N`
above, and *not* `p(x)`. Therefore, sampling and pdf may be implemented as
exact (up to floating point error) methods.
A common choice for the conditional `p(x | z)` is a multivariate Normal.
The implemented marginal `p(z)` is the `SoftmaxNormal`, which is a
`K-1` dimensional Normal transformed by a `SoftmaxCentered` bijector, making
it a density on the `K`-simplex. That is,
```
Z = SoftmaxCentered(X),
X = Normal(mix_loc / temperature, 1 / temperature)
```
The default quadrature scheme chooses `z_{N, n}` as `N` midpoints of
the quantiles of `p(z)` (generalized quantiles if `K > 2`).
See [Dillon and Langmore (2018)][1] for more details.
#### About `Vector` distributions in TensorFlow.
The `VectorDiffeomixture` is a non-standard distribution that has properties
particularly useful in [variational Bayesian
methods](https://en.wikipedia.org/wiki/Variational_Bayesian_methods).
Conditioned on a draw from the SoftmaxNormal, `X|z` is a vector whose
components are linear combinations of affine transformations, thus is itself
an affine transformation.
Note: The marginals `X_1|v, ..., X_d|v` are *not* generally identical to some
parameterization of `distribution`. This is due to the fact that the sum of
draws from `distribution` are not generally itself the same `distribution`.
#### About `Diffeomixture`s and reparameterization.
The `VectorDiffeomixture` is designed to be reparameterized, i.e., its
parameters are only used to transform samples from a distribution which has no
trainable parameters. This property is important because backprop stops at
sources of stochasticity. That is, as long as the parameters are used *after*
the underlying source of stochasticity, the computed gradient is accurate.
Reparametrization means that we can use gradient-descent (via backprop) to
optimize Monte-Carlo objectives. Such objectives are a finite-sample
approximation of an expectation and arise throughout scientific computing.
WARNING: If you backprop through a VectorDiffeomixture sample and the "base"
distribution is both: not `FULLY_REPARAMETERIZED` and a function of trainable
variables, then the gradient is not guaranteed correct!
#### Examples
```python
import tensorflow_probability as tfp
tfd = tfp.distributions
# Create two batches of VectorDiffeomixtures, one with mix_loc=[0.],
# another with mix_loc=[1]. In both cases, `K=2` and the affine
# transformations involve:
# k=0: loc=zeros(dims) scale=LinearOperatorScaledIdentity
# k=1: loc=[2.]*dims scale=LinOpDiag
dims = 5
vdm = tfd.VectorDiffeomixture(
mix_loc=[[0.], [1]],
temperature=[1.],
distribution=tfd.Normal(loc=0., scale=1.),
loc=[
None, # Equivalent to `np.zeros(dims, dtype=np.float32)`.
np.float32([2.]*dims),
],
scale=[
tf.linalg.LinearOperatorScaledIdentity(
num_rows=dims,
multiplier=np.float32(1.1),
is_positive_definite=True),
tf.linalg.LinearOperatorDiag(
diag=np.linspace(2.5, 3.5, dims, dtype=np.float32),
is_positive_definite=True),
],
validate_args=True)
```
#### References
[1]: Joshua Dillon and Ian Langmore. Quadrature Compound: An approximating
family of distributions. _arXiv preprint arXiv:1801.03080_, 2018.
https://arxiv.org/abs/1801.03080
"""
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def __init__(self,
mix_loc,
temperature,
distribution,
loc=None,
scale=None,
quadrature_size=8,
quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
validate_args=False,
allow_nan_stats=True,
name="VectorDiffeomixture"):
"""Constructs the VectorDiffeomixture on `R^d`.
The vector diffeomixture (VDM) approximates the compound distribution
```none
p(x) = int p(x | z) p(z) dz,
where z is in the K-simplex, and
p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
```
Args:
mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
In terms of samples, larger `mix_loc[..., k]` ==>
`Z` is more likely to put more weight on its `kth` component.
temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`.
In terms of samples, smaller `temperature` means one component is more
likely to dominate. I.e., smaller `temperature` makes the VDM look more
like a standard mixture of `K` components.
distribution: `tf.Distribution`-like instance. Distribution from which `d`
iid samples are used as input to the selected affine transformation.
Must be a scalar-batch, scalar-event distribution. Typically
`distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
a function of non-trainable parameters. WARNING: If you backprop through
a VectorDiffeomixture sample and the `distribution` is not
`FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
the gradient will be incorrect!
loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
represents the `shift` used for the `k`-th affine transformation. If
the `k`-th item is `None`, `loc` is implicitly `0`. When specified,
must have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event
size.
scale: Length-`K` list of `LinearOperator`s. Each should be
positive-definite and operate on a `d`-dimensional vector space. The
`k`-th element represents the `scale` used for the `k`-th affine
transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
`b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices
quadrature_size: Python `int` scalar representing number of
quadrature points. Larger `quadrature_size` means `q_N(x)` better
approximates `p(x)`.
quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
`quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
representing the SoftmaxNormal grid and corresponding normalized weight.
normalized) weight.
Default value: `quadrature_scheme_softmaxnormal_quantiles`.
validate_args: Python `bool`, default `False`. When `True` distribution
parameters are checked for validity despite possibly degrading runtime
performance. When `False` invalid inputs may silently render incorrect
outputs.
allow_nan_stats: Python `bool`, default `True`. When `True`,
statistics (e.g., mean, mode, variance) use the value "`NaN`" to
indicate the result is undefined. When `False`, an exception is raised
if one or more of the statistic's batch members are undefined.
name: Python `str` name prefixed to Ops created by this class.
Raises:
ValueError: if `not scale or len(scale) < 2`.
ValueError: if `len(loc) != len(scale)`
ValueError: if `quadrature_grid_and_probs is not None` and
`len(quadrature_grid_and_probs[0]) != len(quadrature_grid_and_probs[1])`
ValueError: if `validate_args` and any not scale.is_positive_definite.
TypeError: if any scale.dtype != scale[0].dtype.
TypeError: if any loc.dtype != scale[0].dtype.
NotImplementedError: if `len(scale) != 2`.
ValueError: if `not distribution.is_scalar_batch`.
ValueError: if `not distribution.is_scalar_event`.
"""
parameters = dict(locals())
with ops.name_scope(name, values=[mix_loc, temperature]) as name:
if not scale or len(scale) < 2:
raise ValueError("Must specify list (or list-like object) of scale "
"LinearOperators, one for each component with "
"num_component >= 2.")
if loc is None:
loc = [None]*len(scale)
if len(loc) != len(scale):
raise ValueError("loc/scale must be same-length lists "
"(or same-length list-like objects).")
dtype = scale[0].dtype.base_dtype
loc = [ops.convert_to_tensor(loc_, dtype=dtype, name="loc{}".format(k))
if loc_ is not None else None
for k, loc_ in enumerate(loc)]
for k, scale_ in enumerate(scale):
if validate_args and not scale_.is_positive_definite:
raise ValueError("scale[{}].is_positive_definite = {} != True".format(
k, scale_.is_positive_definite))
if scale_.dtype.base_dtype != dtype:
raise TypeError(
"dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\"".format(
k, scale_.dtype.base_dtype.name, dtype.name))
self._endpoint_affine = [
AffineLinearOperator(shift=loc_,
scale=scale_,
validate_args=validate_args,
name="endpoint_affine_{}".format(k))
for k, (loc_, scale_) in enumerate(zip(loc, scale))]
# TODO(jvdillon): Remove once we support k-mixtures.
# We make this assertion here because otherwise `grid` would need to be a
# vector not a scalar.
if len(scale) != 2:
raise NotImplementedError("Currently only bimixtures are supported; "
"len(scale)={} is not 2.".format(len(scale)))
mix_loc = ops.convert_to_tensor(
mix_loc, dtype=dtype, name="mix_loc")
temperature = ops.convert_to_tensor(
temperature, dtype=dtype, name="temperature")
self._grid, probs = tuple(quadrature_fn(
mix_loc / temperature,
1. / temperature,
quadrature_size,
validate_args))
# Note: by creating the logits as `log(prob)` we ensure that
# `self.mixture_distribution.logits` is equivalent to
# `math_ops.log(self.mixture_distribution.probs)`.
self._mixture_distribution = categorical_lib.Categorical(
logits=math_ops.log(probs),
validate_args=validate_args,
allow_nan_stats=allow_nan_stats)
asserts = distribution_util.maybe_check_scalar_distribution(
distribution, dtype, validate_args)
if asserts:
self._grid = control_flow_ops.with_dependencies(
asserts, self._grid)
self._distribution = distribution
self._interpolated_affine = [
AffineLinearOperator(shift=loc_,
scale=scale_,
validate_args=validate_args,
name="interpolated_affine_{}".format(k))
for k, (loc_, scale_) in enumerate(zip(
interpolate_loc(self._grid, loc),
interpolate_scale(self._grid, scale)))]
[
self._batch_shape_,
self._batch_shape_tensor_,
self._event_shape_,
self._event_shape_tensor_,
] = determine_batch_event_shapes(self._grid,
self._endpoint_affine)
super(VectorDiffeomixture, self).__init__(
dtype=dtype,
# We hard-code `FULLY_REPARAMETERIZED` because when
# `validate_args=True` we verify that indeed
# `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A
# distribution which is a function of only non-trainable parameters
# also implies we can use `FULLY_REPARAMETERIZED`. However, we cannot
# easily test for that possibility thus we use `validate_args=False`
# as a "back-door" to allow users a way to use non
# `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS
# RESPONSIBILITY to verify that the base distribution is a function of
# non-trainable parameters.
reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED,
validate_args=validate_args,
allow_nan_stats=allow_nan_stats,
parameters=parameters,
graph_parents=(
distribution._graph_parents # pylint: disable=protected-access
+ [loc_ for loc_ in loc if loc_ is not None]
+ [p for scale_ in scale for p in scale_.graph_parents]),
name=name)
@property
def mixture_distribution(self):
"""Distribution used to select a convex combination of affine transforms."""
return self._mixture_distribution
@property
def distribution(self):
"""Base scalar-event, scalar-batch distribution."""
return self._distribution
@property
def grid(self):
"""Grid of mixing probabilities, one for each grid point."""
return self._grid
@property
def endpoint_affine(self):
"""Affine transformation for each of `K` components."""
return self._endpoint_affine
@property
def interpolated_affine(self):
"""Affine transformation for each convex combination of `K` components."""
return self._interpolated_affine
def _batch_shape_tensor(self):
return self._batch_shape_tensor_
def _batch_shape(self):
return self._batch_shape_
def _event_shape_tensor(self):
return self._event_shape_tensor_
def _event_shape(self):
return self._event_shape_
def _sample_n(self, n, seed=None):
x = self.distribution.sample(
sample_shape=concat_vectors(
[n],
self.batch_shape_tensor(),
self.event_shape_tensor()),
seed=seed) # shape: [n, B, e]
x = [aff.forward(x) for aff in self.endpoint_affine]
# Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get
# ids as a [n]-shaped vector.
batch_size = self.batch_shape.num_elements()
if batch_size is None:
batch_size = array_ops.reduce_prod(self.batch_shape_tensor())
mix_batch_size = self.mixture_distribution.batch_shape.num_elements()
if mix_batch_size is None:
mix_batch_size = math_ops.reduce_prod(
self.mixture_distribution.batch_shape_tensor())
ids = self.mixture_distribution.sample(
sample_shape=concat_vectors(
[n],
distribution_util.pick_vector(
self.is_scalar_batch(),
np.int32([]),
[batch_size // mix_batch_size])),
seed=distribution_util.gen_new_seed(
seed, "vector_diffeomixture"))
# We need to flatten batch dims in case mixture_distribution has its own
# batch dims.
ids = array_ops.reshape(ids, shape=concat_vectors(
[n],
distribution_util.pick_vector(
self.is_scalar_batch(),
np.int32([]),
np.int32([-1]))))
# Stride `components * quadrature_size` for `batch_size` number of times.
stride = self.grid.shape.with_rank_at_least(
2)[-2:].num_elements()
if stride is None:
stride = array_ops.reduce_prod(
array_ops.shape(self.grid)[-2:])
offset = math_ops.range(start=0,
limit=batch_size * stride,
delta=stride,
dtype=ids.dtype)
weight = array_ops.gather(
array_ops.reshape(self.grid, shape=[-1]),
ids + offset)
# At this point, weight flattened all batch dims into one.
# We also need to append a singleton to broadcast with event dims.
if self.batch_shape.is_fully_defined():
new_shape = [-1] + self.batch_shape.as_list() + [1]
else:
new_shape = array_ops.concat(
([-1], self.batch_shape_tensor(), [1]), axis=0)
weight = array_ops.reshape(weight, shape=new_shape)
if len(x) != 2:
# We actually should have already triggered this exception. However as a
# policy we're putting this exception wherever we exploit the bimixture
# assumption.
raise NotImplementedError("Currently only bimixtures are supported; "
"len(scale)={} is not 2.".format(len(x)))
# Alternatively:
# x = weight * x[0] + (1. - weight) * x[1]
x = weight * (x[0] - x[1]) + x[1]
return x
def _log_prob(self, x):
# By convention, we always put the grid points right-most.
y = array_ops.stack(
[aff.inverse(x) for aff in self.interpolated_affine],
axis=-1)
log_prob = math_ops.reduce_sum(self.distribution.log_prob(y), axis=-2)
# Because the affine transformation has a constant Jacobian, it is the case
# that `affine.fldj(x) = -affine.ildj(x)`. This is not true in general.
fldj = array_ops.stack([
aff.forward_log_det_jacobian(
x,
event_ndims=array_ops.rank(self.event_shape_tensor())
) for aff in self.interpolated_affine], axis=-1)
return math_ops.reduce_logsumexp(
self.mixture_distribution.logits - fldj + log_prob, axis=-1)
def _mean(self):
p = self._expand_mix_distribution_probs()
m = self._expand_base_distribution_mean()
mean = None
for k, aff in enumerate(self.interpolated_affine):
# aff.forward is going to do this:
# y = array_ops.squeeze(aff.scale.matmul(m), axis=[-1])
# if aff.shift is not None:
# y += aff.shift
mean = add(mean, p[..., k] * aff.forward(m))
return mean
def _covariance(self):
# Law of total variance:
#
# Cov[Z] = E[Cov[Z | V]] + Cov[E[Z | V]]
#
# where,
#
# E[Cov[Z | V]] = sum_i mix_prob[i] Scale[i]
# Cov[E[Z | V]] = sum_i mix_prob[i] osquare(loc[i])
# - osquare(sum_i mix_prob[i] loc[i])
#
# osquare(x) = x.transpose @ x
return add(
self._mean_of_covariance_given_quadrature_component(diag_only=False),
self._covariance_of_mean_given_quadrature_component(diag_only=False))
def _variance(self):
# Equivalent to: tf.diag_part(self._covariance()),
return add(
self._mean_of_covariance_given_quadrature_component(diag_only=True),
self._covariance_of_mean_given_quadrature_component(diag_only=True))
def _mean_of_covariance_given_quadrature_component(self, diag_only):
p = self.mixture_distribution.probs
# To compute E[Cov(Z|V)], we'll add matrices within three categories:
# scaled-identity, diagonal, and full. Then we'll combine these at the end.
scale_identity_multiplier = None
diag = None
full = None
for k, aff in enumerate(self.interpolated_affine):
s = aff.scale # Just in case aff.scale has side-effects, we'll call once.
if (s is None
or isinstance(s, linop_identity_lib.LinearOperatorIdentity)):
scale_identity_multiplier = add(scale_identity_multiplier,
p[..., k, array_ops.newaxis])
elif isinstance(s, linop_identity_lib.LinearOperatorScaledIdentity):
scale_identity_multiplier = add(
scale_identity_multiplier,
(p[..., k, array_ops.newaxis] * math_ops.square(s.multiplier)))
elif isinstance(s, linop_diag_lib.LinearOperatorDiag):
diag = add(diag, (p[..., k, array_ops.newaxis] *
math_ops.square(s.diag_part())))
else:
x = (p[..., k, array_ops.newaxis, array_ops.newaxis] *
s.matmul(s.to_dense(), adjoint_arg=True))
if diag_only:
x = array_ops.matrix_diag_part(x)
full = add(full, x)
# We must now account for the fact that the base distribution might have a
# non-unity variance. Recall that, since X ~ iid Law(X_0),
# `Cov(SX+m) = S Cov(X) S.T = S S.T Diag(Var(X_0))`.
# We can scale by `Var(X)` (vs `Cov(X)`) since X corresponds to `d` iid
# samples from a scalar-event distribution.
v = self.distribution.variance()
if scale_identity_multiplier is not None:
scale_identity_multiplier *= v
if diag is not None:
diag *= v[..., array_ops.newaxis]
if full is not None:
full *= v[..., array_ops.newaxis]
if diag_only:
# Apparently we don't need the full matrix, just the diagonal.
r = add(diag, full)
if r is None and scale_identity_multiplier is not None:
ones = array_ops.ones(self.event_shape_tensor(), dtype=self.dtype)
return scale_identity_multiplier[..., array_ops.newaxis] * ones
return add(r, scale_identity_multiplier)
# `None` indicates we don't know if the result is positive-definite.
is_positive_definite = (True if all(aff.scale.is_positive_definite
for aff in self.endpoint_affine)
else None)
to_add = []
if diag is not None:
to_add.append(linop_diag_lib.LinearOperatorDiag(
diag=diag,
is_positive_definite=is_positive_definite))
if full is not None:
to_add.append(linop_full_lib.LinearOperatorFullMatrix(
matrix=full,
is_positive_definite=is_positive_definite))
if scale_identity_multiplier is not None:
to_add.append(linop_identity_lib.LinearOperatorScaledIdentity(
num_rows=self.event_shape_tensor()[0],
multiplier=scale_identity_multiplier,
is_positive_definite=is_positive_definite))
return (linop_add_lib.add_operators(to_add)[0].to_dense()
if to_add else None)
def _covariance_of_mean_given_quadrature_component(self, diag_only):
square = math_ops.square if diag_only else vec_osquare
p = self._expand_mix_distribution_probs()
if not diag_only:
p = p[..., array_ops.newaxis, :] # Assuming event.ndims=1.
m = self._expand_base_distribution_mean()
cov_e_z_given_v = None
e_z_given_v = self._mean()
for k, aff in enumerate(self.interpolated_affine):
y = aff.forward(m)
cov_e_z_given_v = add(cov_e_z_given_v,
p[..., k] * square(y - e_z_given_v))
return cov_e_z_given_v
def _expand_base_distribution_mean(self):
"""Ensures `self.distribution.mean()` has `[batch, event]` shape."""
single_draw_shape = concat_vectors(self.batch_shape_tensor(),
self.event_shape_tensor())
m = array_ops.reshape(
self.distribution.mean(), # A scalar.
shape=array_ops.ones_like(single_draw_shape,
dtype=dtypes.int32))
m = array_ops.tile(m, multiples=single_draw_shape)
m.set_shape(self.batch_shape.concatenate(self.event_shape))
return m
def _expand_mix_distribution_probs(self):
p = self.mixture_distribution.probs # [B, deg]
deg = p.shape.with_rank_at_least(1)[-1].value
if deg is None:
deg = array_ops.shape(p)[-1]
event_ndims = self.event_shape.ndims
if event_ndims is None:
event_ndims = array_ops.shape(self.event_shape_tensor())[0]
expand_shape = array_ops.concat([
self.mixture_distribution.batch_shape_tensor(),
array_ops.ones([event_ndims], dtype=dtypes.int32),
[deg],
], axis=0)
return array_ops.reshape(p, shape=expand_shape)
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def maybe_check_quadrature_param(param, name, validate_args):
"""Helper which checks validity of `loc` and `scale` init args."""
with ops.name_scope(name="check_" + name, values=[param]):
assertions = []
if param.shape.ndims is not None:
if param.shape.ndims == 0:
raise ValueError("Mixing params must be a (batch of) vector; "
"{}.rank={} is not at least one.".format(
name, param.shape.ndims))
elif validate_args:
assertions.append(check_ops.assert_rank_at_least(
param, 1,
message=("Mixing params must be a (batch of) vector; "
"{}.rank is not at least one.".format(
name))))
# TODO(jvdillon): Remove once we support k-mixtures.
if param.shape.with_rank_at_least(1)[-1] is not None:
if param.shape[-1].value != 1:
raise NotImplementedError("Currently only bimixtures are supported; "
"{}.shape[-1]={} is not 1.".format(
name, param.shape[-1].value))
elif validate_args:
assertions.append(check_ops.assert_equal(
array_ops.shape(param)[-1], 1,
message=("Currently only bimixtures are supported; "
"{}.shape[-1] is not 1.".format(name))))
if assertions:
return control_flow_ops.with_dependencies(assertions, param)
return param
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def determine_batch_event_shapes(grid, endpoint_affine):
"""Helper to infer batch_shape and event_shape."""
with ops.name_scope(name="determine_batch_event_shapes"):
# grid # shape: [B, k, q]
# endpoint_affine # len=k, shape: [B, d, d]
batch_shape = grid.shape[:-2]
batch_shape_tensor = array_ops.shape(grid)[:-2]
event_shape = None
event_shape_tensor = None
def _set_event_shape(shape, shape_tensor):
if event_shape is None:
return shape, shape_tensor
return (array_ops.broadcast_static_shape(event_shape, shape),
array_ops.broadcast_dynamic_shape(
event_shape_tensor, shape_tensor))
for aff in endpoint_affine:
if aff.shift is not None:
batch_shape = array_ops.broadcast_static_shape(
batch_shape, aff.shift.shape[:-1])
batch_shape_tensor = array_ops.broadcast_dynamic_shape(
batch_shape_tensor, array_ops.shape(aff.shift)[:-1])
event_shape, event_shape_tensor = _set_event_shape(
aff.shift.shape[-1:], array_ops.shape(aff.shift)[-1:])
if aff.scale is not None:
batch_shape = array_ops.broadcast_static_shape(
batch_shape, aff.scale.batch_shape)
batch_shape_tensor = array_ops.broadcast_dynamic_shape(
batch_shape_tensor, aff.scale.batch_shape_tensor())
event_shape, event_shape_tensor = _set_event_shape(
tensor_shape.TensorShape([aff.scale.range_dimension]),
aff.scale.range_dimension_tensor()[array_ops.newaxis])
return batch_shape, batch_shape_tensor, event_shape, event_shape_tensor
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def interpolate_loc(grid, loc):
"""Helper which interpolates between two locs."""
if len(loc) != 2:
raise NotImplementedError("Currently only bimixtures are supported; "
"len(scale)={} is not 2.".format(len(loc)))
deg = grid.shape.with_rank_at_least(1)[-1].value
if deg is None:
raise ValueError("Num quadrature grid points must be known prior "
"to graph execution.")
with ops.name_scope("interpolate_loc", values=[grid, loc]):
if loc is None or loc[0] is None and loc[1] is None:
return [None]*deg
# shape: [B, 1, k, deg]
w = grid[..., array_ops.newaxis, :, :]
loc = [x[..., array_ops.newaxis] # shape: [B, e, 1]
if x is not None else None for x in loc]
if loc[0] is None:
x = w[..., 1, :] * loc[1] # shape: [B, e, deg]
elif loc[1] is None:
x = w[..., 0, :] * loc[0] # shape: [B, e, deg]
else:
delta = loc[0] - loc[1]
x = w[..., 0, :] * delta + loc[1] # shape: [B, e, deg]
return [x[..., k] for k in range(deg)] # list(shape:[B, e])
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def interpolate_scale(grid, scale):
"""Helper which interpolates between two scales."""
if len(scale) != 2:
raise NotImplementedError("Currently only bimixtures are supported; "
"len(scale)={} is not 2.".format(len(scale)))
deg = grid.shape.with_rank_at_least(1)[-1].value
if deg is None:
raise ValueError("Num quadrature grid points must be known prior "
"to graph execution.")
with ops.name_scope("interpolate_scale", values=[grid]):
return [linop_add_lib.add_operators([
linop_scale(grid[..., k, q], s)
for k, s in enumerate(scale)
])[0] for q in range(deg)]
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def linop_scale(w, op):
# We assume w > 0. (This assumption only relates to the is_* attributes.)
with ops.name_scope("linop_scale", values=[w]):
# TODO(b/35301104): LinearOperatorComposition doesn't combine operators, so
# special case combinations here. Once it does, this function can be
# replaced by:
# return linop_composition_lib.LinearOperatorComposition([
# scaled_identity(w), op])
def scaled_identity(w):
return linop_identity_lib.LinearOperatorScaledIdentity(
num_rows=op.range_dimension_tensor(),
multiplier=w,
is_non_singular=op.is_non_singular,
is_self_adjoint=op.is_self_adjoint,
is_positive_definite=op.is_positive_definite)
if isinstance(op, linop_identity_lib.LinearOperatorIdentity):
return scaled_identity(w)
if isinstance(op, linop_identity_lib.LinearOperatorScaledIdentity):
return scaled_identity(w * op.multiplier)
if isinstance(op, linop_diag_lib.LinearOperatorDiag):
return linop_diag_lib.LinearOperatorDiag(
diag=w[..., array_ops.newaxis] * op.diag_part(),
is_non_singular=op.is_non_singular,
is_self_adjoint=op.is_self_adjoint,
is_positive_definite=op.is_positive_definite)
if isinstance(op, linop_tril_lib.LinearOperatorLowerTriangular):
return linop_tril_lib.LinearOperatorLowerTriangular(
tril=w[..., array_ops.newaxis, array_ops.newaxis] * op.to_dense(),
is_non_singular=op.is_non_singular,
is_self_adjoint=op.is_self_adjoint,
is_positive_definite=op.is_positive_definite)
raise NotImplementedError(
"Unsupported Linop type ({})".format(type(op).__name__))
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def concat_vectors(*args):
"""Concatenates input vectors, statically if possible."""
args_ = [distribution_util.static_value(x) for x in args]
if any(vec is None for vec in args_):
return array_ops.concat(args, axis=0)
return [val for vec in args_ for val in vec]
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def add(x, y):
"""Adds inputs; interprets `None` as zero."""
if x is None:
return y
if y is None:
return x
return x + y
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def vec_osquare(x):
"""Computes the outer-product of a (batch of) vector, i.e., x.T x."""
return x[..., :, array_ops.newaxis] * x[..., array_ops.newaxis, :]
@deprecation.deprecated(
"2018-10-01",
"The TensorFlow Distributions library has moved to "
"TensorFlow Probability "
"(https://github.com/tensorflow/probability). You "
"should update all references to use `tfp.distributions` "
"instead of `tf.contrib.distributions`.",
warn_once=True)
def softmax(x, axis, name=None):
"""Equivalent to tf.nn.softmax but works around b/70297725."""
with ops.name_scope(name, "softmax", [x, axis]):
x = ops.convert_to_tensor(x, name="x")
ndims = (x.shape.ndims if x.shape.ndims is not None
else array_ops.rank(x, name="ndims"))
axis = ops.convert_to_tensor(axis, dtype=dtypes.int32, name="axis")
axis_ = tensor_util.constant_value(axis)
if axis_ is not None:
axis = np.int(ndims + axis_ if axis_ < 0 else axis_)
else:
axis = array_ops.where(axis < 0, ndims + axis, axis)
return nn_ops.softmax(x, axis=axis)