tensorflow/contrib/distributions/python/ops/wishart.py - platform/external/tensorflow - Git at Google

 # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
 """The Wishart distribution class."""

 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

 import math
 import numpy as np

 from tensorflow.contrib.distributions.python.ops import distribution_util
 from tensorflow.contrib.framework.python.framework import tensor_util as contrib_tensor_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.util import deprecation

 __all__ = [
     "WishartCholesky",
     "WishartFull",
 ]


 class _WishartLinearOperator(distribution.Distribution):
   """The matrix Wishart distribution on positive definite matrices.

   This distribution is defined by a scalar number of degrees of freedom `df` and
   an instance of `LinearOperator`, which provides matrix-free access to a
   symmetric positive definite operator, which defines the scale matrix.

   #### Mathematical Details

   The probability density function (pdf) is,

   ```none
   pdf(X; df, scale) = det(X)**(0.5 (df-k-1)) exp(-0.5 tr[inv(scale) X]) / Z
   Z = 2**(0.5 df k) |det(scale)|**(0.5 df) Gamma_k(0.5 df)
   ```

   where:

   * `df >= k` denotes the degrees of freedom,
   * `scale` is a symmetric, positive definite, `k x k` matrix,
   * `Z` is the normalizing constant, and,
   * `Gamma_k` is the [multivariate Gamma function](
     https://en.wikipedia.org/wiki/Multivariate_gamma_function).

   #### Examples

   See `WishartFull`, `WishartCholesky` for examples of initializing and using
   this class.
   """

   @deprecation.deprecated(
       "2018-10-01",
       "The TensorFlow Distributions library has moved to "
       "TensorFlow Probability "
       "(https://github.com/tensorflow/probability). You "
       "should update all references to use `tfp.distributions` "
       "instead of `tf.contrib.distributions`.",
       warn_once=True)
   def __init__(self,
                df,
                scale_operator,
                cholesky_input_output_matrices=False,
                validate_args=False,
                allow_nan_stats=True,
                name=None):
     """Construct Wishart distributions.

     Args:
       df: `float` or `double` tensor, the degrees of freedom of the
         distribution(s). `df` must be greater than or equal to `k`.
       scale_operator: `float` or `double` instance of `LinearOperator`.
       cholesky_input_output_matrices: Python `bool`. Any function which whose
         input or output is a matrix assumes the input is Cholesky and returns a
         Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
         `sample_n` returns a Cholesky when
         `cholesky_input_output_matrices=True`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
         outputs.
       allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
         (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
         result is undefined. When `False`, an exception is raised if one or
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.

     Raises:
       TypeError: if scale is not floating-type
       TypeError: if scale.dtype != df.dtype
       ValueError: if df < k, where scale operator event shape is
         `(k, k)`
     """
     parameters = dict(locals())
     self._cholesky_input_output_matrices = cholesky_input_output_matrices
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[df, scale_operator]):
         if not scale_operator.dtype.is_floating:
           raise TypeError(
               "scale_operator.dtype=%s is not a floating-point type" %
               scale_operator.dtype)
         if not scale_operator.is_square:
           print(scale_operator.to_dense().eval())
           raise ValueError("scale_operator must be square.")

         self._scale_operator = scale_operator
         self._df = ops.convert_to_tensor(
             df,
             dtype=scale_operator.dtype,
             name="df")
         contrib_tensor_util.assert_same_float_dtype(
             (self._df, self._scale_operator))
         if (self._scale_operator.shape.ndims is None or
             self._scale_operator.shape.dims[-1].value is None):
           self._dimension = math_ops.cast(
               self._scale_operator.domain_dimension_tensor(),
               dtype=self._scale_operator.dtype, name="dimension")
         else:
           self._dimension = ops.convert_to_tensor(
               self._scale_operator.shape.dims[-1].value,
               dtype=self._scale_operator.dtype, name="dimension")
         df_val = tensor_util.constant_value(self._df)
         dim_val = tensor_util.constant_value(self._dimension)
         if df_val is not None and dim_val is not None:
           df_val = np.asarray(df_val)
           if not df_val.shape:
             df_val = [df_val]
           if any(df_val < dim_val):
             raise ValueError(
                 "Degrees of freedom (df = %s) cannot be less than "
                 "dimension of scale matrix (scale.dimension = %s)"
                 % (df_val, dim_val))
         elif validate_args:
           assertions = check_ops.assert_less_equal(
               self._dimension, self._df,
               message=("Degrees of freedom (df = %s) cannot be "
                        "less than dimension of scale matrix "
                        "(scale.dimension = %s)" %
                        (self._dimension, self._df)))
           self._df = control_flow_ops.with_dependencies(
               [assertions], self._df)
     super(_WishartLinearOperator, self).__init__(
         dtype=self._scale_operator.dtype,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
         reparameterization_type=distribution.FULLY_REPARAMETERIZED,
         parameters=parameters,
         graph_parents=([self._df, self._dimension] +
                        self._scale_operator.graph_parents),
         name=name)

   @property
   def df(self):
     """Wishart distribution degree(s) of freedom."""
     return self._df

   def _square_scale_operator(self):
     return self.scale_operator.matmul(
         self.scale_operator.to_dense(), adjoint_arg=True)

   def scale(self):
     """Wishart distribution scale matrix."""
     if self._cholesky_input_output_matrices:
       return self.scale_operator.to_dense()
     else:
       return self._square_scale_operator()

   @property
   def scale_operator(self):
     """Wishart distribution scale matrix as an Linear Operator."""
     return self._scale_operator

   @property
   def cholesky_input_output_matrices(self):
     """Boolean indicating if `Tensor` input/outputs are Cholesky factorized."""
     return self._cholesky_input_output_matrices

   @property
   def dimension(self):
     """Dimension of underlying vector space. The `p` in `R^(p*p)`."""
     return self._dimension

   def _event_shape_tensor(self):
     dimension = self.scale_operator.domain_dimension_tensor()
     return array_ops.stack([dimension, dimension])

   def _event_shape(self):
     dimension = self.scale_operator.domain_dimension
     return tensor_shape.TensorShape([dimension, dimension])

   def _batch_shape_tensor(self):
     return self.scale_operator.batch_shape_tensor()

   def _batch_shape(self):
     return self.scale_operator.batch_shape

   def _sample_n(self, n, seed):
     batch_shape = self.batch_shape_tensor()
     event_shape = self.event_shape_tensor()
     batch_ndims = array_ops.shape(batch_shape)[0]

     ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
     shape = array_ops.concat([[n], batch_shape, event_shape], 0)

     # Complexity: O(nbk**2)
     x = random_ops.random_normal(shape=shape,
                                  mean=0.,
                                  stddev=1.,
                                  dtype=self.dtype,
                                  seed=seed)

     # Complexity: O(nbk)
     # This parametrization is equivalent to Chi2, i.e.,
     # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
     expanded_df = self.df * array_ops.ones(
         self.scale_operator.batch_shape_tensor(),
         dtype=self.df.dtype.base_dtype)
     g = random_ops.random_gamma(shape=[n],
                                 alpha=self._multi_gamma_sequence(
                                     0.5 * expanded_df, self.dimension),
                                 beta=0.5,
                                 dtype=self.dtype,
                                 seed=distribution_util.gen_new_seed(
                                     seed, "wishart"))

     # Complexity: O(nbk**2)
     x = array_ops.matrix_band_part(x, -1, 0)  # Tri-lower.

     # Complexity: O(nbk)
     x = array_ops.matrix_set_diag(x, math_ops.sqrt(g))

     # Make batch-op ready.
     # Complexity: O(nbk**2)
     perm = array_ops.concat([math_ops.range(1, ndims), [0]], 0)
     x = array_ops.transpose(x, perm)
     shape = array_ops.concat([batch_shape, [event_shape[0]], [-1]], 0)
     x = array_ops.reshape(x, shape)

     # Complexity: O(nbM) where M is the complexity of the operator solving a
     # vector system. E.g., for LinearOperatorDiag, each matmul is O(k**2), so
     # this complexity is O(nbk**2). For LinearOperatorLowerTriangular,
     # each matmul is O(k^3) so this step has complexity O(nbk^3).
     x = self.scale_operator.matmul(x)

     # Undo make batch-op ready.
     # Complexity: O(nbk**2)
     shape = array_ops.concat([batch_shape, event_shape, [n]], 0)
     x = array_ops.reshape(x, shape)
     perm = array_ops.concat([[ndims - 1], math_ops.range(0, ndims - 1)], 0)
     x = array_ops.transpose(x, perm)

     if not self.cholesky_input_output_matrices:
       # Complexity: O(nbk^3)
       x = math_ops.matmul(x, x, adjoint_b=True)

     return x

   def _log_prob(self, x):
     if self.cholesky_input_output_matrices:
       x_sqrt = x
     else:
       # Complexity: O(nbk^3)
       x_sqrt = linalg_ops.cholesky(x)

     batch_shape = self.batch_shape_tensor()
     event_shape = self.event_shape_tensor()
     ndims = array_ops.rank(x_sqrt)
     # sample_ndims = ndims - batch_ndims - event_ndims
     sample_ndims = ndims - array_ops.shape(batch_shape)[0] - 2
     sample_shape = array_ops.strided_slice(
         array_ops.shape(x_sqrt), [0], [sample_ndims])

     # We need to be able to pre-multiply each matrix by its corresponding
     # batch scale matrix. Since a Distribution Tensor supports multiple
     # samples per batch, this means we need to reshape the input matrix `x`
     # so that the first b dimensions are batch dimensions and the last two
     # are of shape [dimension, dimensions*number_of_samples]. Doing these
     # gymnastics allows us to do a batch_solve.
     #
     # After we're done with sqrt_solve (the batch operation) we need to undo
     # this reshaping so what we're left with is a Tensor partitionable by
     # sample, batch, event dimensions.

     # Complexity: O(nbk**2) since transpose must access every element.
     scale_sqrt_inv_x_sqrt = x_sqrt
     perm = array_ops.concat([math_ops.range(sample_ndims, ndims),
                              math_ops.range(0, sample_ndims)], 0)
     scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt, perm)
     shape = array_ops.concat(
         (batch_shape, (math_ops.cast(
             self.dimension, dtype=dtypes.int32), -1)),
         0)
     scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape)

     # Complexity: O(nbM*k) where M is the complexity of the operator solving
     # a vector system. E.g., for LinearOperatorDiag, each solve is O(k), so
     # this complexity is O(nbk**2). For LinearOperatorLowerTriangular,
     # each solve is O(k**2) so this step has complexity O(nbk^3).
     scale_sqrt_inv_x_sqrt = self.scale_operator.solve(
         scale_sqrt_inv_x_sqrt)

     # Undo make batch-op ready.
     # Complexity: O(nbk**2)
     shape = array_ops.concat([batch_shape, event_shape, sample_shape], 0)
     scale_sqrt_inv_x_sqrt = array_ops.reshape(scale_sqrt_inv_x_sqrt, shape)
     perm = array_ops.concat([math_ops.range(ndims - sample_ndims, ndims),
                              math_ops.range(0, ndims - sample_ndims)], 0)
     scale_sqrt_inv_x_sqrt = array_ops.transpose(scale_sqrt_inv_x_sqrt, perm)

     # Write V = SS', X = LL'. Then:
     # tr[inv(V) X] = tr[inv(S)' inv(S) L L']
     #              = tr[inv(S) L L' inv(S)']
     #              = tr[(inv(S) L) (inv(S) L)']
     #              = sum_{ik} (inv(S) L)_{ik}**2
     # The second equality follows from the cyclic permutation property.
     # Complexity: O(nbk**2)
     trace_scale_inv_x = math_ops.reduce_sum(
         math_ops.square(scale_sqrt_inv_x_sqrt),
         axis=[-2, -1])

     # Complexity: O(nbk)
     half_log_det_x = math_ops.reduce_sum(
         math_ops.log(array_ops.matrix_diag_part(x_sqrt)),
         axis=[-1])

     # Complexity: O(nbk**2)
     log_prob = ((self.df - self.dimension - 1.) * half_log_det_x -
                 0.5 * trace_scale_inv_x -
                 self.log_normalization())

     # Set shape hints.
     # Try to merge what we know from the input then what we know from the
     # parameters of this distribution.
     if x.get_shape().ndims is not None:
       log_prob.set_shape(x.get_shape()[:-2])
     if (log_prob.get_shape().ndims is not None and
         self.batch_shape.ndims is not None and
         self.batch_shape.ndims > 0):
       log_prob.get_shape()[-self.batch_shape.ndims:].merge_with(
           self.batch_shape)

     return log_prob

   def _prob(self, x):
     return math_ops.exp(self._log_prob(x))

   def _entropy(self):
     half_dp1 = 0.5 * self.dimension + 0.5
     half_df = 0.5 * self.df
     return (self.dimension * (half_df + half_dp1 * math.log(2.)) +
             2 * half_dp1 * self.scale_operator.log_abs_determinant() +
             self._multi_lgamma(half_df, self.dimension) +
             (half_dp1 - half_df) * self._multi_digamma(half_df, self.dimension))

   def _mean(self):
     if self.cholesky_input_output_matrices:
       return (math_ops.sqrt(self.df)
               * self.scale_operator.to_dense())
     return self.df * self._square_scale_operator()

   def _variance(self):
     x = math_ops.sqrt(self.df) * self._square_scale_operator()
     d = array_ops.expand_dims(array_ops.matrix_diag_part(x), -1)
     v = math_ops.square(x) + math_ops.matmul(d, d, adjoint_b=True)
     if self.cholesky_input_output_matrices:
       return linalg_ops.cholesky(v)
     return v

   def _stddev(self):
     if self.cholesky_input_output_matrices:
       raise ValueError(
           "Computing std. dev. when is cholesky_input_output_matrices=True "
           "does not make sense.")
     return linalg_ops.cholesky(self.variance())

   def _mode(self):
     s = self.df - self.dimension - 1.
     s = array_ops.where_v2(
         math_ops.less(s, 0.),
         constant_op.constant(float("NaN"), dtype=self.dtype, name="nan"), s)
     if self.cholesky_input_output_matrices:
       return math_ops.sqrt(s) * self.scale_operator.to_dense()
     return s * self._square_scale_operator()

   def mean_log_det(self, name="mean_log_det"):
     """Computes E[log(det(X))] under this Wishart distribution."""
     with self._name_scope(name):
       return (self._multi_digamma(0.5 * self.df, self.dimension) +
               self.dimension * math.log(2.) +
               2 * self.scale_operator.log_abs_determinant())

   def log_normalization(self, name="log_normalization"):
     """Computes the log normalizing constant, log(Z)."""
     with self._name_scope(name):
       return (self.df * self.scale_operator.log_abs_determinant() +
               0.5 * self.df * self.dimension * math.log(2.) +
               self._multi_lgamma(0.5 * self.df, self.dimension))

   def _multi_gamma_sequence(self, a, p, name="multi_gamma_sequence"):
     """Creates sequence used in multivariate (di)gamma; shape = shape(a)+[p]."""
     with self._name_scope(name, values=[a, p]):
       # Linspace only takes scalars, so we'll add in the offset afterwards.
       seq = math_ops.linspace(
           constant_op.constant(0., dtype=self.dtype),
           0.5 - 0.5 * p,
           math_ops.cast(p, dtypes.int32))
       return seq + array_ops.expand_dims(a, [-1])

   def _multi_lgamma(self, a, p, name="multi_lgamma"):
     """Computes the log multivariate gamma function; log(Gamma_p(a))."""
     with self._name_scope(name, values=[a, p]):
       seq = self._multi_gamma_sequence(a, p)
       return (0.25 * p * (p - 1.) * math.log(math.pi) +
               math_ops.reduce_sum(math_ops.lgamma(seq),
                                   axis=[-1]))

   def _multi_digamma(self, a, p, name="multi_digamma"):
     """Computes the multivariate digamma function; Psi_p(a)."""
     with self._name_scope(name, values=[a, p]):
       seq = self._multi_gamma_sequence(a, p)
       return math_ops.reduce_sum(math_ops.digamma(seq),
                                  axis=[-1])


 class WishartCholesky(_WishartLinearOperator):
   """The matrix Wishart distribution on positive definite matrices.

   This distribution is defined by a scalar degrees of freedom `df` and a
   lower, triangular Cholesky factor which characterizes the scale matrix.

   Using WishartCholesky is a constant-time improvement over WishartFull. It
   saves an O(nbk^3) operation, i.e., a matrix-product operation for sampling
   and a Cholesky factorization in log_prob. For most use-cases it often saves
   another O(nbk^3) operation since most uses of Wishart will also use the
   Cholesky factorization.

   #### Mathematical Details

   The probability density function (pdf) is,

   ```none
   pdf(X; df, scale) = det(X)**(0.5 (df-k-1)) exp(-0.5 tr[inv(scale) X]) / Z
   Z = 2**(0.5 df k) |det(scale)|**(0.5 df) Gamma_k(0.5 df)
   ```

   where:
   * `df >= k` denotes the degrees of freedom,
   * `scale` is a symmetric, positive definite, `k x k` matrix,
   * `Z` is the normalizing constant, and,
   * `Gamma_k` is the [multivariate Gamma function](
     https://en.wikipedia.org/wiki/Multivariate_gamma_function).


   #### Examples

   ```python
   import tensorflow_probability as tfp
   tfd = tfp.distributions

   # Initialize a single 3x3 Wishart with Cholesky factored scale matrix and 5
   # degrees-of-freedom.(*)
   df = 5
   chol_scale = tf.linalg.cholesky(...)  # Shape is [3, 3].
   dist = tfd.WishartCholesky(df=df, scale=chol_scale)

   # Evaluate this on an observation in R^3, returning a scalar.
   x = ...  # A 3x3 positive definite matrix.
   dist.prob(x)  # Shape is [], a scalar.

   # Evaluate this on a two observations, each in R^{3x3}, returning a length two
   # Tensor.
   x = [x0, x1]  # Shape is [2, 3, 3].
   dist.prob(x)  # Shape is [2].

   # Initialize two 3x3 Wisharts with Cholesky factored scale matrices.
   df = [5, 4]
   chol_scale = tf.linalg.cholesky(...)  # Shape is [2, 3, 3].
   dist = tfd.WishartCholesky(df=df, scale=chol_scale)

   # Evaluate this on four observations.
   x = [[x0, x1], [x2, x3]]  # Shape is [2, 2, 3, 3].
   dist.prob(x)  # Shape is [2, 2].

   # (*) - To efficiently create a trainable covariance matrix, see the example
   #   in tfp.distributions.matrix_diag_transform.
   ```

   """

   @deprecation.deprecated(
       "2018-10-01",
       "The TensorFlow Distributions library has moved to "
       "TensorFlow Probability "
       "(https://github.com/tensorflow/probability). You "
       "should update all references to use `tfp.distributions` "
       "instead of `tf.contrib.distributions`.",
       warn_once=True)
   def __init__(self,
                df,
                scale,
                cholesky_input_output_matrices=False,
                validate_args=False,
                allow_nan_stats=True,
                name="WishartCholesky"):
     """Construct Wishart distributions.

     Args:
       df: `float` or `double` `Tensor`. Degrees of freedom, must be greater than
         or equal to dimension of the scale matrix.
       scale: `float` or `double` `Tensor`. The Cholesky factorization of
         the symmetric positive definite scale matrix of the distribution.
       cholesky_input_output_matrices: Python `bool`. Any function which whose
         input or output is a matrix assumes the input is Cholesky and returns a
         Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
         `sample_n` returns a Cholesky when
         `cholesky_input_output_matrices=True`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
         outputs.
       allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
         (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
         result is undefined. When `False`, an exception is raised if one or
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = dict(locals())
     with ops.name_scope(name, values=[scale]) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
           scale = control_flow_ops.with_dependencies([
               check_ops.assert_positive(
                   array_ops.matrix_diag_part(scale),
                   message="scale must be positive definite"),
               check_ops.assert_equal(
                   array_ops.shape(scale)[-1],
                   array_ops.shape(scale)[-2],
                   message="scale must be square")
           ] if validate_args else [], scale)

       super(WishartCholesky, self).__init__(
           df=df,
           scale_operator=linalg.LinearOperatorLowerTriangular(
               tril=scale,
               is_non_singular=True,
               is_positive_definite=True,
               is_square=True),
           cholesky_input_output_matrices=cholesky_input_output_matrices,
           validate_args=validate_args,
           allow_nan_stats=allow_nan_stats,
           name=name)
     self._parameters = parameters


 class WishartFull(_WishartLinearOperator):
   """The matrix Wishart distribution on positive definite matrices.

   This distribution is defined by a scalar degrees of freedom `df` and a
   symmetric, positive definite scale matrix.

   Evaluation of the pdf, determinant, and sampling are all `O(k^3)` operations
   where `(k, k)` is the event space shape.

   #### Mathematical Details

   The probability density function (pdf) is,

   ```none
   pdf(X; df, scale) = det(X)**(0.5 (df-k-1)) exp(-0.5 tr[inv(scale) X]) / Z
   Z = 2**(0.5 df k) |det(scale)|**(0.5 df) Gamma_k(0.5 df)
   ```

   where:
   * `df >= k` denotes the degrees of freedom,
   * `scale` is a symmetric, positive definite, `k x k` matrix,
   * `Z` is the normalizing constant, and,
   * `Gamma_k` is the [multivariate Gamma function](
     https://en.wikipedia.org/wiki/Multivariate_gamma_function).

   #### Examples

   ```python
   import tensorflow_probability as tfp
   tfd = tfp.distributions

   # Initialize a single 3x3 Wishart with Full factored scale matrix and 5
   # degrees-of-freedom.(*)
   df = 5
   scale = ...  # Shape is [3, 3]; positive definite.
   dist = tfd.WishartFull(df=df, scale=scale)

   # Evaluate this on an observation in R^3, returning a scalar.
   x = ...  # A 3x3 positive definite matrix.
   dist.prob(x)  # Shape is [], a scalar.

   # Evaluate this on a two observations, each in R^{3x3}, returning a length two
   # Tensor.
   x = [x0, x1]  # Shape is [2, 3, 3].
   dist.prob(x)  # Shape is [2].

   # Initialize two 3x3 Wisharts with Full factored scale matrices.
   df = [5, 4]
   scale = ...  # Shape is [2, 3, 3].
   dist = tfd.WishartFull(df=df, scale=scale)

   # Evaluate this on four observations.
   x = [[x0, x1], [x2, x3]]  # Shape is [2, 2, 3, 3]; xi is positive definite.
   dist.prob(x)  # Shape is [2, 2].

   # (*) - To efficiently create a trainable covariance matrix, see the example
   #   in tfd.matrix_diag_transform.
   ```

   """

   @deprecation.deprecated(
       "2018-10-01",
       "The TensorFlow Distributions library has moved to "
       "TensorFlow Probability "
       "(https://github.com/tensorflow/probability). You "
       "should update all references to use `tfp.distributions` "
       "instead of `tf.contrib.distributions`.",
       warn_once=True)
   def __init__(self,
                df,
                scale,
                cholesky_input_output_matrices=False,
                validate_args=False,
                allow_nan_stats=True,
                name="WishartFull"):
     """Construct Wishart distributions.

     Args:
       df: `float` or `double` `Tensor`. Degrees of freedom, must be greater than
         or equal to dimension of the scale matrix.
       scale: `float` or `double` `Tensor`. The symmetric positive definite
         scale matrix of the distribution.
       cholesky_input_output_matrices: Python `bool`. Any function which whose
         input or output is a matrix assumes the input is Cholesky and returns a
         Cholesky factored matrix. Example `log_prob` input takes a Cholesky and
         `sample_n` returns a Cholesky when
         `cholesky_input_output_matrices=True`.
       validate_args: Python `bool`, default `False`. When `True` distribution
         parameters are checked for validity despite possibly degrading runtime
         performance. When `False` invalid inputs may silently render incorrect
         outputs.
       allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
         (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
         result is undefined. When `False`, an exception is raised if one or
         more of the statistic's batch members are undefined.
       name: Python `str` name prefixed to Ops created by this class.
     """
     parameters = dict(locals())
     with ops.name_scope(name) as name:
       with ops.name_scope("init", values=[scale]):
         scale = ops.convert_to_tensor(scale)
         if validate_args:
           scale = distribution_util.assert_symmetric(scale)
         chol = linalg_ops.cholesky(scale)
         chol = control_flow_ops.with_dependencies([
             check_ops.assert_positive(array_ops.matrix_diag_part(chol))
         ] if validate_args else [], chol)
     super(WishartFull, self).__init__(
         df=df,
         scale_operator=linalg.LinearOperatorLowerTriangular(
             tril=chol,
             is_non_singular=True,
             is_positive_definite=True,
             is_square=True),
         cholesky_input_output_matrices=cholesky_input_output_matrices,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
         name=name)
     self._parameters = parameters