| # Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Student's t distribution class.""" |
| |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| |
| import numpy as np |
| |
| from tensorflow.python.framework import constant_op |
| from tensorflow.python.framework import dtypes |
| from tensorflow.python.framework import ops |
| from tensorflow.python.framework import tensor_shape |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import check_ops |
| from tensorflow.python.ops import control_flow_ops |
| from tensorflow.python.ops import math_ops |
| from tensorflow.python.ops import nn |
| from tensorflow.python.ops import random_ops |
| from tensorflow.python.ops import special_math_ops |
| from tensorflow.python.ops.distributions import distribution |
| from tensorflow.python.ops.distributions import util as distribution_util |
| from tensorflow.python.util.tf_export import tf_export |
| |
| |
| __all__ = [ |
| "StudentT", |
| "StudentTWithAbsDfSoftplusScale", |
| ] |
| |
| |
| @tf_export("distributions.StudentT") |
| class StudentT(distribution.Distribution): |
| """Student's t-distribution. |
| |
  This distribution has parameters: degrees of freedom `df`, location `loc`,
  and scale `scale`.
| |
| #### Mathematical details |
| |
| The probability density function (pdf) is, |
| |
| ```none |
| pdf(x; df, mu, sigma) = (1 + y**2 / df)**(-0.5 (df + 1)) / Z |
| where, |
| y = (x - mu) / sigma |
| Z = abs(sigma) sqrt(df pi) Gamma(0.5 df) / Gamma(0.5 (df + 1)) |
| ``` |
| |
| where: |
| * `loc = mu`, |
  * `scale = sigma`,
  * `Z` is the normalization constant, and
| * `Gamma` is the [gamma function]( |
| https://en.wikipedia.org/wiki/Gamma_function). |
| |
| The StudentT distribution is a member of the [location-scale family]( |
| https://en.wikipedia.org/wiki/Location-scale_family), i.e., it can be |
| constructed as, |
| |
| ```none |
| X ~ StudentT(df, loc=0, scale=1) |
| Y = loc + scale * X |
| ``` |
| |
  Notice that `scale` has semantics more similar to standard deviation than
  variance. However, it is not actually the standard deviation; the Student's
  t-distribution standard deviation is `scale sqrt(df / (df - 2))` when
  `df > 2`.
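
  For example, a minimal sketch of the distinction (this relies on the
  `stddev` method inherited from the `Distribution` base class, i.e. the
  square root of the variance):

  ```python
  dist = tf.distributions.StudentT(df=3., loc=0., scale=2.)
  # Standard deviation is scale * sqrt(df / (df - 2)):
  # 2. * sqrt(3. / (3. - 2.)) ~= 3.46, not 2.
  dist.stddev()
  ```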
| |
| Samples of this distribution are reparameterized (pathwise differentiable). |
| The derivatives are computed using the approach described in the paper |
| |
| [Michael Figurnov, Shakir Mohamed, Andriy Mnih. |
| Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) |
| |
| #### Examples |
| |
| Examples of initialization of one or a batch of distributions. |
| |
| ```python |
  # Define a single scalar Student t distribution with loc 0 and scale 1
  # (`loc` and `scale` are required arguments).
  single_dist = tf.distributions.StudentT(df=3., loc=0., scale=1.)
| |
| # Evaluate the pdf at 1, returning a scalar Tensor. |
| single_dist.prob(1.) |
| |
  # Define a batch of two scalar-valued Student t's.
  # The first has degrees of freedom 2, mean 1, and scale 11.
  # The second has degrees of freedom 3, mean 2, and scale 22.
| multi_dist = tf.distributions.StudentT(df=[2, 3], |
| loc=[1, 2.], |
| scale=[11, 22.]) |
| |
| # Evaluate the pdf of the first distribution on 0, and the second on 1.5, |
| # returning a length two tensor. |
| multi_dist.prob([0, 1.5]) |
| |
| # Get 3 samples, returning a 3 x 2 tensor. |
| multi_dist.sample(3) |
| ``` |
| |
| Arguments are broadcast when possible. |
| |
| ```python |
| # Define a batch of two Student's t distributions. |
| # Both have df 2 and mean 1, but different scales. |
| dist = tf.distributions.StudentT(df=2, loc=1, scale=[11, 22.]) |
| |
| # Evaluate the pdf of both distributions on the same point, 3.0, |
| # returning a length 2 tensor. |
| dist.prob(3.0) |
| ``` |
| |
| Compute the gradients of samples w.r.t. the parameters: |
| |
| ```python |
| df = tf.constant(2.0) |
| loc = tf.constant(2.0) |
| scale = tf.constant(11.0) |
| dist = tf.distributions.StudentT(df=df, loc=loc, scale=scale) |
| samples = dist.sample(5) # Shape [5] |
| loss = tf.reduce_mean(tf.square(samples)) # Arbitrary loss function |
| # Unbiased stochastic gradients of the loss function |
| grads = tf.gradients(loss, [df, loc, scale]) |
| ``` |
| |
| """ |
| # pylint: enable=line-too-long |
| |
| def __init__(self, |
| df, |
| loc, |
| scale, |
| validate_args=False, |
| allow_nan_stats=True, |
| name="StudentT"): |
| """Construct Student's t distributions. |
| |
    The distributions have degrees of freedom `df`, mean `loc`, and scale
| `scale`. |
| |
| The parameters `df`, `loc`, and `scale` must be shaped in a way that |
| supports broadcasting (e.g. `df + loc + scale` is a valid operation). |
| |
| Args: |
| df: Floating-point `Tensor`. The degrees of freedom of the |
| distribution(s). `df` must contain only positive values. |
| loc: Floating-point `Tensor`. The mean(s) of the distribution(s). |
| scale: Floating-point `Tensor`. The scaling factor(s) for the |
| distribution(s). Note that `scale` is not technically the standard |
| deviation of this distribution but has semantics more similar to |
| standard deviation than variance. |
| validate_args: Python `bool`, default `False`. When `True` distribution |
| parameters are checked for validity despite possibly degrading runtime |
| performance. When `False` invalid inputs may silently render incorrect |
| outputs. |
| allow_nan_stats: Python `bool`, default `True`. When `True`, |
| statistics (e.g., mean, mode, variance) use the value "`NaN`" to |
| indicate the result is undefined. When `False`, an exception is raised |
| if one or more of the statistic's batch members are undefined. |
| name: Python `str` name prefixed to Ops created by this class. |
| |
| Raises: |
| TypeError: if loc and scale are different dtypes. |
| """ |
| parameters = dict(locals()) |
| with ops.name_scope(name, values=[df, loc, scale]) as name: |
| with ops.control_dependencies([check_ops.assert_positive(df)] |
| if validate_args else []): |
| self._df = array_ops.identity(df, name="df") |
| self._loc = array_ops.identity(loc, name="loc") |
| self._scale = array_ops.identity(scale, name="scale") |
| check_ops.assert_same_float_dtype( |
| (self._df, self._loc, self._scale)) |
| super(StudentT, self).__init__( |
| dtype=self._scale.dtype, |
| reparameterization_type=distribution.FULLY_REPARAMETERIZED, |
| validate_args=validate_args, |
| allow_nan_stats=allow_nan_stats, |
| parameters=parameters, |
| graph_parents=[self._df, self._loc, self._scale], |
| name=name) |
| |
| @staticmethod |
| def _param_shapes(sample_shape): |
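    # df, loc, and scale enter elementwise, so each parameter's shape is
    # simply `sample_shape`.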
| return dict( |
| zip(("df", "loc", "scale"), ( |
| [ops.convert_to_tensor( |
| sample_shape, dtype=dtypes.int32)] * 3))) |
| |
| @property |
| def df(self): |
| """Degrees of freedom in these Student's t distribution(s).""" |
| return self._df |
| |
| @property |
| def loc(self): |
| """Locations of these Student's t distribution(s).""" |
| return self._loc |
| |
| @property |
| def scale(self): |
| """Scaling factors of these Student's t distribution(s).""" |
| return self._scale |
| |
| def _batch_shape_tensor(self): |
| return array_ops.broadcast_dynamic_shape( |
| array_ops.shape(self.df), |
| array_ops.broadcast_dynamic_shape( |
| array_ops.shape(self.loc), array_ops.shape(self.scale))) |
| |
| def _batch_shape(self): |
| return array_ops.broadcast_static_shape( |
| array_ops.broadcast_static_shape(self.df.get_shape(), |
| self.loc.get_shape()), |
| self.scale.get_shape()) |
| |
| def _event_shape_tensor(self): |
    return constant_op.constant([], dtype=dtypes.int32)
| |
| def _event_shape(self): |
| return tensor_shape.scalar() |
| |
| def _sample_n(self, n, seed=None): |
| # The sampling method comes from the fact that if: |
| # X ~ Normal(0, 1) |
| # Z ~ Chi2(df) |
| # Y = X / sqrt(Z / df) |
| # then: |
| # Y ~ StudentT(df). |
| shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) |
| normal_sample = random_ops.random_normal(shape, dtype=self.dtype, seed=seed) |
| df = self.df * array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype) |
| gamma_sample = random_ops.random_gamma( |
| [n], |
| 0.5 * df, |
| beta=0.5, |
| dtype=self.dtype, |
| seed=distribution_util.gen_new_seed(seed, salt="student_t")) |
| samples = normal_sample * math_ops.rsqrt(gamma_sample / df) |
| return samples * self.scale + self.loc # Abs(scale) not wanted. |
| |
| def _log_prob(self, x): |
| return self._log_unnormalized_prob(x) - self._log_normalization() |
| |
| def _log_unnormalized_prob(self, x): |
| y = (x - self.loc) / self.scale # Abs(scale) superfluous. |
| return -0.5 * (self.df + 1.) * math_ops.log1p(y**2. / self.df) |
| |
| def _log_normalization(self): |
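    # log(Z) from the class docstring, where
    # Z = abs(scale) * sqrt(df * pi) * Gamma(0.5 * df) / Gamma(0.5 * (df + 1)).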
| return (math_ops.log(math_ops.abs(self.scale)) + |
| 0.5 * math_ops.log(self.df) + |
| 0.5 * np.log(np.pi) + |
| math_ops.lgamma(0.5 * self.df) - |
| math_ops.lgamma(0.5 * (self.df + 1.))) |
| |
| def _cdf(self, x): |
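    # CDF in terms of the regularized incomplete beta function I_z(a, b)
    # (math_ops.betainc): for y <= 0,
    #   P(Y <= y) = 0.5 * I_{df / (df + y**2)}(0.5 * df, 0.5),
    # and the y > 0 branch follows by symmetry.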
| # Take Abs(scale) to make subsequent where work correctly. |
| y = (x - self.loc) / math_ops.abs(self.scale) |
| x_t = self.df / (y**2. + self.df) |
| neg_cdf = 0.5 * math_ops.betainc(0.5 * self.df, 0.5, x_t) |
| return array_ops.where(math_ops.less(y, 0.), neg_cdf, 1. - neg_cdf) |
| |
| def _entropy(self): |
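    # Entropy of a location-scale Student's t:
    #   H = log(abs(scale)) + 0.5 * log(df) + log(Beta(0.5 * df, 0.5))
    #       + 0.5 * (df + 1) * (digamma(0.5 * (df + 1)) - digamma(0.5 * df))
    # lbeta reduces over the last axis, so [0.5 * df, 0.5] is stacked there.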
| v = array_ops.ones(self.batch_shape_tensor(), |
| dtype=self.dtype)[..., array_ops.newaxis] |
| u = v * self.df[..., array_ops.newaxis] |
| beta_arg = array_ops.concat([u, v], -1) / 2. |
| return (math_ops.log(math_ops.abs(self.scale)) + |
| 0.5 * math_ops.log(self.df) + |
| special_math_ops.lbeta(beta_arg) + |
| 0.5 * (self.df + 1.) * |
| (math_ops.digamma(0.5 * (self.df + 1.)) - |
| math_ops.digamma(0.5 * self.df))) |
| |
| @distribution_util.AppendDocstring( |
| """The mean of Student's T equals `loc` if `df > 1`, otherwise it is |
      `NaN`. If `self.allow_nan_stats=False`, then an exception will be raised
| rather than returning `NaN`.""") |
| def _mean(self): |
| mean = self.loc * array_ops.ones(self.batch_shape_tensor(), |
| dtype=self.dtype) |
| if self.allow_nan_stats: |
| nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype()) |
| return array_ops.where( |
| math_ops.greater( |
| self.df, |
| array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)), |
| mean, |
| array_ops.fill(self.batch_shape_tensor(), nan, name="nan")) |
| else: |
| return control_flow_ops.with_dependencies( |
| [ |
| check_ops.assert_less( |
| array_ops.ones([], dtype=self.dtype), |
| self.df, |
| message="mean not defined for components of df <= 1"), |
| ], |
| mean) |
| |
| @distribution_util.AppendDocstring(""" |
| The variance for Student's T equals |
| |
| ``` |
      scale**2 * df / (df - 2), when df > 2
| infinity, when 1 < df <= 2 |
| NaN, when df <= 1 |
| ``` |
| """) |
| def _variance(self): |
    # Keep the denominator finite (via its own tf.where) even where df <= 2;
    # otherwise the unused branch of the outer tf.where below would introduce
    # NaNs into the gradient.
| denom = array_ops.where(math_ops.greater(self.df, 2.), |
| self.df - 2., |
| array_ops.ones_like(self.df)) |
| # Abs(scale) superfluous. |
| var = (array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype) * |
| math_ops.square(self.scale) * self.df / denom) |
| # When 1 < df <= 2, variance is infinite. |
| inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype()) |
| result_where_defined = array_ops.where( |
| self.df > array_ops.fill(self.batch_shape_tensor(), 2.), |
| var, |
| array_ops.fill(self.batch_shape_tensor(), inf, name="inf")) |
| |
| if self.allow_nan_stats: |
| nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype()) |
| return array_ops.where( |
| math_ops.greater( |
| self.df, |
| array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)), |
| result_where_defined, |
| array_ops.fill(self.batch_shape_tensor(), nan, name="nan")) |
| else: |
| return control_flow_ops.with_dependencies( |
| [ |
| check_ops.assert_less( |
| array_ops.ones([], dtype=self.dtype), |
| self.df, |
| message="variance not defined for components of df <= 1"), |
| ], |
| result_where_defined) |
| |
| def _mode(self): |
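    # The density is unimodal and symmetric, so the mode equals `loc` for all
    # valid df.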
| return array_ops.identity(self.loc) |
| |
| |
| class StudentTWithAbsDfSoftplusScale(StudentT): |
| """StudentT with `df = floor(abs(df))` and `scale = softplus(scale)`.""" |
| |
| def __init__(self, |
| df, |
| loc, |
| scale, |
| validate_args=False, |
| allow_nan_stats=True, |
| name="StudentTWithAbsDfSoftplusScale"): |
| parameters = dict(locals()) |
| with ops.name_scope(name, values=[df, scale]) as name: |
| super(StudentTWithAbsDfSoftplusScale, self).__init__( |
| df=math_ops.floor(math_ops.abs(df)), |
| loc=loc, |
| scale=nn.softplus(scale, name="softplus_scale"), |
| validate_args=validate_args, |
| allow_nan_stats=allow_nan_stats, |
| name=name) |
| self._parameters = parameters |