# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Gradient checker for functions.
The gradient checker verifies numerically that an function properly
computes the gradients
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import tf_export
def _product(t):
  """Returns the product of the entries of `t`.

  Args:
    t: either a single integer (Python `int` or numpy integer scalar) or an
      iterable of factors, such as a shape.

  Returns:
    `t` itself when it is an integer scalar; otherwise the product of all
    elements of `t` (1 for an empty iterable).
  """
  # Integer scalars are not iterable; they already are their own product.
  if isinstance(t, (int, np.integer)):
    return t
  y = 1
  for x in t:
    y *= x
  return y
def _eval_indexed_slices(a):
  """Converts IndexedSlices to IndexedSlicesValue with numpy indices/values.

  When eager execution is enabled, converts IndexedSlices
  to IndexedSlicesValue with numpy indices/values.

  Args:
    a: any value.

  Returns:
    If a is IndexedSlices and eager execution is enabled, calls numpy() on a's
    fields. Otherwise returns a unchanged.
  """
  if isinstance(a, ops.IndexedSlices) and context.executing_eagerly():
    return ops.IndexedSlicesValue(
        indices=[x.numpy() for x in a.indices],
        values=[x.numpy() for x in a.values],
        # The dense shape is carried over as-is; `_to_numpy` reads it to
        # allocate the dense array.
        dense_shape=a.dense_shape)
  return a
def _to_numpy(a):
  """Converts Tensors, EagerTensors, and IndexedSlicesValue to numpy arrays.

  Args:
    a: any value.

  Returns:
    If a is EagerTensor or Tensor, returns the evaluation of a by calling
    numpy() or run(). If a is IndexedSlicesValue, constructs the corresponding
    dense numpy array. Otherwise returns a unchanged.
  """
  if isinstance(a, ops.EagerTensor):
    return a.numpy()
  if isinstance(a, ops.Tensor):
    # Graph tensor: evaluate it in the default session.
    sess = ops.get_default_session()
    return sess.run(a)
  if isinstance(a, ops.IndexedSlicesValue):
    arr = np.zeros(a.dense_shape)
    assert len(a.values) == len(a.indices), (
        "IndexedSlicesValue has %s value slices but %s indices\n%s" %
        (a.values, a.indices, a))
    for values_slice, index in zip(a.values, a.indices):
      assert 0 <= index < len(arr), (
          "IndexedSlicesValue has invalid index %s\n%s" % (index, a))
      # Accumulate rather than assign so repeated indices sum their slices.
      arr[index] += values_slice
    return arr
  return a
def _prepare(f, xs_dtypes, xs_shapes):
  """Return a function that executes 'f'.

  In TF 2.x, this is the same as `f`.
  In TF 1.x, returns a Python function that executes the graph defined by `f`
  in a Session.

  Args:
    f: the function.
    xs_dtypes: dtypes of f's arguments.
    xs_shapes: shapes of f's arguments.

  Returns:
    A Python callable that takes concrete argument values and returns the
    result of `f` on them.
  """
  if context.executing_eagerly():

    def decorated_eager(*xs_data):
      return f(*map(ops.convert_to_tensor, xs_data))

    return decorated_eager

  # Graph mode: build the graph for `f` once on placeholders, then feed the
  # concrete values on every call.
  xs = [
      array_ops.placeholder(x_dtype, shape=x_shape)
      for x_dtype, x_shape in zip(xs_dtypes, xs_shapes)
  ]
  y = f(*xs)
  # The default session is captured here, at preparation time.
  sess = ops.get_default_session()

  def decorated_graph(*xs_data):
    xs_data = [_to_numpy(a) for a in xs_data]
    return sess.run(y, feed_dict=dict(zip(xs, xs_data)))

  return decorated_graph
def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
  """Computes the theoretical Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_shape: the shape of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "y_size" rows
    and "x_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.

  Raises:
    ValueError: If result is empty but the gradient is nonzero, or if the
      gradient of an empty result does not have the shape of xs[param].
  """
  x = xs[param]
  # Complex vectors are treated as vectors of twice as many reals.
  x_shape = tuple(x.shape) + (2,) if x.dtype.is_complex else x.shape
  y_factor = 2 if y_dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
  y_size = _product(y_shape) * y_factor

  # Allocate 2-D Jacobian, with y dimensions smashed into the first
  # dimension and x dimensions smashed into the second.
  jacobian = np.zeros((y_size, x_size), dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each of the entry of dy, we set this to be 1 and
  # everything else to be 0 and compute the gradients -- this will give us one
  # row of the Jacobian matrix.
  dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
  # Flat, real-valued view onto `dy_data`; writes through it edit `dy_data`.
  dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
  grad_fn_unprep = backprop.gradients_function(f, [param])
  # `dy` is passed first so that it gets its own placeholder/argument slot
  # ahead of the regular inputs when the function is prepared.
  grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
                     [y_dtype] + [z.dtype for z in xs],
                     [None] + [z.shape for z in xs])
  for row in range(y_size):
    dy_data_flat[row] = 1
    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
    grad = _eval_indexed_slices(grad)
    if isinstance(grad, ops.IndexedSlicesValue):
      # Sparse gradient: index `i` selects a slice along x's first dimension,
      # which occupies `x_val_size` consecutive columns of the Jacobian.
      for i, v in zip(grad.indices, grad.values):
        c_begin = i * x_val_size
        c_end = c_begin + x_val_size
        jacobian[row, c_begin:c_end] += v.flat
    elif grad is not None:
      jacobian[row, :] = grad.ravel().view(jacobian.dtype)
    # This reset of `dy_data_flat` needs to happen after `grad` is copied to
    # `jacobian` because `grad` and `dy_data_flat` may share memory.
    dy_data_flat[row] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
    if grad.shape != x.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x.shape, grad.shape))
    if np.any(grad):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian
def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta):
  """Computes the numeric Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_size: the number of elements of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "y_size" rows
    and "x_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.
  """
  x_shape = xs[param].shape
  x_dtype = xs[param].dtype

  # To compute the jacobian, we treat x and y as one-dimensional vectors
  # (complex entries count as two reals each).
  x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
  y_size = y_size * (2 if y_dtype.is_complex else 1)
  # From here on `x_dtype` / `y_dtype` are the real numpy dtypes.
  x_dtype = x_dtype.real_dtype.as_numpy_dtype
  y_dtype = y_dtype.real_dtype.as_numpy_dtype

  # Record the tensor dtypes/shapes before `xs` is replaced by numpy arrays.
  xs_dtypes = [x.dtype for x in xs]
  xs_shapes = [x.shape for x in xs]
  # Converts xs to numpy arrays to do in-place perturbation.
  # Calls asarray() to avoid copying in ravel() later.
  xs = [np.asarray(_to_numpy(x)) for x in xs]
  x = xs[param]

  # Make sure we have the right types
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((y_size, x_size), dtype=x_dtype)

  # For each of the entry of x, we slightly perturbs this by adding and
  # subtracting a delta and then compute difference between the outputs. This
  # will give us one column of the Jacobian matrix.
  f = _prepare(f, xs_dtypes, xs_shapes)
  for col in range(x_size):
    # `x.ravel().view(x_dtype)` is re-derived for every access; the writes are
    # meant to land in `x` itself (see the asarray() note above).
    original = x.ravel().view(x_dtype)[col]
    x.ravel().view(x_dtype)[col] += delta
    y_pos = _to_numpy(f(*xs))
    # Restore the exact original value before perturbing the other way, so no
    # rounding error from the `+= delta` step carries over.
    x.ravel().view(x_dtype)[col] = original
    x.ravel().view(x_dtype)[col] -= delta
    y_neg = _to_numpy(f(*xs))
    x.ravel().view(x_dtype)[col] = original
    # Central difference: (f(x + delta) - f(x - delta)) / (2 * delta).
    diff = (y_pos - y_neg) / scale
    jacobian[:, col] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
def _compute_gradient(f, y_shape, y_dtype, xs, param, delta):
  """Computes the theoretical and numerical jacobian.

  Args:
    f: the function.
    y_shape: the shape of the result of `f`.
    y_dtype: the dtype of the result of `f`.
    xs: a list of tensors, the arguments to `f`.
    param: the index of the target parameter in `xs`.
    delta: the perturbation used for the numeric Jacobian.

  Returns:
    A pair `(theoretical_jacobian, numeric_jacobian)` of 2-d numpy arrays for
    xs[param].
  """
  x = xs[param]
  t = x.dtype
  allowed_types = [
      dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64,
      dtypes.complex64, dtypes.complex128
  ]
  assert t.base_dtype in allowed_types, ("Cannot compute gradient for "
                                         "unsupported type %s of argument %s" %
                                         (t.name, param))
  t2 = y_dtype
  assert t2.base_dtype in allowed_types, ("Cannot compute gradient for "
                                          "unsupported type %s of y" %
                                          t2.name)
  y_size = _product(y_shape)
  jacob_t = _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param)
  jacob_n = _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta)
  return jacob_t, jacob_n
def _compute_gradient_list(f, xs, delta):
  """Compute gradients for a list of x values.

  Args:
    f: the function.
    xs: the arguments to `f`, as values convertible to tensors.
    delta: the perturbation used for the numeric Jacobians.

  Returns:
    A pair of tuples: the theoretical Jacobians for every argument, followed
    by the numeric Jacobians for every argument.
  """
  # convert xs to tensors so that dtype and shape have uniform types
  xs = [ops.convert_to_tensor(x) for x in xs]
  # run the function to get info of the result
  xs_dtypes = [x.dtype for x in xs]
  xs_shapes = [x.shape for x in xs]
  f_temp = _prepare(f, xs_dtypes, xs_shapes)
  y = f_temp(*xs)
  # zip(*...) transposes the per-argument (theoretical, numeric) pairs into
  # one tuple of theoretical Jacobians and one tuple of numeric Jacobians.
  return tuple(zip(*[
      _compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype), xs, i, delta)
      for i in range(len(xs))
  ]))
@tf_export("test.compute_gradient", v1=[])
def compute_gradient(f, x, delta=None):
  """Computes the theoretical and numeric Jacobian of `f`.

  With y = f(x), computes the theoretical and numeric Jacobian dy/dx.

  Args:
    f: the function.
    x: the arguments for the function as a list or tuple of values convertible
      to a Tensor.
    delta: (optional) perturbation used to compute numeric Jacobian. Defaults
      to 1/1024 when not given.

  Returns:
    A pair of tuples, where the first is a tuple of 2-d numpy arrays
    representing the theoretical Jacobians for each argument, and the second
    tuple is the numerical ones. Each 2-d array has "y_size" rows
    and "x_size" columns where "x_size" is the number of elements in the
    corresponding argument and "y_size" is the number of elements in f(x).

  Raises:
    ValueError: If result is empty but the gradient is nonzero.
    ValueError: If x is neither a list nor a tuple.

  Example:

  ```python
  def test_func(x):
    return x*x

  theoretical, numerical = tf.test.compute_gradient(test_func, [1.0])
  theoretical, numerical
  # ((array([[2.]], dtype=float32),), (array([[2.000004]], dtype=float32),))
  ```
  """
  if not isinstance(x, (list, tuple)):
    raise ValueError(
        "`x` must be a list or tuple of values convertible to a Tensor "
        "(arguments to `f`), not a %s" % type(x))
  if delta is None:
    # By default, we use a step size for the central finite difference
    # approximation that is exactly representable as a binary floating
    # point number, since this reduces the amount of noise due to rounding
    # in the approximation of some functions.
    delta = 1.0 / 1024
  return _compute_gradient_list(f, x, delta)
def max_error(grad1, grad2):
  """Computes maximum elementwise gap.

  Computes the maximum elementwise gap between two lists of tensors of the same
  shape.

  Args:
    grad1: a list of tensors.
    grad2: a list of tensors with the same shape as grad1.

  Returns:
    The maximum elementwise gap between the two. This is 0 when the lists are
    empty or contain only zero-size tensors.
  """
  error = 0
  for j_t, j_n in zip(grad1, grad2):
    if j_t.size or j_n.size:  # Handle zero size tensors correctly
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error