| # Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Eager-graph unified check numerics callback.""" |
| |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| |
| import collections |
| import threading |
| |
| import numpy as np |
| |
| from tensorflow.core.protobuf import debug_event_pb2 |
| from tensorflow.python.debug.lib import op_callbacks_common |
| from tensorflow.python.debug.lib import source_utils |
| from tensorflow.python.framework import op_callbacks |
| from tensorflow.python.framework import ops |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import gen_debug_ops |
| from tensorflow.python.platform import tf_logging as logging |
| from tensorflow.python.util import compat |
| from tensorflow.python.util.tf_export import tf_export |
| |
| |
| # Many ops have benign NaN outputs, and running them with check_numerics |
| # on will create unwanted errors |
| # TODO(b/142497024): Replace this whitelist with function decorators in the ops |
| IGNORE_OP_OUTPUTS = ( |
| # For FusedBatchNorm, if the input tensor is empty then batch_mean and |
| # batch_variance will be NaN. reserve_space holds intermediate values |
| # derived from batch_mean and batch_variance used for gradient calculation |
| (b"FusedBatchNorm", 1), # batch_mean |
| (b"FusedBatchNorm", 2), # batch_variance |
| (b"FusedBatchNorm", 3), # reserve_space_1 |
| (b"FusedBatchNorm", 4), # reserve_space_2 |
| |
| # Same as above |
| (b"FusedBatchNormV2", 1), # batch_mean |
| (b"FusedBatchNormV2", 2), # batch_variance |
| (b"FusedBatchNormV2", 3), # reserve_space_1 |
| (b"FusedBatchNormV2", 4), # reserve_space_2 |
| |
| # Same as above, but reserve_space_3 holds additional intermediate values |
| (b"FusedBatchNormV3", 1), # batch_mean |
| (b"FusedBatchNormV3", 2), # batch_variance |
| (b"FusedBatchNormV3", 3), # reserve_space_1 |
| (b"FusedBatchNormV3", 4), # reserve_space_2 |
| (b"FusedBatchNormV3", 5), # reserve_space_3 |
| ) |
| |
| # Some frequently used ops are generally safe and we can skip them to reduce |
| # overhead. NOTE: This list is compiled by observing operations called by |
| # models in practice and is not a comprehensive list of safe operations. |
| SAFE_OPS = ( |
| b"Concat", |
| b"ConcatV2", |
| b"ExpandDims", |
| b"Fill", |
| b"Gather", |
| b"Maximum", |
| b"Minimum", |
| b"Reshape", |
| b"Slice", |
| b"Squeeze", |
| b"Stack", |
| b"StridedSlice", |
| b"StridedSliceGrad", |
| b"TensorListConcatV2", |
| b"TensorListGather", |
| b"TensorListGetItem", |
| b"TensorListPopBack", |
| b"TensorListStack", |
| b"Transpose", |
| b"Unpack", |
| ) |
| |
| _state = threading.local() |
| |
| |
| def limit_string_length(string, max_len=50): |
| """Limit the length of input string. |
| |
| Args: |
| string: Input string. |
| max_len: (int or None) If int, the length limit. If None, no limit. |
| |
| Returns: |
| Possibly length-limited string. |
| """ |
| if max_len is None or len(string) <= max_len: |
| return string |
| else: |
| return "..." + string[len(string) - max_len:] |
| |
| |
| # A dictionary that supports looking up the original input tensor names. |
| _CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict) |
| |
| |
| def _maybe_lookup_original_input_tensor(graph, tensor): |
| if (graph and |
| graph in _CHECK_NUMERICS_INPUT_LOOKUP and |
| tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]): |
| return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name] |
| else: |
| return tensor |
| |
| |
| def get_check_numerics_error_message(slot, |
| num_outputs, |
| op_type, |
| tensor, |
| inputs, |
| graph=None, |
| traceback=None, |
| stack_height_limit=30, |
| path_length_limit=50): |
| """Create a meaningful and user-friendly error message about offending tensor. |
| |
| The error message reveals the following info about the op that outputs |
| NaN/Infinity: dtype, shape (to the extent known at graph-construction time), |
| input tensors, stack trace for op creation (if is graph mode). |
| |
| Args: |
| slot: (int) slot index of the tensor output. |
| num_outputs: (int) total number of outputs of the op. |
| op_type: (str) Type of the that generates `tensor`. |
| tensor: (Tensor) the offending tensor, i.e., the tensor that contains |
| Infinities or NaNs. |
| inputs: (array of Tensor) inputs to the op that generates `tensor`. |
| graph: (tf.Graph) the graph object that `tensor` belongs to. Available only |
| under graph mode. |
| traceback: (list of trace frames) the stack trace of the op's creation. |
| Available only under graph model. |
| stack_height_limit: (int or None) If int, limit to the height of the stack |
| trace printed in the error message. If None, no limit to the height. |
| path_length_limit: (int or None) Length limit for file paths included in the |
| formatted stack trace. |
| |
| Returns: |
| (str) A formatted error message. |
| """ |
| eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing" |
| message = "\n" |
| message += ( |
| "\n!!! Detected Infinity or NaN in output %d of " |
| "%s op \"%s\" (# of outputs: %d) !!!\n" % |
| (slot, eager_vs_graph_qualifier, op_type, num_outputs)) |
| |
| message += " dtype: %s\n" % tensor.dtype |
| message += " shape: %s\n" % (tensor.shape,) |
| |
| if not graph: |
| # This is an eager tensor. We can get its numpy value and count |
| # NaNs and Infs. |
| is_inf = np.isinf(tensor) |
| |
| num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf)) |
| num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf)) |
| num_nan = np.sum(np.isnan(tensor)) |
| if num_neg_inf > 0: |
| message += " # of -Inf elements: %s\n" % num_neg_inf |
| if num_pos_inf > 0: |
| message += " # of +Inf elements: %s\n" % num_pos_inf |
| if num_nan: |
| message += " # of +NaN elements: %s\n" % num_nan |
| |
| if len(inputs) > 1: |
| message += "\n Input tensors (%d):\n" % len(inputs) |
| for slot, input_tensor in enumerate(inputs): |
| message += " %d: %s\n" % ( |
| slot, _maybe_lookup_original_input_tensor(graph, input_tensor)) |
| elif len(inputs) == 1: |
| message += "\n Input tensor: %s\n" % ( |
| _maybe_lookup_original_input_tensor(graph, inputs[0])) |
| if graph and hasattr(graph, "name") and graph.name: |
| message += " Graph name: \"%s\"\n" % graph.name |
| |
| # Format the stack trace for the op's creation. We omit files that |
| # belong to tensorflow itself. |
| if graph and traceback: |
| message += ( |
| "\n Stack trace of op's creation (\"->\": inferred user code):\n") |
| if stack_height_limit is not None and len(traceback) > stack_height_limit: |
| num_omitted_frames = len(traceback) - stack_height_limit |
| message += " + ... (Omitted %d frames)\n" % num_omitted_frames |
| for filepath, lineno, function_name, source_line in traceback[ |
| -stack_height_limit:]: |
| user_code_indicator = " " |
| if not source_utils.guess_is_tensorflow_py_library(filepath): |
| user_code_indicator = " -> " |
| |
| message += " + %s (L%d) %s\n" % ( |
| limit_string_length(filepath, path_length_limit), lineno, |
| function_name) |
| if source_line is not None: |
| message += "%s| %s\n" % (user_code_indicator, source_line) |
| message += "\n" |
| return message |
| |
| |
| def _debug_summary(x): |
| return gen_debug_ops.debug_numeric_summary_v2( |
| x, |
| tensor_debug_mode=( |
| debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS)) |
| |
| |
| class CheckNumericsCallback(object): |
| """Wrapper for the numerics-checking callback for thread locality.""" |
| |
| def __init__(self, stack_height_limit, path_length_limit): |
| self._stack_height_limit = stack_height_limit |
| self._path_length_limit = path_length_limit |
| |
| def callback(self, |
| op_type, |
| inputs, |
| attrs, |
| outputs, |
| op_name=None, |
| graph=None): |
| """Eager-function unified callback for checking numerics.""" |
| del attrs, op_name # Unused |
| op_type_bytes = compat.as_bytes(op_type) |
| is_v1_graph_mode = not ops.executing_eagerly_outside_functions() |
| if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or |
| op_type_bytes in SAFE_OPS): |
| return None |
| if graph: |
| # Under graph mode. Insert check_numerics op. |
| instrumented_outputs = [] |
| for slot, output in enumerate(outputs): |
| if (output.dtype.is_floating and |
| (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS): |
| checked_output = array_ops.check_numerics_v2( |
| # TF v2 has automatic control dependencies added to stateful async |
| # ops, which allows us to run check_numerics asynchronously. |
| # In the above case we use debug_summary to reduce all output |
| # tensors asynchronously from the op being checked and then |
| # process the tensor summary with check_numerics. |
| output if is_v1_graph_mode else _debug_summary(output), |
| get_check_numerics_error_message( |
| slot, |
| len(outputs), |
| op_type, |
| output, |
| inputs, |
| graph=graph, |
| traceback=output.op.traceback)) |
| _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output |
| instrumented_outputs.append( |
| checked_output if is_v1_graph_mode else output) |
| else: |
| instrumented_outputs.append(output) |
| return instrumented_outputs |
| else: |
| if op_type_bytes == b"CheckNumericsV2": |
| # TODO(b/140334369): Remove this special casing logic once op_callback. |
| # automatically prevents infinite recursion in eager mode. |
| return None |
| # Under eager mode. Eagerly execute check_numerics op. |
| for slot, output in enumerate(outputs): |
| if (output.dtype.is_floating and |
| (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS): |
| array_ops.check_numerics_v2( |
| output, |
| get_check_numerics_error_message( |
| slot, len(outputs), op_type, output, inputs, |
| stack_height_limit=self._stack_height_limit, |
| path_length_limit=self._path_length_limit)) |
| |
| |
| @tf_export("debugging.enable_check_numerics") |
| def enable_check_numerics(stack_height_limit=30, |
| path_length_limit=50): |
| r"""Enable tensor numerics checking in an eager/graph unified fashion. |
| |
| The numerics checking mechanism will cause any TensorFlow eager execution or |
| graph execution to error out as soon as an op's output tensor contains |
| infinity or NaN. |
| |
| This method is idempotent. Calling it multiple times has the same effect |
| as calling it once. |
| |
| This method takes effect only on the thread in which it is called. |
| |
| When a op's float-type output tensor contains any Infinity or NaN, an |
| `tf.errors.InvalidArgumentError` will be thrown, with an error message that |
| reveals the following information: |
| - The type of the op that generated the tensor with bad numerics. |
| - Data type (dtype) of the tensor. |
| - Shape of the tensor (to the extent known at the time of eager execution |
| or graph construction). |
| - Name of the containing graph (if available). |
| - (Graph mode only): The stack trace of the intra-graph op's creation, |
| with a stack-height limit and a path-length limit for visual clarity. |
| The stack frames that belong to the user's code (as opposed to |
| tensorflow's internal code) are highlighted with a text arrow ("->"). |
| - (Eager mode only): How many of the offending tensor's elements are |
| `Infinity` and `NaN`, respectively. |
| |
| Once enabled, the check-numerics mechanism can be disabled by using |
| `tf.debugging.disable_check_numerics()`. |
| |
| Example usage: |
| |
| 1. Catching infinity during the execution of a `tf.function` graph: |
| |
| ```py |
| import tensorflow as tf |
| |
| tf.debugging.enable_check_numerics() |
| |
| @tf.function |
| def square_log_x_plus_1(x): |
| v = tf.math.log(x + 1) |
| return tf.math.square(v) |
| |
| x = -1.0 |
| |
| # When the following line runs, a function graph will be compiled |
| # from the Python function `log_x_plus_1()`. Due to the |
| # `enable_check_numerics()` call above, the graph will contain |
| # numerics checking ops that will run during the function graph's |
| # execution. The function call generates an -infinity when the Log |
| # (logarithm) op operates on the output tensor of the Add op. |
| # The program errors out at this line, printing an error message. |
| y = log_x_plus_1(x) |
| z = -y |
| ``` |
| |
| 2. Catching NaN during eager execution: |
| |
| ```py |
| import numpy as np |
| import tensorflow as tf |
| |
| tf.debugging.enable_check_numerics() |
| |
| x = np.array([[0.0, -1.0], [4.0, 3.0]]) |
| |
| # The following line executes the Sqrt op eagerly. Due to the negative |
| # element in the input array, a NaN is generated. Due to the |
| # `enable_check_numerics()` call above, the program errors immediately |
| # at this line, printing an error message. |
| y = tf.math.sqrt(x) |
| z = tf.matmul(y, y) |
| ``` |
| |
| Args: |
| stack_height_limit: Limit to the height of the printed stack trace. |
| Applicable only to ops in `tf.function`s (graphs). |
| path_length_limit: Limit to the file path included in the printed stack |
| trace. Applicable only to ops in `tf.function`s (graphs). |
| """ |
| if not hasattr(_state, "check_numerics_callback"): |
| _state.check_numerics_callback = CheckNumericsCallback( |
| stack_height_limit, path_length_limit) |
| op_callbacks.add_op_callback(_state.check_numerics_callback.callback) |
| |
| logging.info( |
| "Enabled check-numerics callback in thread %s", |
| threading.current_thread().name) |
| |
| |
| @tf_export("debugging.disable_check_numerics") |
| def disable_check_numerics(): |
| """Disable the eager/graph unified numerics checking mechanism. |
| |
| This method can be used after a call to `tf.debugging.enable_check_numerics()` |
| to disable the numerics-checking mechanism that catches inifnity and NaN |
| values output by ops executed eagerly or in tf.function-compiled graphs. |
| |
| This method is idempotent. Calling it multiple times has the same effect |
| as calling it once. |
| |
| This method takes effect only on the thread in which it is called. |
| """ |
| if not hasattr(_state, "check_numerics_callback"): |
| return |
| try: |
| op_callbacks.remove_op_callback(_state.check_numerics_callback.callback) |
| delattr(_state, "check_numerics_callback") |
| logging.info( |
| "Disabled check-numerics callback in thread %s", |
| threading.current_thread().name) |
| except KeyError: |
| # Tolerate disabling the check numerics callback without |
| # enable_check_numerics() being called first. |
| pass |