| # Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Recurrent layers for TF 2.0. |
| """ |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| |
| import uuid |
| |
| from tensorflow.python.eager import context |
| from tensorflow.python.eager import function |
| from tensorflow.python.framework import constant_op |
| from tensorflow.python.framework import device |
| from tensorflow.python.framework import dtypes |
| from tensorflow.python.framework import ops |
| from tensorflow.python.keras import backend as K |
| from tensorflow.python.keras.engine.input_spec import InputSpec |
| from tensorflow.python.keras.layers import recurrent |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import control_flow_ops |
| from tensorflow.python.ops import gen_cudnn_rnn_ops |
| from tensorflow.python.ops import math_ops |
| from tensorflow.python.ops import state_ops |
| from tensorflow.python.util.tf_export import keras_export |
| |
| |
# The following string constants are used by the Defun approach for the
# unified backend of LSTM and GRU.
| _DEFUN_API_NAME_ATTRIBUTE = 'api_implements' |
| _DEFUN_DEVICE_ATTRIBUTE = 'api_preferred_device' |
| _CPU_DEVICE_NAME = 'CPU' |
| _GPU_DEVICE_NAME = 'GPU' |
| |
# The following number constants are used to represent the runtime of the
# defun backend function. Since the CPU and GPU implementations are
# mathematically identical, we need a signal for the function to indicate
# which backend was executed. This is used by tests to verify the correctness
# of the backend function swapping.
| _RUNTIME_UNKNOWN = 0 |
| _RUNTIME_CPU = 1 |
| _RUNTIME_GPU = 2 |
| |
| |
| @keras_export('keras.layers.GRUCell', v1=[]) |
| class GRUCell(recurrent.GRUCell): |
| """Cell class for the GRU layer. |
| |
| Arguments: |
| units: Positive integer, dimensionality of the output space. |
| activation: Activation function to use. Default: hyperbolic tangent |
| (`tanh`). If you pass None, no activation is applied |
| (ie. "linear" activation: `a(x) = x`). |
| recurrent_activation: Activation function to use for the recurrent step. |
| Default: sigmoid (`sigmoid`). If you pass `None`, no activation is |
| applied (ie. "linear" activation: `a(x) = x`). |
| use_bias: Boolean, whether the layer uses a bias vector. |
| kernel_initializer: Initializer for the `kernel` weights matrix, |
| used for the linear transformation of the inputs. |
| recurrent_initializer: Initializer for the `recurrent_kernel` |
| weights matrix, used for the linear transformation of the recurrent state. |
| bias_initializer: Initializer for the bias vector. |
| kernel_regularizer: Regularizer function applied to the `kernel` weights |
| matrix. |
| recurrent_regularizer: Regularizer function applied to the |
| `recurrent_kernel` weights matrix. |
| bias_regularizer: Regularizer function applied to the bias vector. |
| kernel_constraint: Constraint function applied to the `kernel` weights |
| matrix. |
| recurrent_constraint: Constraint function applied to the `recurrent_kernel` |
| weights matrix. |
| bias_constraint: Constraint function applied to the bias vector. |
| dropout: Float between 0 and 1. Fraction of the units to drop for the |
| linear transformation of the inputs. |
| recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for |
| the linear transformation of the recurrent state. |
| implementation: Implementation mode, either 1 or 2. |
| Mode 1 will structure its operations as a larger number of |
| smaller dot products and additions, whereas mode 2 (default) will |
| batch them into fewer, larger operations. These modes will |
| have different performance profiles on different hardware and |
| for different applications. |
| reset_after: GRU convention (whether to apply reset gate after or |
| before matrix multiplication). False = "before", |
| True = "after" (default and CuDNN compatible). |
| |
| Call arguments: |
| inputs: A 2D tensor. |
| states: List of state tensors corresponding to the previous timestep. |
| training: Python boolean indicating whether the layer should behave in |
| training mode or in inference mode. Only relevant when `dropout` or |
| `recurrent_dropout` is used. |
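
  Example (a minimal usage sketch; the shapes below are illustrative):

  ```python
  import tensorflow as tf

  inputs = tf.random.normal([32, 10, 8])  # [batch, time, feature]
  rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4))
  output = rnn(inputs)  # output shape: (32, 4)
  ```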
| """ |
| |
| def __init__(self, |
| units, |
| activation='tanh', |
| recurrent_activation='sigmoid', |
| use_bias=True, |
| kernel_initializer='glorot_uniform', |
| recurrent_initializer='orthogonal', |
| bias_initializer='zeros', |
| kernel_regularizer=None, |
| recurrent_regularizer=None, |
| bias_regularizer=None, |
| kernel_constraint=None, |
| recurrent_constraint=None, |
| bias_constraint=None, |
| dropout=0., |
| recurrent_dropout=0., |
| implementation=2, |
| reset_after=True, |
| **kwargs): |
| super(GRUCell, self).__init__( |
| units, |
| activation=activation, |
| recurrent_activation=recurrent_activation, |
| use_bias=use_bias, |
| kernel_initializer=kernel_initializer, |
| recurrent_initializer=recurrent_initializer, |
| bias_initializer=bias_initializer, |
| kernel_regularizer=kernel_regularizer, |
| recurrent_regularizer=recurrent_regularizer, |
| bias_regularizer=bias_regularizer, |
| kernel_constraint=kernel_constraint, |
| recurrent_constraint=recurrent_constraint, |
| bias_constraint=bias_constraint, |
| dropout=dropout, |
| recurrent_dropout=recurrent_dropout, |
| implementation=implementation, |
| reset_after=reset_after, |
| **kwargs) |
| |
| |
| @keras_export('keras.layers.GRU', v1=[]) |
| class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU): |
| """Gated Recurrent Unit - Cho et al. 2014. |
| |
| Based on available runtime hardware and constraints, this layer |
| will choose different implementations (cuDNN-based or pure-TensorFlow) |
| to maximize the performance. If a GPU is available and all |
  the arguments to the layer meet the requirements of the CuDNN kernel
| (see below for details), the layer will use a fast cuDNN implementation. |
| |
| The requirements to use the cuDNN implementation are: |
| |
| 1. `activation` == 'tanh' |
| 2. `recurrent_activation` == 'sigmoid' |
| 3. `recurrent_dropout` == 0 |
| 4. `unroll` is False |
| 5. `use_bias` is True |
| 6. `reset_after` is True |
  7. Inputs are not masked, or are strictly right padded.
| |
  There are two variants of the GRU implementation. One is based on
  [v3](https://arxiv.org/abs/1406.1078v3) and applies the reset gate to the
  hidden state before the matrix multiplication. The other is based on the
  [original paper](https://arxiv.org/abs/1406.1078v1) and has the order
  reversed.

  The second variant is compatible with CuDNNGRU (GPU-only) and allows
  inference on CPU. Thus it has separate biases for `kernel` and
  `recurrent_kernel`. To use this variant, set `reset_after=True` and
  `recurrent_activation='sigmoid'`, which are this layer's defaults.
| |
| Arguments: |
| units: Positive integer, dimensionality of the output space. |
| activation: Activation function to use. |
| Default: hyperbolic tangent (`tanh`). |
| If you pass `None`, no activation is applied |
| (ie. "linear" activation: `a(x) = x`). |
| recurrent_activation: Activation function to use |
| for the recurrent step. |
| Default: sigmoid (`sigmoid`). |
| If you pass `None`, no activation is applied |
| (ie. "linear" activation: `a(x) = x`). |
| use_bias: Boolean, whether the layer uses a bias vector. |
| kernel_initializer: Initializer for the `kernel` weights matrix, |
| used for the linear transformation of the inputs. |
| recurrent_initializer: Initializer for the `recurrent_kernel` |
| weights matrix, |
| used for the linear transformation of the recurrent state. |
| bias_initializer: Initializer for the bias vector. |
| kernel_regularizer: Regularizer function applied to |
| the `kernel` weights matrix. |
| recurrent_regularizer: Regularizer function applied to |
| the `recurrent_kernel` weights matrix. |
| bias_regularizer: Regularizer function applied to the bias vector. |
| activity_regularizer: Regularizer function applied to |
      the output of the layer (its "activation").
| kernel_constraint: Constraint function applied to |
| the `kernel` weights matrix. |
| recurrent_constraint: Constraint function applied to |
| the `recurrent_kernel` weights matrix. |
| bias_constraint: Constraint function applied to the bias vector. |
| dropout: Float between 0 and 1. |
| Fraction of the units to drop for the linear transformation of the inputs. |
| recurrent_dropout: Float between 0 and 1. |
| Fraction of the units to drop for |
| the linear transformation of the recurrent state. |
| implementation: Implementation mode, either 1 or 2. |
| Mode 1 will structure its operations as a larger number of |
| smaller dot products and additions, whereas mode 2 will |
| batch them into fewer, larger operations. These modes will |
| have different performance profiles on different hardware and |
| for different applications. |
| return_sequences: Boolean. Whether to return the last output |
| in the output sequence, or the full sequence. |
| return_state: Boolean. Whether to return the last state |
| in addition to the output. |
| go_backwards: Boolean (default False). |
| If True, process the input sequence backwards and return the |
| reversed sequence. |
| stateful: Boolean (default False). If True, the last state |
| for each sample at index i in a batch will be used as initial |
| state for the sample of index i in the following batch. |
| unroll: Boolean (default False). |
| If True, the network will be unrolled, |
| else a symbolic loop will be used. |
| Unrolling can speed-up a RNN, |
| although it tends to be more memory-intensive. |
| Unrolling is only suitable for short sequences. |
| reset_after: GRU convention (whether to apply reset gate after or |
| before matrix multiplication). False = "before", |
| True = "after" (default and CuDNN compatible). |
| |
| Call arguments: |
| inputs: A 3D tensor. |
| mask: Binary tensor of shape `(samples, timesteps)` indicating whether |
| a given timestep should be masked. |
| training: Python boolean indicating whether the layer should behave in |
| training mode or in inference mode. This argument is passed to the cell |
| when calling it. This is only relevant if `dropout` or |
| `recurrent_dropout` is used. |
| initial_state: List of initial state tensors to be passed to the first |
| call of the cell. |
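
  Example (a minimal usage sketch; the shapes below are illustrative):

  ```python
  import tensorflow as tf

  inputs = tf.random.normal([32, 10, 8])  # [batch, time, feature]
  gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True)
  whole_sequence_output, final_state = gru(inputs)
  # whole_sequence_output shape: (32, 10, 4), final_state shape: (32, 4)
  ```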
| """ |
| |
| def __init__(self, |
| units, |
| activation='tanh', |
| recurrent_activation='sigmoid', |
| use_bias=True, |
| kernel_initializer='glorot_uniform', |
| recurrent_initializer='orthogonal', |
| bias_initializer='zeros', |
| kernel_regularizer=None, |
| recurrent_regularizer=None, |
| bias_regularizer=None, |
| activity_regularizer=None, |
| kernel_constraint=None, |
| recurrent_constraint=None, |
| bias_constraint=None, |
| dropout=0., |
| recurrent_dropout=0., |
| implementation=2, |
| return_sequences=False, |
| return_state=False, |
| go_backwards=False, |
| stateful=False, |
| unroll=False, |
| time_major=False, |
| reset_after=True, |
| **kwargs): |
| # return_runtime is a flag for testing, which shows the real backend |
| # implementation chosen by grappler in graph mode. |
| self._return_runtime = kwargs.pop('return_runtime', False) |
| |
| super(GRU, self).__init__( |
| units, |
| activation=activation, |
| recurrent_activation=recurrent_activation, |
| use_bias=use_bias, |
| kernel_initializer=kernel_initializer, |
| recurrent_initializer=recurrent_initializer, |
| bias_initializer=bias_initializer, |
| kernel_regularizer=kernel_regularizer, |
| recurrent_regularizer=recurrent_regularizer, |
| bias_regularizer=bias_regularizer, |
| activity_regularizer=activity_regularizer, |
| kernel_constraint=kernel_constraint, |
| recurrent_constraint=recurrent_constraint, |
| bias_constraint=bias_constraint, |
| dropout=dropout, |
| recurrent_dropout=recurrent_dropout, |
| implementation=implementation, |
| return_sequences=return_sequences, |
| return_state=return_state, |
| go_backwards=go_backwards, |
| stateful=stateful, |
| unroll=unroll, |
| time_major=time_major, |
| reset_after=reset_after, |
| **kwargs) |
    # CuDNN uses the following settings by default; they are not configurable.
| self.could_use_cudnn = ( |
| activation == 'tanh' and recurrent_activation == 'sigmoid' and |
| recurrent_dropout == 0 and not unroll and use_bias and |
| reset_after) |
| |
| def call(self, inputs, mask=None, training=None, initial_state=None): |
    # GRU does not support constants; ignore them during processing.
| inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None) |
| |
| if isinstance(mask, list): |
| mask = mask[0] |
| |
| input_shape = K.int_shape(inputs) |
| timesteps = input_shape[0] if self.time_major else input_shape[1] |
| |
| if not self.could_use_cudnn: |
| kwargs = {'training': training} |
| self.cell.reset_dropout_mask() |
| self.cell.reset_recurrent_dropout_mask() |
| |
| def step(cell_inputs, cell_states): |
| return self.cell.call(cell_inputs, cell_states, **kwargs) |
| |
| last_output, outputs, states = K.rnn( |
| step, |
| inputs, |
| initial_state, |
| constants=None, |
| go_backwards=self.go_backwards, |
| mask=mask, |
| unroll=self.unroll, |
| input_length=timesteps, |
| time_major=self.time_major, |
| zero_output_for_mask=self.zero_output_for_mask) |
      # This is a dummy tensor for testing purposes.
| runtime = _runtime(_RUNTIME_UNKNOWN) |
| else: |
| last_output, outputs, runtime, states = self._defun_gru_call( |
| inputs, initial_state, training, mask) |
| |
| if self.stateful: |
| updates = [state_ops.assign(self.states[0], states[0])] |
| self.add_update(updates) |
| |
| if self.return_sequences: |
| output = outputs |
| else: |
| output = last_output |
| |
| if self.return_state: |
| return [output] + list(states) |
| elif self._return_runtime: |
| return output, runtime |
| else: |
| return output |
| |
| def _defun_gru_call(self, inputs, initial_state, training, mask): |
    # Use the new defun approach for backend implementation swap. Note that
    # different implementations need to have the same function signature,
    # e.g., the tensor parameters need to have the same shape and dtypes.
| |
| self.reset_dropout_mask() |
| dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3) |
| if dropout_mask is not None: |
| inputs = inputs * dropout_mask[0] |
| |
| cudnn_gru_kwargs = { |
| 'inputs': inputs, |
| 'init_h': initial_state[0], |
| 'kernel': self.cell.kernel, |
| 'recurrent_kernel': self.cell.recurrent_kernel, |
| 'bias': self.cell.bias, |
| 'mask': mask, |
| 'time_major': self.time_major, |
| 'go_backwards': self.go_backwards |
| } |
| normal_gru_kwargs = cudnn_gru_kwargs.copy() |
| normal_gru_kwargs.update({ |
| 'activation': self.activation, |
| 'recurrent_activation': self.recurrent_activation |
| }) |
| |
| if context.executing_eagerly(): |
| device_type = _get_context_device_type() |
| can_use_gpu = ( |
| # Either user specified GPU or unspecified but GPU is available. |
| (device_type == _GPU_DEVICE_NAME |
| or (device_type is None and context.num_gpus() > 0)) |
| and |
| (mask is None or is_sequence_right_padded(mask, self.time_major))) |
      # Under eager context, check the device placement and prefer the
      # GPU implementation when GPU is available.
| if can_use_gpu: |
| last_output, outputs, new_h, runtime = cudnn_gru(**cudnn_gru_kwargs) |
| else: |
| last_output, outputs, new_h, runtime = standard_gru(**normal_gru_kwargs) |
| else: |
| last_output, outputs, new_h, runtime = gru_with_backend_selection( |
| **normal_gru_kwargs) |
| |
| states = [new_h] |
| return last_output, outputs, runtime, states |
| |
| |
| def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, activation, |
| recurrent_activation, mask, time_major, go_backwards): |
| """GRU with standard kernel implementation. |
| |
| This implementation can be run on all types of hardware. |
| |
  This implementation lifts out all the layer weights and makes them function
  parameters. It has the same number of tensor input params as the CuDNN
  counterpart. The RNN step logic has been simplified, e.g., dropout is
  removed since the CuDNN implementation does not support it.
| |
| Arguments: |
| inputs: Input tensor of GRU layer. |
| init_h: Initial state tensor for the cell output. |
| kernel: Weights for cell kernel. |
| recurrent_kernel: Weights for cell recurrent kernel. |
| bias: Weights for cell kernel bias and recurrent bias. The bias contains the |
| combined input_bias and recurrent_bias. |
| activation: Activation function to use for output. |
| recurrent_activation: Activation function to use for hidden recurrent state. |
| mask: Binary tensor of shape `(samples, timesteps)` indicating whether |
| a given timestep should be masked. |
| time_major: Boolean, whether the inputs are in the format of |
| [time, batch, feature] or [batch, time, feature]. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| |
| Returns: |
| last_output: output tensor for the last timestep, which has shape |
| [batch, units]. |
| outputs: output tensor for all timesteps, which has shape |
| [batch, time, units]. |
| state_0: the cell output, which has same shape as init_h. |
    runtime: Constant tensor which indicates the real runtime hardware. This
      value is for testing purposes and should not be used by users.
| """ |
| input_shape = K.int_shape(inputs) |
| timesteps = input_shape[0] if time_major else input_shape[1] |
| |
| input_bias, recurrent_bias = array_ops.unstack(bias) |
| |
| def step(cell_inputs, cell_states): |
| """Step function that will be used by Keras RNN backend.""" |
| h_tm1 = cell_states[0] |
| |
| # inputs projected by all gate matrices at once |
| matrix_x = K.dot(cell_inputs, kernel) |
| matrix_x = K.bias_add(matrix_x, input_bias) |
| |
| x_z, x_r, x_h = array_ops.split(matrix_x, 3, axis=1) |
| |
| # hidden state projected by all gate matrices at once |
| matrix_inner = K.dot(h_tm1, recurrent_kernel) |
| matrix_inner = K.bias_add(matrix_inner, recurrent_bias) |
| |
| recurrent_z, recurrent_r, recurrent_h = array_ops.split(matrix_inner, 3, |
| axis=1) |
| z = recurrent_activation(x_z + recurrent_z) |
| r = recurrent_activation(x_r + recurrent_r) |
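    # This is the reset_after (CuDNN-compatible) formulation: the reset gate
    # is applied after the recurrent matmul and bias-add, not before.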
| hh = activation(x_h + r * recurrent_h) |
| |
| # previous and candidate state mixed by update gate |
| h = z * h_tm1 + (1 - z) * hh |
| return h, [h] |
| |
| last_output, outputs, new_states = K.rnn( |
| step, |
| inputs, [init_h], |
| constants=None, |
| unroll=False, |
| time_major=time_major, |
| mask=mask, |
| go_backwards=go_backwards, |
| input_length=timesteps) |
| return last_output, outputs, new_states[0], _runtime(_RUNTIME_CPU) |
| |
| |
| def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, |
| go_backwards): |
| """GRU with CuDNN implementation which is only available for GPU.""" |
| if not time_major: |
| inputs = array_ops.transpose(inputs, perm=(1, 0, 2)) |
| init_h = array_ops.expand_dims(init_h, axis=0) |
| |
| weights = array_ops.split(kernel, 3, axis=1) |
| weights += array_ops.split(recurrent_kernel, 3, axis=1) |
  # Note that the bias was initialized as shape (2, 3 * units); flatten it
  # into (6 * units).
| bias = array_ops.split(K.flatten(bias), 6) |
  # Note that the gate order for CuDNN is different from the canonical format.
  # The canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap
  # needs to be done for kernel, recurrent_kernel, input_bias and
  # recurrent_bias.
  # z is the update gate weights.
  # r is the reset gate weights.
  # h is the candidate (new) gate weights.
| weights[0], weights[1] = weights[1], weights[0] |
| weights[3], weights[4] = weights[4], weights[3] |
| bias[0], bias[1] = bias[1], bias[0] |
| bias[3], bias[4] = bias[4], bias[3] |
| |
| params = _canonical_to_params( |
| weights=weights, |
| biases=bias, |
| shape=constant_op.constant([-1]), |
| transpose_weights=True) |
| |
| if mask is not None: |
| sequence_length = calculate_sequence_by_mask(mask, time_major) |
| if go_backwards: |
      # Three reversals are required. E.g.,
      # normal input = [1, 2, 3, 0, 0]  # where 0s need to be masked
      # reversed_input_to_cudnn = [3, 2, 1, 0, 0]
      # output_from_cudnn = [6, 5, 4, 0, 0]
      # expected_output = [0, 0, 6, 5, 4]
| inputs = array_ops.reverse_sequence_v2(inputs, sequence_length, |
| seq_axis=0, batch_axis=1) |
| outputs, h, _, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3( |
| inputs, input_h=init_h, input_c=0, params=params, is_training=True, |
| rnn_mode='gru', sequence_lengths=sequence_length) |
| if go_backwards: |
| outputs = array_ops.reverse_sequence_v2(outputs, sequence_length, |
| seq_axis=0, batch_axis=1) |
| outputs = array_ops.reverse(outputs, axis=[0]) |
| else: |
| if go_backwards: |
      # Reverse axis 0 since the input is already converted to time major.
| inputs = array_ops.reverse(inputs, axis=[0]) |
| outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn( |
| inputs, input_h=init_h, input_c=0, params=params, is_training=True, |
| rnn_mode='gru') |
| |
| last_output = outputs[-1] |
| if not time_major: |
| outputs = array_ops.transpose(outputs, perm=[1, 0, 2]) |
| h = h[0] |
| |
  # In the case of variable-length input, the cudnn kernel fills zeros for
  # the output, whereas the default Keras behavior is to carry over the
  # output from t-1, so that in the return_sequences=False case, the user can
  # quickly get the final effective output instead of just 0s at the last
  # timestep. In order to mimic the default Keras behavior, we copy the final
  # h state as last_output, since it is numerically the same as the output.
| if mask is not None: |
| last_output = h |
| |
| return last_output, outputs, h, _runtime(_RUNTIME_GPU) |
| |
| |
| def gru_with_backend_selection( |
| inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, |
| go_backwards, activation, recurrent_activation): |
| """Call the GRU with optimized backend kernel selection. |
| |
| Under the hood, this function will create two TF function, one with the most |
| generic kernel and can run on all device condition, and the second one with |
| CuDNN specific kernel, which can only run on GPU. |
| |
| The first function will be called with normal_lstm_params, while the second |
| function is not called, but only registered in the graph. The Grappler will |
| do the proper graph rewrite and swap the optimized TF function based on the |
| device placement. |
| |
| Args: |
| inputs: Input tensor of GRU layer. |
| init_h: Initial state tensor for the cell output. |
| kernel: Weights for cell kernel. |
| recurrent_kernel: Weights for cell recurrent kernel. |
    bias: Weights for cell kernel bias and recurrent bias. The bias contains
      the combined input_bias and recurrent_bias.
    mask: Boolean tensor for masking out the steps within the sequence.
| time_major: Boolean, whether the inputs are in the format of |
| [time, batch, feature] or [batch, time, feature]. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| activation: Activation function to use for output. |
| recurrent_activation: Activation function to use for hidden recurrent state. |
| |
| Returns: |
| List of output tensors, same as standard_gru. |
| """ |
| params = { |
| 'inputs': inputs, |
| 'init_h': init_h, |
| 'kernel': kernel, |
| 'recurrent_kernel': recurrent_kernel, |
| 'bias': bias, |
| 'mask': mask, |
| 'time_major': time_major, |
| 'go_backwards': go_backwards, |
| 'activation': activation, |
| 'recurrent_activation': recurrent_activation |
| } |
| |
| def cudnn_gru_with_fallback(inputs, init_h, kernel, recurrent_kernel, |
| bias, mask, time_major, go_backwards, activation, |
| recurrent_activation): |
| """Use CuDNN kernel when mask is none or strictly right padded.""" |
| if mask is None: |
| return cudnn_gru(inputs=inputs, init_h=init_h, kernel=kernel, |
| recurrent_kernel=recurrent_kernel, bias=bias, mask=mask, |
| time_major=time_major, go_backwards=go_backwards) |
    # Note that mask is a boolean tensor, which doesn't need gradient
    # calculation. When using tf.cond, a default gradient is added for it,
    # which then causes the backward function to have a signature mismatch.
    # Force the mask to not generate a gradient so that
    # implementation_selector can work properly.
| # TODO(b/80444525): Remove the stop_gradient(). |
| mask = array_ops.stop_gradient(mask) |
| |
| def input_right_padded(): |
| return cudnn_gru(inputs=inputs, init_h=init_h, kernel=kernel, |
| recurrent_kernel=recurrent_kernel, bias=bias, mask=mask, |
| time_major=time_major, go_backwards=go_backwards) |
| |
| def input_not_right_padded(): |
| return standard_gru(inputs=inputs, init_h=init_h, kernel=kernel, |
| recurrent_kernel=recurrent_kernel, bias=bias, |
| mask=mask, time_major=time_major, |
| go_backwards=go_backwards, activation=activation, |
| recurrent_activation=recurrent_activation) |
| |
| return control_flow_ops.cond( |
| is_sequence_right_padded(mask, time_major), |
| true_fn=input_right_padded, |
| false_fn=input_not_right_padded) |
| |
  # Each time this function is called, we will give the defun-generated
  # `tf.function`s a unique identifiable API name, so that Grappler won't get
  # confused when it sees multiple GRU layers added into the same graph, and
  # it will be able to pair up the different implementations across them.
| api_name = 'gru_' + str(uuid.uuid4()) |
| defun_standard_gru = _generate_defun_backend( |
| api_name, _CPU_DEVICE_NAME, standard_gru) |
| defun_cudnn_gru = _generate_defun_backend( |
| api_name, _GPU_DEVICE_NAME, cudnn_gru_with_fallback) |
| |
| # Call the normal GRU impl and register the CuDNN impl function. The |
| # grappler will kick in during session execution to optimize the graph. |
| last_output, outputs, new_h, runtime = defun_standard_gru(**params) |
| function.register(defun_cudnn_gru, **params) |
| return last_output, outputs, new_h, runtime |
| |
| |
| @keras_export('keras.layers.LSTMCell', v1=[]) |
| class LSTMCell(recurrent.LSTMCell): |
| """Cell class for the LSTM layer. |
| |
| Arguments: |
| units: Positive integer, dimensionality of the output space. |
| activation: Activation function to use. Default: hyperbolic tangent |
| (`tanh`). If you pass `None`, no activation is applied (ie. "linear" |
| activation: `a(x) = x`). |
| recurrent_activation: Activation function to use for the recurrent step. |
| Default: sigmoid (`sigmoid`). If you pass `None`, no activation is applied |
| (ie. "linear" activation: `a(x) = x`). |
| use_bias: Boolean, whether the layer uses a bias vector. |
| kernel_initializer: Initializer for the `kernel` weights matrix, used for |
| the linear transformation of the inputs. |
| recurrent_initializer: Initializer for the `recurrent_kernel` weights |
| matrix, used for the linear transformation of the recurrent state. |
| bias_initializer: Initializer for the bias vector. |
| unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at |
| initialization. Setting it to true will also force |
| `bias_initializer="zeros"`. This is recommended in [Jozefowicz et |
| al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf) |
| kernel_regularizer: Regularizer function applied to the `kernel` weights |
| matrix. |
| recurrent_regularizer: Regularizer function applied to |
| the `recurrent_kernel` weights matrix. |
| bias_regularizer: Regularizer function applied to the bias vector. |
| kernel_constraint: Constraint function applied to the `kernel` weights |
| matrix. |
| recurrent_constraint: Constraint function applied to the `recurrent_kernel` |
| weights matrix. |
| bias_constraint: Constraint function applied to the bias vector. |
| dropout: Float between 0 and 1. Fraction of the units to drop for the linear |
| transformation of the inputs. |
| recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for |
| the linear transformation of the recurrent state. |
| implementation: Implementation mode, either 1 or 2. |
| Mode 1 will structure its operations as a larger number of smaller dot |
| products and additions, whereas mode 2 (default) will batch them into |
| fewer, larger operations. These modes will have different performance |
| profiles on different hardware and for different applications. |
| |
| Call arguments: |
| inputs: A 2D tensor. |
| states: List of state tensors corresponding to the previous timestep. |
| training: Python boolean indicating whether the layer should behave in |
| training mode or in inference mode. Only relevant when `dropout` or |
| `recurrent_dropout` is used. |
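
  Example (a minimal usage sketch; the shapes below are illustrative):

  ```python
  import tensorflow as tf

  inputs = tf.random.normal([32, 10, 8])  # [batch, time, feature]
  rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4))
  output = rnn(inputs)  # output shape: (32, 4)
  ```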
| """ |
| |
| def __init__(self, |
| units, |
| activation='tanh', |
| recurrent_activation='sigmoid', |
| use_bias=True, |
| kernel_initializer='glorot_uniform', |
| recurrent_initializer='orthogonal', |
| bias_initializer='zeros', |
| unit_forget_bias=True, |
| kernel_regularizer=None, |
| recurrent_regularizer=None, |
| bias_regularizer=None, |
| kernel_constraint=None, |
| recurrent_constraint=None, |
| bias_constraint=None, |
| dropout=0., |
| recurrent_dropout=0., |
| implementation=2, |
| **kwargs): |
| super(LSTMCell, self).__init__( |
| units, |
| activation=activation, |
| recurrent_activation=recurrent_activation, |
| use_bias=use_bias, |
| kernel_initializer=kernel_initializer, |
| recurrent_initializer=recurrent_initializer, |
| bias_initializer=bias_initializer, |
| unit_forget_bias=unit_forget_bias, |
| kernel_regularizer=kernel_regularizer, |
| recurrent_regularizer=recurrent_regularizer, |
| bias_regularizer=bias_regularizer, |
| kernel_constraint=kernel_constraint, |
| recurrent_constraint=recurrent_constraint, |
| bias_constraint=bias_constraint, |
| dropout=dropout, |
| recurrent_dropout=recurrent_dropout, |
| implementation=implementation, |
| **kwargs) |
| |
| |
| @keras_export('keras.layers.LSTM', v1=[]) |
| class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM): |
| """Long Short-Term Memory layer - Hochreiter 1997. |
| |
| Based on available runtime hardware and constraints, this layer |
| will choose different implementations (cuDNN-based or pure-TensorFlow) |
| to maximize the performance. If a GPU is available and all |
  the arguments to the layer meet the requirements of the CuDNN kernel
| (see below for details), the layer will use a fast cuDNN implementation. |
| |
| The requirements to use the cuDNN implementation are: |
| |
| 1. `activation` == 'tanh' |
| 2. `recurrent_activation` == 'sigmoid' |
| 3. `recurrent_dropout` == 0 |
| 4. `unroll` is False |
| 5. `use_bias` is True |
  6. Inputs are not masked, or are strictly right padded.
| |
| Arguments: |
| units: Positive integer, dimensionality of the output space. |
| activation: Activation function to use. |
| Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation |
| is applied (ie. "linear" activation: `a(x) = x`). |
| recurrent_activation: Activation function to use for the recurrent step. |
| Default: sigmoid (`sigmoid`). If you pass `None`, no activation is |
| applied (ie. "linear" activation: `a(x) = x`). |
| use_bias: Boolean, whether the layer uses a bias vector. |
    kernel_initializer: Initializer for the `kernel` weights matrix, used for
      the linear transformation of the inputs.
    recurrent_initializer: Initializer for the `recurrent_kernel` weights
      matrix, used for the linear transformation of the recurrent state.
| bias_initializer: Initializer for the bias vector. |
| unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at |
| initialization. Setting it to true will also force |
| `bias_initializer="zeros"`. This is recommended in [Jozefowicz et |
| al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf). |
| kernel_regularizer: Regularizer function applied to the `kernel` weights |
| matrix. |
| recurrent_regularizer: Regularizer function applied to the |
| `recurrent_kernel` weights matrix. |
| bias_regularizer: Regularizer function applied to the bias vector. |
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation").
| kernel_constraint: Constraint function applied to the `kernel` weights |
| matrix. |
| recurrent_constraint: Constraint function applied to the `recurrent_kernel` |
| weights matrix. |
| bias_constraint: Constraint function applied to the bias vector. |
| dropout: Float between 0 and 1. Fraction of the units to drop for the linear |
| transformation of the inputs. |
| recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for |
| the linear transformation of the recurrent state. |
| implementation: Implementation mode, either 1 or 2. Mode 1 will structure |
| its operations as a larger number of smaller dot products and additions, |
| whereas mode 2 will batch them into fewer, larger operations. These modes |
| will have different performance profiles on different hardware and for |
| different applications. |
    return_sequences: Boolean. Whether to return the last output in the output
      sequence, or the full sequence.
| return_state: Boolean. Whether to return the last state in addition to the |
| output. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| stateful: Boolean (default False). If True, the last state for each sample |
| at index i in a batch will be used as initial state for the sample of |
| index i in the following batch. |
| unroll: Boolean (default False). If True, the network will be unrolled, else |
| a symbolic loop will be used. Unrolling can speed-up a RNN, although it |
| tends to be more memory-intensive. Unrolling is only suitable for short |
| sequences. |
| |
| Call arguments: |
| inputs: A 3D tensor. |
| mask: Binary tensor of shape `(samples, timesteps)` indicating whether |
| a given timestep should be masked. |
| training: Python boolean indicating whether the layer should behave in |
| training mode or in inference mode. This argument is passed to the cell |
| when calling it. This is only relevant if `dropout` or |
| `recurrent_dropout` is used. |
| initial_state: List of initial state tensors to be passed to the first |
| call of the cell. |
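
  Example (a minimal usage sketch; the shapes below are illustrative):

  ```python
  import tensorflow as tf

  inputs = tf.random.normal([32, 10, 8])  # [batch, time, feature]
  lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
  whole_seq_output, final_memory_state, final_carry_state = lstm(inputs)
  # whole_seq_output: (32, 10, 4); final_memory_state and final_carry_state:
  # (32, 4) each
  ```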
| """ |
| |
| def __init__(self, |
| units, |
| activation='tanh', |
| recurrent_activation='sigmoid', |
| use_bias=True, |
| kernel_initializer='glorot_uniform', |
| recurrent_initializer='orthogonal', |
| bias_initializer='zeros', |
| unit_forget_bias=True, |
| kernel_regularizer=None, |
| recurrent_regularizer=None, |
| bias_regularizer=None, |
| activity_regularizer=None, |
| kernel_constraint=None, |
| recurrent_constraint=None, |
| bias_constraint=None, |
| dropout=0., |
| recurrent_dropout=0., |
| implementation=2, |
| return_sequences=False, |
| return_state=False, |
| go_backwards=False, |
| stateful=False, |
| time_major=False, |
| unroll=False, |
| **kwargs): |
| # return_runtime is a flag for testing, which shows the real backend |
| # implementation chosen by grappler in graph mode. |
| self.return_runtime = kwargs.pop('return_runtime', False) |
| |
| super(LSTM, self).__init__( |
| units, |
| activation=activation, |
| recurrent_activation=recurrent_activation, |
| use_bias=use_bias, |
| kernel_initializer=kernel_initializer, |
| recurrent_initializer=recurrent_initializer, |
| bias_initializer=bias_initializer, |
| unit_forget_bias=unit_forget_bias, |
| kernel_regularizer=kernel_regularizer, |
| recurrent_regularizer=recurrent_regularizer, |
| bias_regularizer=bias_regularizer, |
| activity_regularizer=activity_regularizer, |
| kernel_constraint=kernel_constraint, |
| recurrent_constraint=recurrent_constraint, |
| bias_constraint=bias_constraint, |
| dropout=dropout, |
| recurrent_dropout=recurrent_dropout, |
| implementation=implementation, |
| return_sequences=return_sequences, |
| return_state=return_state, |
| go_backwards=go_backwards, |
| stateful=stateful, |
| time_major=time_major, |
| unroll=unroll, |
| **kwargs) |
| |
| self.state_spec = [ |
| InputSpec(shape=(None, dim)) for dim in (self.units, self.units) |
| ] |
| self.could_use_cudnn = ( |
| activation == 'tanh' and recurrent_activation == 'sigmoid' and |
| recurrent_dropout == 0 and not unroll and use_bias) |
| |
| def call(self, inputs, mask=None, training=None, initial_state=None): |
    # LSTM does not support constants; ignore them during processing.
| inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None) |
| |
| if isinstance(mask, list): |
| mask = mask[0] |
| |
| input_shape = K.int_shape(inputs) |
| timesteps = input_shape[0] if self.time_major else input_shape[1] |
| |
| if not self.could_use_cudnn: |
| # Fall back to use the normal LSTM. |
| kwargs = {'training': training} |
| self.cell.reset_dropout_mask() |
| self.cell.reset_recurrent_dropout_mask() |
| |
| def step(inputs, states): |
| return self.cell.call(inputs, states, **kwargs) |
| |
| last_output, outputs, states = K.rnn( |
| step, |
| inputs, |
| initial_state, |
| constants=None, |
| go_backwards=self.go_backwards, |
| mask=mask, |
| unroll=self.unroll, |
| input_length=timesteps, |
| time_major=self.time_major, |
| zero_output_for_mask=self.zero_output_for_mask) |
| runtime = _runtime(_RUNTIME_UNKNOWN) |
| else: |
      # Use the new defun approach for backend implementation swap. Note that
      # different implementations need to have the same function signature,
      # e.g., the tensor parameters need to have the same shape and dtypes.
      # Since CuDNN has an extra set of biases, those biases will be passed to
      # both the normal and CuDNN implementations.
| self.reset_dropout_mask() |
| dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4) |
| if dropout_mask is not None: |
| inputs = inputs * dropout_mask[0] |
| cudnn_lstm_kwargs = { |
| 'inputs': inputs, |
| 'init_h': initial_state[0], |
| 'init_c': initial_state[1], |
| 'kernel': self.cell.kernel, |
| 'recurrent_kernel': self.cell.recurrent_kernel, |
| 'bias': self.cell.bias, |
| 'mask': mask, |
| 'time_major': self.time_major, |
| 'go_backwards': self.go_backwards |
| } |
| normal_lstm_kwargs = cudnn_lstm_kwargs.copy() |
| normal_lstm_kwargs.update({ |
| 'activation': self.activation, |
| 'recurrent_activation': self.recurrent_activation |
| }) |
| |
| if context.executing_eagerly(): |
| device_type = _get_context_device_type() |
| can_use_gpu = ( |
| # Either user specified GPU or unspecified but GPU is available. |
| (device_type == _GPU_DEVICE_NAME |
| or (device_type is None and context.num_gpus() > 0)) |
| and |
| (mask is None or is_sequence_right_padded(mask, self.time_major))) |
| # Under eager context, check the device placement and prefer the |
| # GPU implementation when GPU is available. |
| if can_use_gpu: |
| last_output, outputs, new_h, new_c, runtime = cudnn_lstm( |
| **cudnn_lstm_kwargs) |
| else: |
| last_output, outputs, new_h, new_c, runtime = standard_lstm( |
| **normal_lstm_kwargs) |
| else: |
| (last_output, outputs, new_h, new_c, |
| runtime) = lstm_with_backend_selection(**normal_lstm_kwargs) |
| |
| states = [new_h, new_c] |
| |
| if self.stateful: |
| updates = [] |
| for i in range(len(states)): |
| updates.append(state_ops.assign(self.states[i], states[i])) |
| self.add_update(updates) |
| |
| if self.return_sequences: |
| output = outputs |
| else: |
| output = last_output |
| |
| if self.return_state: |
| return [output] + list(states) |
| elif self.return_runtime: |
| return output, runtime |
| else: |
| return output |
| |
| |
| def _canonical_to_params(weights, biases, shape, transpose_weights=False): |
| """Utility function convert variable to CuDNN compatible parameter. |
| |
| Note that Keras weights for kernels are different from the CuDNN format. Eg.: |
| |
| ``` |
| Keras CuDNN |
| [[0, 1, 2], <---> [[0, 2, 4], |
| [3, 4, 5]] [1, 3, 5]] |
| ``` |
| |
  If the input weights need to be in a unified format, then set
  `transpose_weights=True` to convert the weights.

  Args:
    weights: list of weights for the individual kernels and recurrent kernels.
    biases: list of biases for the individual gates.
    shape: the shape for the converted variables that will be fed to CuDNN.
    transpose_weights: boolean, whether to transpose the weights.

  Returns:
    The converted weights that can be fed to CuDNN ops as params.
| """ |
| def convert(w): |
| return array_ops.transpose(w) if transpose_weights else w |
| |
| weights = [array_ops.reshape(convert(x), shape) for x in weights] |
| biases = [array_ops.reshape(x, shape) for x in biases] |
| return array_ops.concat(weights + biases, axis=0) |
| |
| |
| def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, |
| activation, recurrent_activation, mask, time_major, |
| go_backwards): |
| """LSTM with standard kernel implementation. |
| |
  This implementation can be run on all types of hardware.

  This implementation lifts out all the layer weights and makes them function
  parameters. It has the same number of tensor input params as the CuDNN
  counterpart. The RNN step logic has been simplified, e.g., dropout is
  removed since the CuDNN implementation does not support it.

  Note that the first half of the bias tensor should be ignored by this impl.
  The CuDNN impl needs an extra set of input gate biases. In order to make
  both functions take the same shapes of parameters, that extra set of biases
  is also fed here.
| |
| Args: |
| inputs: input tensor of LSTM layer. |
| init_h: initial state tensor for the cell output. |
| init_c: initial state tensor for the cell hidden state. |
| kernel: weights for cell kernel. |
| recurrent_kernel: weights for cell recurrent kernel. |
| bias: weights for cell kernel bias and recurrent bias. Only recurrent bias |
| is used in this case. |
| activation: Activation function to use for output. |
| recurrent_activation: Activation function to use for hidden recurrent state. |
    mask: Boolean tensor for masking out the steps within the sequence.
| time_major: boolean, whether the inputs are in the format of |
| [time, batch, feature] or [batch, time, feature]. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| |
| Returns: |
| last_output: output tensor for the last timestep, which has shape |
| [batch, units]. |
| outputs: output tensor for all timesteps, which has shape |
| [batch, time, units]. |
| state_0: the cell output, which has same shape as init_h. |
| state_1: the cell hidden state, which has same shape as init_c. |
    runtime: Constant tensor which indicates the real runtime hardware. This
      value is for testing purposes and should not be used by users.
| """ |
| input_shape = K.int_shape(inputs) |
| timesteps = input_shape[0] if time_major else input_shape[1] |
| |
| def step(cell_inputs, cell_states): |
| """Step function that will be used by Keras RNN backend.""" |
| h_tm1 = cell_states[0] # previous memory state |
| c_tm1 = cell_states[1] # previous carry state |
| |
| z = K.dot(cell_inputs, kernel) |
| z += K.dot(h_tm1, recurrent_kernel) |
| z = K.bias_add(z, bias) |
| |
| z0, z1, z2, z3 = array_ops.split(z, 4, axis=1) |
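    # The four chunks are the LSTM gates in the order [i, f, c, o]:
    # input gate, forget gate, candidate cell state, output gate.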
| |
| i = recurrent_activation(z0) |
| f = recurrent_activation(z1) |
| c = f * c_tm1 + i * activation(z2) |
| o = recurrent_activation(z3) |
| |
| h = o * activation(c) |
| return h, [h, c] |
| |
| last_output, outputs, new_states = K.rnn( |
| step, |
| inputs, [init_h, init_c], |
| constants=None, |
| unroll=False, |
| time_major=time_major, |
| mask=mask, |
| go_backwards=go_backwards, |
| input_length=timesteps) |
| return (last_output, outputs, new_states[0], new_states[1], |
| _runtime(_RUNTIME_CPU)) |
| |
| |
| def cudnn_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, |
| time_major, go_backwards): |
| """LSTM with CuDNN implementation which is only available for GPU. |
| |
| Note that currently only right padded data is supported, or the result will be |
| polluted by the unmasked data which should be filtered. |
| |
| Args: |
| inputs: Input tensor of LSTM layer. |
| init_h: Initial state tensor for the cell output. |
| init_c: Initial state tensor for the cell hidden state. |
| kernel: Weights for cell kernel. |
| recurrent_kernel: Weights for cell recurrent kernel. |
| bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias |
| is used in this case. |
    mask: Boolean tensor for masking out the steps within the sequence.
| time_major: Boolean, whether the inputs are in the format of |
| [time, batch, feature] or [batch, time, feature]. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| |
| Returns: |
| last_output: Output tensor for the last timestep, which has shape |
| [batch, units]. |
| outputs: Output tensor for all timesteps, which has shape |
| [batch, time, units]. |
| state_0: The cell output, which has same shape as init_h. |
| state_1: The cell hidden state, which has same shape as init_c. |
    runtime: Constant tensor which indicates the real runtime hardware. This
      value is for testing purposes and should not be used by users.
| """ |
| if not time_major: |
    # The cudnn kernel prefers the input to be time major.
| inputs = array_ops.transpose(inputs, perm=(1, 0, 2)) |
| init_h = array_ops.expand_dims(init_h, axis=0) |
| init_c = array_ops.expand_dims(init_c, axis=0) |
| |
| weights = array_ops.split(kernel, 4, axis=1) |
| weights += array_ops.split(recurrent_kernel, 4, axis=1) |
  # CuDNN has an extra set of biases for inputs; we disable them (setting them
  # to 0), so that mathematically it is the same as the canonical LSTM
  # implementation.
| full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0) |
| |
| params = _canonical_to_params( |
| weights=weights, |
| biases=array_ops.split(full_bias, 8), |
| shape=constant_op.constant([-1]), |
| transpose_weights=True) |
| |
| if mask is not None: |
| sequence_length = calculate_sequence_by_mask(mask, time_major) |
| if go_backwards: |
      # Three reversals are required. E.g.,
      # normal input = [1, 2, 3, 0, 0]  # where 0s need to be masked
      # reversed_input_to_cudnn = [3, 2, 1, 0, 0]
      # output_from_cudnn = [6, 5, 4, 0, 0]
      # expected_output = [0, 0, 6, 5, 4]
| inputs = array_ops.reverse_sequence_v2(inputs, sequence_length, |
| seq_axis=0, batch_axis=1) |
| outputs, h, c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3( |
| inputs, input_h=init_h, input_c=init_c, params=params, is_training=True, |
| rnn_mode='lstm', sequence_lengths=sequence_length) |
| if go_backwards: |
| outputs = array_ops.reverse_sequence_v2(outputs, sequence_length, |
| seq_axis=0, batch_axis=1) |
| outputs = array_ops.reverse(outputs, axis=[0]) |
| else: |
| if go_backwards: |
      # Reverse axis 0 since the input is already converted to time major.
| inputs = array_ops.reverse(inputs, axis=[0]) |
| outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn( |
| inputs, input_h=init_h, input_c=init_c, params=params, is_training=True, |
| rnn_mode='lstm') |
| |
| last_output = outputs[-1] |
| if not time_major: |
| outputs = array_ops.transpose(outputs, perm=[1, 0, 2]) |
| h = h[0] |
| c = c[0] |
| |
  # In the case of variable-length input, the cudnn kernel fills zeros for
  # the output, whereas the default Keras behavior is to carry over the
  # output from t-1, so that in the return_sequences=False case, the user can
  # quickly get the final effective output instead of just 0s at the last
  # timestep. In order to mimic the default Keras behavior, we copy the final
  # h state as last_output, since it is numerically the same as the output.
| if mask is not None: |
| last_output = h |
| return last_output, outputs, h, c, _runtime(_RUNTIME_GPU) |
| |
| |
| def lstm_with_backend_selection( |
| inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, time_major, |
| go_backwards, activation, recurrent_activation): |
| """Call the LSTM with optimized backend kernel selection. |
| |
| Under the hood, this function will create two TF function, one with the most |
| generic kernel and can run on all device condition, and the second one with |
| CuDNN specific kernel, which can only run on GPU. |
| |
| The first function will be called with normal_lstm_params, while the second |
| function is not called, but only registered in the graph. The Grappler will |
| do the proper graph rewrite and swap the optimized TF function based on the |
| device placement. |
| |
| Args: |
| inputs: Input tensor of LSTM layer. |
| init_h: Initial state tensor for the cell output. |
| init_c: Initial state tensor for the cell hidden state. |
| kernel: Weights for cell kernel. |
| recurrent_kernel: Weights for cell recurrent kernel. |
| bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias |
| is used in this case. |
    mask: Boolean tensor for masking out the steps within the sequence.
| time_major: Boolean, whether the inputs are in the format of |
| [time, batch, feature] or [batch, time, feature]. |
| go_backwards: Boolean (default False). If True, process the input sequence |
| backwards and return the reversed sequence. |
| activation: Activation function to use for output. |
| recurrent_activation: Activation function to use for hidden recurrent state. |
| |
| Returns: |
| List of output tensors, same as standard_lstm. |
| """ |
| params = { |
| 'inputs': inputs, |
| 'init_h': init_h, |
| 'init_c': init_c, |
| 'kernel': kernel, |
| 'recurrent_kernel': recurrent_kernel, |
| 'bias': bias, |
| 'mask': mask, |
| 'time_major': time_major, |
| 'go_backwards': go_backwards, |
| 'activation': activation, |
| 'recurrent_activation': recurrent_activation |
| } |
| |
| def cudnn_lstm_with_fallback(inputs, init_h, init_c, kernel, recurrent_kernel, |
| bias, mask, time_major, go_backwards, activation, |
| recurrent_activation): |
| """Use CuDNN kernel when mask is none or strictly right padded.""" |
| if mask is None: |
| return cudnn_lstm(inputs=inputs, init_h=init_h, init_c=init_c, |
| kernel=kernel, recurrent_kernel=recurrent_kernel, |
| bias=bias, mask=mask, time_major=time_major, |
| go_backwards=go_backwards) |
    # Note that mask is a boolean tensor, which doesn't need gradient
    # calculation. When using tf.cond, a default gradient is added for it,
    # which then causes the backward function to have a signature mismatch.
    # Force the mask to not generate a gradient so that
    # implementation_selector can work properly.
| # TODO(b/80444525): Remove the stop_gradient(). |
| mask = array_ops.stop_gradient(mask) |
| |
| def input_right_padded(): |
| return cudnn_lstm(inputs=inputs, init_h=init_h, init_c=init_c, |
| kernel=kernel, recurrent_kernel=recurrent_kernel, |
| bias=bias, mask=mask, time_major=time_major, |
| go_backwards=go_backwards) |
| |
| def input_not_right_padded(): |
| return standard_lstm(inputs=inputs, init_h=init_h, init_c=init_c, |
| kernel=kernel, recurrent_kernel=recurrent_kernel, |
| bias=bias, mask=mask, time_major=time_major, |
| go_backwards=go_backwards, activation=activation, |
| recurrent_activation=recurrent_activation) |
| |
| return control_flow_ops.cond( |
| is_sequence_right_padded(mask, time_major), |
| true_fn=input_right_padded, |
| false_fn=input_not_right_padded) |
| |
  # Each time this function is called, we will give the defun-generated
  # `tf.function`s a unique identifiable API name, so that Grappler won't get
  # confused when it sees multiple LSTM layers added into the same graph, and
  # it will be able to pair up the different implementations across them.
| api_name = 'lstm_' + str(uuid.uuid4()) |
| defun_standard_lstm = _generate_defun_backend( |
| api_name, _CPU_DEVICE_NAME, standard_lstm) |
| defun_cudnn_lstm = _generate_defun_backend( |
| api_name, _GPU_DEVICE_NAME, cudnn_lstm_with_fallback) |
| |
| # Call the normal LSTM impl and register the CuDNN impl function. The |
| # grappler will kick in during session execution to optimize the graph. |
| last_output, outputs, new_h, new_c, runtime = defun_standard_lstm( |
| **params) |
| function.register(defun_cudnn_lstm, **params) |
| |
| return last_output, outputs, new_h, new_c, runtime |
| |
| |
| def is_sequence_right_padded(mask, time_major): |
| """Check the mask tensor and see if it right padded. |
| |
| For CuDNN kernel, it uses the sequence length param to skip the tailing |
| timestep. If the data is left padded, or not a strict right padding (has |
| masked value in the middle of the sequence), then CuDNN kernel won't be work |
| properly in those cases. |
| |
| Left padded data: [[False, False, True, True, True]]. |
| Right padded data: [[True, True, True, False, False]]. |
| Mixture of mask/unmasked data: [[True, False, True, False, False]]. |
| |
| Note that for the mixed data example above, the actually data RNN should see |
| are those 2 Trues (index 0 and 2), the index 1 False should be ignored and not |
| pollute the internal states. |
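
  For example (a sketch; the mask values below are illustrative):

  ```python
  import tensorflow as tf

  is_sequence_right_padded(
      tf.constant([[True, True, False, False]]), time_major=False)  # ==> True
  is_sequence_right_padded(
      tf.constant([[True, False, True, False]]), time_major=False)  # ==> False
  ```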
| |
| Args: |
| mask: the Boolean tensor with shape [batch, timestep] or [timestep, batch] |
| when time_major is True. |
| time_major: Boolean, whether the input mask is time major or batch major. |
| |
| Returns: |
| boolean scalar tensor, whether the mask is strictly right padded. |
| """ |
| if time_major: |
| mask = array_ops.transpose(mask) |
| max_seq_length = array_ops.shape(mask)[1] |
| count_of_true = math_ops.reduce_sum(math_ops.cast(mask, dtypes.int32), axis=1) |
| right_padded_mask = array_ops.sequence_mask( |
| count_of_true, maxlen=max_seq_length) |
| return math_ops.reduce_all(math_ops.equal(mask, right_padded_mask)) |
| |
| |
| def calculate_sequence_by_mask(mask, time_major): |
| """Calculate the sequence length tensor (1-D) based on the masking tensor. |
| |
| The masking tensor is a 2D boolean tensor with shape [batch, timestep]. For |
| any timestep that should be masked, the corresponding field will be False. |
| Consider the following example: |
| a = [[True, True, False, False], |
| [True, True, True, False]] |
  It is a (2, 4) tensor, and the corresponding sequence length result should
  be a 1-D tensor with values [2, 3]. Note that the masking tensor must be
  right padded, which can be checked by, e.g., `is_sequence_right_padded()`.
| |
| Args: |
| mask: Boolean tensor with shape [batch, timestep] or [timestep, batch] if |
| time_major=True. |
| time_major: Boolean, which indicates whether the mask is time major or batch |
| major. |
| Returns: |
| sequence_length: 1D int32 tensor. |
| """ |
| timestep_index = 0 if time_major else 1 |
| return math_ops.reduce_sum(math_ops.cast(mask, dtypes.int32), |
| axis=timestep_index) |
| |
| |
| def _generate_defun_backend(unique_api_name, preferred_device, func): |
| function_attributes = { |
| _DEFUN_API_NAME_ATTRIBUTE: unique_api_name, |
| _DEFUN_DEVICE_ATTRIBUTE: preferred_device, |
| } |
| return function.defun_with_attributes(func=func, |
| attributes=function_attributes, |
| autograph=False) |
| |
| |
| def _get_context_device_type(): |
| """Parse the current context and return the device type, eg CPU/GPU.""" |
| current_device = context.context().device_name |
| if current_device is None: |
| return None |
| return device.DeviceSpec.from_string(current_device).device_type |
| |
| |
| def _runtime(runtime_name): |
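  """Wrap the backend runtime id (a _RUNTIME_* constant) in a float tensor."""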
| with ops.device('/cpu:0'): |
| return constant_op.constant( |
| runtime_name, dtype=dtypes.float32, name='runtime') |