| # Copyright 2016 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Tests for Cudnn RNN models.""" |
| from __future__ import absolute_import |
| from __future__ import division |
| from __future__ import print_function |
| |
| import argparse |
| import collections |
| import functools |
| import itertools |
| import os |
| import sys |
| import unittest |
| |
| import numpy as np |
| |
| from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn |
| from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops |
| from tensorflow.contrib.rnn.python.ops import rnn as contrib_rnn_lib |
| from tensorflow.python.eager import backprop |
| from tensorflow.python.eager import context |
| from tensorflow.python.framework import dtypes |
| from tensorflow.python.framework import ops |
| from tensorflow.python.framework import random_seed |
| from tensorflow.python.framework import test_util |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import control_flow_ops |
| from tensorflow.python.ops import gen_nn_ops |
| from tensorflow.python.ops import gradients_impl as gradients |
| from tensorflow.python.ops import init_ops |
| from tensorflow.python.ops import math_ops |
| from tensorflow.python.ops import random_ops |
| from tensorflow.python.ops import rnn as rnn_lib |
| from tensorflow.python.ops import rnn_cell_impl |
| from tensorflow.python.ops import state_ops |
| from tensorflow.python.ops import variable_scope as vs |
| from tensorflow.python.ops import variables |
| from tensorflow.python.ops.losses import losses |
| from tensorflow.python.platform import googletest |
| from tensorflow.python.platform import test |
| from tensorflow.python.platform import tf_logging as logging |
| from tensorflow.python.training import adagrad |
| from tensorflow.python.training import adam |
| from tensorflow.python.training import gradient_descent |
| from tensorflow.python.training import momentum |
| from tensorflow.python.training import rmsprop |
| from tensorflow.python.training import saver as saver_lib |
| from tensorflow.python.training.checkpointable import util as checkpointable_utils |
| |
| |
| CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM |
| CUDNN_GRU = cudnn_rnn_ops.CUDNN_GRU |
| CUDNN_RNN_RELU = cudnn_rnn_ops.CUDNN_RNN_RELU |
| CUDNN_RNN_TANH = cudnn_rnn_ops.CUDNN_RNN_TANH |
| CUDNN_RNN_UNIDIRECTION = cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION |
| CUDNN_RNN_BIDIRECTION = cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION |
| |
| CUDNN_LSTM_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_LSTM_PARAMS_PER_LAYER |
| CUDNN_GRU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_GRU_PARAMS_PER_LAYER |
| CUDNN_RNN_TANH_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_TANH_PARAMS_PER_LAYER |
| CUDNN_RNN_RELU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_RELU_PARAMS_PER_LAYER |
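# For reference: the *_PARAMS_PER_LAYER constants are the number of canonical
# weight matrices (with a matching number of bias vectors) per layer per
# direction: 8 for LSTM (4 gates x {input, recurrent}), 6 for GRU (3 gates x
# 2), and 2 for the plain tanh/relu RNNs.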
| |
| |
| class CudnnTestModel(object): |
| """Model with convenient APIs for easier building and running test graph. |
| |
| The graph built is used by all tests below to avoid repeatedly building |
| similar test graphs. |
| """ |
| |
| def __init__(self, |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=CUDNN_RNN_UNIDIRECTION, |
| dropout=0., |
| dtype=dtypes.float32, |
| training=False, |
| seed=None, |
| kernel_initializer=None, |
| bias_initializer=None): |
| if dtype not in (dtypes.float16, dtypes.float32, dtypes.float64): |
| raise ValueError("Invalid dtype: %s" % dtype) |
| self._dtype = dtype |
| |
| self._inputs = array_ops.placeholder( |
| dtype=dtype, shape=[None, None, input_size], name="inputs") |
| h = array_ops.placeholder( |
| dtype=dtype, shape=[None, None, num_units], name="h") |
| c = array_ops.placeholder( |
| dtype=dtype, shape=[None, None, num_units], name="c") |
| if rnn_mode == CUDNN_LSTM: |
| model_fn = cudnn_rnn.CudnnLSTM |
| self._initial_state = (h, c) |
| elif rnn_mode == CUDNN_GRU: |
| model_fn = cudnn_rnn.CudnnGRU |
| self._initial_state = (h,) |
| elif rnn_mode == CUDNN_RNN_TANH: |
| model_fn = cudnn_rnn.CudnnRNNTanh |
| self._initial_state = (h,) |
| elif rnn_mode == CUDNN_RNN_RELU: |
| model_fn = cudnn_rnn.CudnnRNNRelu |
| self._initial_state = (h,) |
| else: |
| raise ValueError("Invalid rnn_mode: %s" % rnn_mode) |
| self._rnn = model_fn( |
| num_layers, |
| num_units, |
| direction=direction, |
| dropout=dropout, |
| dtype=dtype, |
| seed=seed, |
| kernel_initializer=kernel_initializer, |
| bias_initializer=bias_initializer) |
| self._rnn.build([None, None, input_size]) |
| |
| self._outputs, self._output_state = self._rnn( |
| self._inputs, initial_state=self._initial_state, training=training) |
| |
| def _AddUp(self, outputs, output_state): |
| total = math_ops.reduce_sum(outputs) |
| for s in output_state: |
| total += math_ops.reduce_sum(s) |
| return total |
| |
| @property |
| def inputs(self): |
| return self._inputs |
| |
| @property |
| def initial_state(self): |
| return self._initial_state |
| |
| @property |
| def outputs(self): |
| return self._outputs |
| |
| @property |
| def output_state(self): |
| return self._output_state |
| |
| @property |
| def rnn(self): |
| return self._rnn |
| |
| @property |
| def total_sum(self): |
| return self._AddUp(self.outputs, self.output_state) |
| |
| def SynthesizeInput(self, seq_length, batch_size, seed=1234): |
| """Synthesizes input and initial state values for testing.""" |
| np.random.seed(seed) |
| num_layers = self._rnn.num_layers |
| dir_count = self._rnn.num_dirs |
| num_units = self._rnn.num_units |
| input_size = self._rnn.input_size |
| |
    np_dtype = self._dtype.as_numpy_dtype
| inputs = np.random.randn(seq_length, batch_size, |
| input_size).astype(np_dtype) |
| input_h = np.random.randn(num_layers * dir_count, batch_size, |
| num_units).astype(np_dtype) |
| if self._rnn.rnn_mode == CUDNN_LSTM: |
| input_c = np.random.randn(num_layers * dir_count, batch_size, |
| num_units).astype(np_dtype) |
| initial_state = (input_h, input_c) |
| else: |
| initial_state = (input_h,) |
| return inputs, initial_state |
| |
| def ZeroState(self, batch_size): |
| num_layers = self._rnn.num_layers |
| dir_count = self._rnn.num_dirs |
| num_units = self._rnn.num_units |
| |
    np_dtype = self._dtype.as_numpy_dtype
| input_h = np.zeros((num_layers * dir_count, batch_size, |
| num_units)).astype(np_dtype) |
| if self._rnn.rnn_mode == CUDNN_LSTM: |
| input_c = np.zeros((num_layers * dir_count, batch_size, |
| num_units)).astype(np_dtype) |
| initial_state = (input_h, input_c) |
| else: |
| initial_state = (input_h,) |
| return initial_state |
| |
| def FProp(self, inputs_t, initial_state_t, training): |
| """Builds additional subgraph with given inputs and state. |
| |
    Args:
      inputs_t: a tensor of inputs.
      initial_state_t: a tuple of initial-state tensors (h, plus c for LSTM).
      training: boolean, True if in training mode.
| Returns: |
| A tensor of the forward pass output of the model. |
| """ |
| outputs, output_state = self._rnn( |
| inputs_t, initial_state=initial_state_t, training=training) |
| return self._AddUp(outputs, output_state) |
| |
| def Feed(self, sess, inputs, initial_state=None, return_sum=True): |
| """Runs graph with given inputs and initial state.""" |
| batch_size = inputs.shape[1] |
| if initial_state is None: |
| initial_state = self.ZeroState(batch_size) |
| if return_sum: |
| return sess.run( |
| self.total_sum, |
| feed_dict={self.inputs: inputs, |
| self.initial_state: initial_state}) |
| else: |
| return sess.run( |
| [self.outputs, self.output_state], |
| feed_dict={self.inputs: inputs, |
| self.initial_state: initial_state}) |
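

# A minimal usage sketch of CudnnTestModel above. It is illustrative only: no
# test invokes it, the shapes are arbitrary assumptions, and actually running
# it needs a CUDA-enabled build.
def _ExampleFeedCudnnTestModel():
  # Imported locally to keep the sketch self-contained.
  from tensorflow.python.client import session as session_lib
  with ops.Graph().as_default() as g:
    model = CudnnTestModel(
        CUDNN_LSTM, num_layers=2, num_units=4, input_size=3)
    inputs, initial_state = model.SynthesizeInput(seq_length=5, batch_size=6)
    with session_lib.Session(graph=g) as sess:
      sess.run(variables.global_variables_initializer())
      # Feed() returns the scalar sum over outputs and output states by
      # default.
      return model.Feed(sess, inputs, initial_state)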
| |
| |
| def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None): |
| mode = rnn.rnn_mode |
| num_units = rnn.num_units |
| num_layers = rnn.num_layers |
| |
  # To reuse cuDNN-trained models, the canonical graph must use
  # cuDNN-compatible RNN cells.
| if mode == CUDNN_LSTM: |
| single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units) |
| elif mode == CUDNN_GRU: |
| single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units) |
| elif mode == CUDNN_RNN_TANH: |
| single_cell = (lambda: rnn_cell_impl.BasicRNNCell(num_units, math_ops.tanh)) |
| elif mode == CUDNN_RNN_RELU: |
| single_cell = ( |
| lambda: rnn_cell_impl.BasicRNNCell(num_units, gen_nn_ops.relu)) |
| else: |
| raise ValueError("%s is not supported!" % mode) |
| |
| if not is_bidi: |
| cell = rnn_cell_impl.MultiRNNCell( |
| [single_cell() for _ in range(num_layers)]) |
| return rnn_lib.dynamic_rnn( |
| cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope) |
| else: |
| cells_fw = [single_cell() for _ in range(num_layers)] |
| cells_bw = [single_cell() for _ in range(num_layers)] |
| |
| (outputs, output_state_fw, |
| output_state_bw) = contrib_rnn_lib.stack_bidirectional_dynamic_rnn( |
| cells_fw, |
| cells_bw, |
| inputs, |
| dtype=dtypes.float32, |
| time_major=True, |
| scope=scope) |
| return outputs, (output_state_fw, output_state_bw) |
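

# An illustrative sketch of pairing a cuDNN model with its canonical-cell
# counterpart via the helper above (not exercised by the tests; the
# placeholder shape [seq_length, batch_size, input_size] is an arbitrary
# assumption).
def _ExampleCanonicalCounterpart():
  model = CudnnTestModel(CUDNN_LSTM, num_layers=2, num_units=4, input_size=3)
  cell_inputs = array_ops.placeholder(dtypes.float32, shape=[8, 2, 3])
  return _CreateCudnnCompatibleCanonicalRNN(model.rnn, cell_inputs)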
| |
| |
| class CudnnRNNTestBasic(test_util.TensorFlowTestCase): |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testLayerBasic(self): |
| num_layers = 4 |
| num_units = 2 |
| batch_size = 8 |
| direction = CUDNN_RNN_UNIDIRECTION |
| dir_count = 1 |
| |
| with vs.variable_scope("main"): |
| kernel_initializer = init_ops.constant_initializer(0.) |
| bias_initializer = init_ops.constant_initializer(0.) |
| inputs = random_ops.random_uniform([ |
| num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) |
| |
| lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, |
| direction=direction, |
| kernel_initializer=kernel_initializer, |
| bias_initializer=bias_initializer, |
| name="awesome_lstm") |
| |
| # Build the layer |
| outputs1, _ = lstm(inputs) |
| # Reuse the layer |
| outputs2, _ = lstm(inputs) |
| |
| total_sum1 = math_ops.reduce_sum(outputs1) |
| total_sum2 = math_ops.reduce_sum(outputs2) |
| |
| with vs.variable_scope("main", reuse=True): |
| lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, |
| direction=direction, |
| kernel_initializer=kernel_initializer, |
| bias_initializer=bias_initializer, |
| name="awesome_lstm") |
| |
| # Reuse the layer |
| outputs3, _ = lstm(inputs) |
| total_sum3 = math_ops.reduce_sum(outputs3) |
| |
| self.assertEqual(1, len(variables.trainable_variables())) |
| self.assertEqual(1, len(ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS))) |
| self.assertEqual("main/awesome_lstm/opaque_kernel", |
| variables.trainable_variables()[0].op.name) |
| |
| with self.test_session(use_gpu=True) as sess: |
| sess.run(variables.global_variables_initializer()) |
| (total_sum1_v, total_sum2_v, total_sum3_v) = sess.run( |
| [total_sum1, total_sum2, total_sum3]) |
| self.assertEqual(0, total_sum1_v) |
| self.assertEqual(0, total_sum2_v) |
| self.assertEqual(0, total_sum3_v) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testOptimizersSupport(self): |
| for opt in ("adagrad", "adam", "rmsprop", "momentum", "sgd"): |
| self._TestOptimizerSupportHelper(opt) |
| |
| def _GetOptimizer(self, opt): |
| if opt == "adagrad": |
| return adagrad.AdagradOptimizer(learning_rate=1e-2) |
| elif opt == "adam": |
| return adam.AdamOptimizer(learning_rate=1e-2) |
| elif opt == "rmsprop": |
| return rmsprop.RMSPropOptimizer(learning_rate=1e-2) |
| elif opt == "momentum": |
| return momentum.MomentumOptimizer(learning_rate=1e-2, momentum=0.9) |
| elif opt == "sgd": |
| return gradient_descent.GradientDescentOptimizer(learning_rate=1e-2) |
| else: |
| raise ValueError("Unsupported optimizer: %s" % opt) |
| |
| def _TestOptimizerSupportHelper(self, opt): |
| num_layers = 4 |
| num_units = 2 |
| batch_size = 8 |
| direction = CUDNN_RNN_UNIDIRECTION |
| dir_count = 1 |
| |
| with ops.Graph().as_default() as g: |
| kernel_initializer = init_ops.constant_initializer(0.) |
| bias_initializer = init_ops.constant_initializer(0.) |
| inputs = random_ops.random_uniform([ |
| num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) |
| |
| lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, |
| direction=direction, |
| kernel_initializer=kernel_initializer, |
| bias_initializer=bias_initializer, |
| name="awesome_lstm") |
| outputs, _ = lstm(inputs) |
| loss = math_ops.reduce_sum(outputs) |
| optimizer = self._GetOptimizer(opt) |
| train_op = optimizer.minimize(loss) |
| |
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| sess.run(train_op) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveableGraphDeviceAssignment(self): |
| num_layers = 4 |
| num_units = 2 |
| batch_size = 8 |
| direction = CUDNN_RNN_UNIDIRECTION |
| dir_count = 1 |
| |
| def DeviceFn(op): |
| if op.type in ("Variable", "VariableV2"): |
| return "/cpu:0" |
| else: |
| return "/gpu:0" |
| |
| with ops.Graph().as_default() as g: |
| with ops.device(DeviceFn): |
| with vs.variable_scope("main"): |
| kernel_initializer = init_ops.constant_initializer(3.14) |
| bias_initializer = init_ops.constant_initializer(1.59) |
| inputs = random_ops.random_uniform( |
| [num_layers * dir_count, batch_size, num_units], |
| dtype=dtypes.float32) |
| |
| lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, |
| direction=direction, |
| kernel_initializer=kernel_initializer, |
| bias_initializer=bias_initializer, |
| name="awesome_lstm") |
| outputs = lstm(inputs) |
| |
| # saver is created in the scope of DeviceFn. |
| saver = saver_lib.Saver() |
| |
| with self.test_session(use_gpu=True, graph=g) as sess: |
| save_path = os.path.join(self.get_temp_dir(), |
| "test-saveable-device-assignment") |
| sess.run(variables.global_variables_initializer()) |
| |
| saver.save(sess, save_path) |
| saver.restore(sess, save_path) |
| sess.run(outputs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testDifferentShapesEager(self): |
| # Checks that kernel caching does not cause sharing of temporary storage |
| # across different input shapes when executing eagerly. |
| with context.eager_mode(): |
| with ops.device("gpu:0"): |
| first_output, _ = cudnn_rnn.CudnnGRU(1, 100)( |
| array_ops.zeros([28, 100, 28])) |
| second_output, _ = cudnn_rnn.CudnnGRU(1, 100)( |
| array_ops.zeros([28, 100, 100])) |
| self.assertAllEqual([28, 100, 100], first_output.shape) |
| self.assertAllEqual([28, 100, 100], second_output.shape) |
| |
| def _LossFunc(): |
| first_output, _ = cudnn_rnn.CudnnGRU(1, 100)( |
| array_ops.zeros([28, 100, 28])) |
| second_output, _ = cudnn_rnn.CudnnGRU(1, 100)( |
| array_ops.zeros([28, 100, 100])) |
| return (math_ops.reduce_sum(first_output) + |
| math_ops.reduce_sum(second_output)) |
| |
| backprop.implicit_grad(_LossFunc)() |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testDifferentShapesGraph(self): |
| # Tests that a single kernel instance presented with multiple input shapes |
| # does not crash with graph execution. |
| with ops.device("gpu:0"): |
| layer = cudnn_rnn.CudnnGRU(1, 100) |
| layer(array_ops.zeros([28, 100, 100])) |
| |
| def _Cond(index, accumulation): |
| del accumulation # unused |
| return math_ops.less(index, 4) |
| |
| def _Body(index, accumulation): |
| layer_input = accumulation[:, :, 10 * (1 + index % 2):] |
| output, _ = layer(layer_input) |
| return index + 1, accumulation + output |
| |
| original_input = array_ops.zeros([28, 100, 100]) |
| _, accumulation = control_flow_ops.while_loop(_Cond, _Body, |
| [0, original_input]) |
| grad, = gradients.gradients( |
| math_ops.reduce_sum(accumulation), (original_input,)) |
| init_op = variables.global_variables_initializer() |
| with self.test_session() as sess: |
| sess.run(init_op) |
| accumulation_eval, grad_eval = sess.run((accumulation, grad)) |
| self.assertAllEqual([28, 100, 100], accumulation_eval.shape) |
| self.assertAllEqual([28, 100, 100], grad_eval.shape) |
| |
| |
| # TODO(jamesqin): Transform to parameterized test after it is included in the |
| # TF open source codebase. |
| class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase): |
| |
| def _CompareWeights(self, lhs, rhs): |
| self.assertEqual(len(lhs), len(rhs)) |
| for lw, rw in zip(lhs, rhs): |
| self.assertAllEqual(lw, rw) |
| |
| def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction): |
| self.assertEqual(len(lhs), len(rhs)) |
| if rnn_mode == CUDNN_LSTM: |
| num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER |
| elif rnn_mode == CUDNN_GRU: |
| num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER |
| elif rnn_mode == CUDNN_RNN_TANH: |
| num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER |
| else: |
| num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER |
| num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2 |
| num_params_per_layer *= num_dirs |
| self.assertEqual(num_params_per_layer * num_layers, len(lhs)) |
| |
| for i in range(num_layers): |
| layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer] |
| layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer] |
| if direction == CUDNN_RNN_UNIDIRECTION: |
| self._CompareSingleLayerBiases(layer_lhs, layer_rhs) |
| else: |
| size = len(layer_lhs) |
| fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:] |
| fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:] |
| self._CompareSingleLayerBiases(fw_lhs, fw_rhs) |
| self._CompareSingleLayerBiases(bw_lhs, bw_rhs) |
| |
| def _CompareSingleLayerBiases(self, lhs, rhs): |
| self.assertEqual(len(lhs), len(rhs)) |
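    # cuDNN keeps two bias vectors per half (input-projection and recurrent);
    # only their sum enters the computation, so compare sums of corresponding
    # halves, which stay well-defined even if save/restore redistributes the
    # two components.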
| |
| lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:] |
| lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:] |
| self.assertEqual(len(lf_lhs), len(rt_lhs)) |
| self.assertEqual(len(lf_rhs), len(rt_rhs)) |
| |
| sum_lhs, sum_rhs = [], [] |
| for lf, rt in zip(lf_lhs, rt_lhs): |
| sum_lhs.append(lf + rt) |
| for lf, rt in zip(lf_rhs, rt_rhs): |
| sum_rhs.append(lf + rt) |
| self.assertEqual(len(sum_lhs), len(sum_rhs)) |
| for lf, rt in zip(sum_lhs, sum_rhs): |
| self.assertAllEqual(lf, rt) |
| |
| def _TestSaveRestoreVariable(self, rnn_mode, direction, dtype): |
| input_size = 3 |
| num_layers = 2 |
| num_units = 7 |
| with ops.Graph().as_default() as g: |
| random_seed.set_random_seed(1234) |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype) |
| rnn = model.rnn |
| save_path = os.path.join(self.get_temp_dir(), |
| "save-restore-variable-test") |
| saver = saver_lib.Saver() |
      # CudnnTestModel() creates a CudnnOpaqueParamsSaveable that lets the
      # saver save/restore cuDNN variables in canonical format.
      weights, biases = model.rnn.saveable._OpaqueParamsToCanonical()
      opaque_params = rnn.trainable_variables[0]
      reset_op = state_ops.assign(
          opaque_params,
          array_ops.zeros(array_ops.shape(opaque_params), dtype=dtype))
      # Pass the graph explicitly; otherwise an old session would be reused.
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| val = saver.save(sess, save_path) |
| self.assertEqual(save_path, val) |
| weights_v, biases_v = sess.run([weights, biases]) |
| |
| # Reset opaque param |
| sess.run(reset_op) |
| saver.restore(sess, save_path) |
| weights_v_restored, biases_v_restored = sess.run([weights, biases]) |
| |
| self._CompareWeights(weights_v, weights_v_restored) |
| self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers, |
| direction) |
| |
| def _TestSaveRestoreTwoVariables(self, rnn_mode, direction, dtype): |
| input_size = 3 |
| num_layers = 2 |
| num_units = 7 |
| with ops.Graph().as_default() as g: |
| random_seed.set_random_seed(1234) |
| with vs.variable_scope("m1"): |
| model1 = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype) |
| with vs.variable_scope("m2"): |
| model2 = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype) |
| opaque_params = (model1.rnn.trainable_variables[0], |
| model2.rnn.trainable_variables[0]) |
| weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical() |
| weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical() |
| reset_params = [ |
| state_ops.assign(params, |
| array_ops.zeros_like(params, dtype=dtype)) |
| for params in opaque_params |
| ] |
| reset_op = control_flow_ops.group(*reset_params) |
| save_path = os.path.join(self.get_temp_dir(), |
| "save-restore-variable-test2") |
| saver = saver_lib.Saver() |
      # Pass the graph explicitly; otherwise an old session would be reused.
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| val = saver.save(sess, save_path) |
| self.assertEqual(save_path, val) |
| |
| weights1_v, biases1_v = sess.run([weights1, biases1]) |
| weights2_v, biases2_v = sess.run([weights2, biases2]) |
| |
| sess.run(reset_op) |
| saver.restore(sess, save_path) |
| weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1]) |
| weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2]) |
| |
| self._CompareWeights(weights1_v, weights1_v_restored) |
| self._CompareWeights(weights2_v, weights2_v_restored) |
| self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers, |
| direction) |
| self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers, |
| direction) |
| |
| def _TestSaveRestoreOutput(self, rnn_mode, direction, dtype): |
| with ops.Graph().as_default() as g: |
| num_layers = 2 |
| num_units = 7 |
| input_size = 7 |
| seq_length = 8 |
| batch_size = 4 |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype, |
| training=False) |
| rnn = model.rnn |
| |
| save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") |
| saver = saver_lib.Saver() |
| |
| # Only one opaque var in a cudnn layer. |
| assert len(rnn.trainable_variables) == 1 |
| reset_params = state_ops.assign( |
| rnn.trainable_variables[0], |
| array_ops.zeros( |
| array_ops.shape(rnn.trainable_variables[0]), dtype=dtype)) |
| |
      # Pass the graph explicitly; otherwise an old session would be reused.
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| inputs, initial_state = model.SynthesizeInput(seq_length, batch_size) |
| total_sum_v = model.Feed(sess, inputs, initial_state) |
| val = saver.save(sess, save_path) |
| self.assertEqual(save_path, val) |
| |
| sess.run(reset_params) |
| saver.restore(sess, save_path) |
| total_sum_v_restored = model.Feed(sess, inputs, initial_state) |
| self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5) |
| |
| def _TestSaveRestoreHelper(self, rnn_mode): |
| directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] |
| dtype_list = [dtypes.float16, dtypes.float32, dtypes.float64] |
| for direction, dtype in itertools.product(directions, dtype_list): |
| self._TestSaveRestoreVariable(rnn_mode, direction, dtype) |
| self._TestSaveRestoreTwoVariables(rnn_mode, direction, dtype) |
| self._TestSaveRestoreOutput(rnn_mode, direction, dtype) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveRestoreRepeatedlyCreateCustomSaveable(self): |
| input_size = 3 |
| num_layers = 2 |
| num_units = 7 |
| with ops.Graph().as_default(): |
| random_seed.set_random_seed(1234) |
| model = CudnnTestModel( |
| CUDNN_LSTM, |
| num_layers, |
| num_units, |
| input_size, |
| direction=CUDNN_RNN_UNIDIRECTION, |
| dtype=dtypes.float32) |
| with self.assertRaisesRegexp(RuntimeError, |
| "Cudnn saveable already created"): |
| model.rnn._create_saveable() |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveRestoreLSTM(self): |
| self._TestSaveRestoreHelper(CUDNN_LSTM) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveRestoreGRU(self): |
| self._TestSaveRestoreHelper(CUDNN_GRU) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveRestoreRNNTanh(self): |
| self._TestSaveRestoreHelper(CUDNN_RNN_TANH) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSaveRestoreRNNRelu(self): |
| self._TestSaveRestoreHelper(CUDNN_RNN_RELU) |
| |
| |
| class CudnnRNNTestSaveRestoreCheckpointable(test_util.TensorFlowTestCase): |
| |
| def _VerifyCheckpoint( |
| self, checkpoint_path, compatible_cell_fn, cudnn_cell_fn, |
| num_layers, input_size, expected_variable_values, num_applications=3): |
| checkpoint_directory = self.get_temp_dir() |
| checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") |
| with ops.device("gpu:0"): |
| cudnn_layer = cudnn_cell_fn() |
| cudnn_checkpoint = checkpointable_utils.Checkpoint(cell=cudnn_layer) |
| status = cudnn_checkpoint.restore(checkpoint_path) |
| inputs = 3. * array_ops.ones([num_applications, num_layers, input_size], |
| dtype=dtypes.float32) |
| cudnn_output, _ = cudnn_layer(inputs) |
| status.run_restore_ops() |
| second_save_path = cudnn_checkpoint.save(checkpoint_prefix) |
| restore_layer = compatible_cell_fn() |
| restore_layer_checkpoint = checkpointable_utils.Checkpoint( |
| cell=restore_layer) |
| status = restore_layer_checkpoint.restore(second_save_path) |
| current_state = restore_layer.zero_state(1, dtypes.float32) |
| for _ in range(num_applications): |
| restore_layer_output, current_state = restore_layer( |
| inputs=3. * array_ops.ones([1, input_size]), |
| state=current_state) |
| status.run_restore_ops() |
| self.assertTrue(restore_layer.variables) |
| for variable, expected_value in zip( |
| restore_layer.variables, expected_variable_values): |
| self.assertAllClose(expected_value, self.evaluate(variable)) |
| self.assertAllClose(self.evaluate(restore_layer_output), |
| self.evaluate(cudnn_output)[-1, -1:, ...]) |
| |
| def _CheckpointableSingleCellUnidirectionalTestTemplate( |
| self, single_cell_fn, cudnn_cell_fn): |
    # Single-layer cuDNN cells with object-based checkpointing should be
    # checkpoint compatible with either single CudnnCompatible cells or with
    # MultiRNNCells containing one cell.
| input_size = 3 |
| save_cell_layer = single_cell_fn() |
| save_cell_layer( |
| inputs=array_ops.ones([1, input_size]), |
| state=save_cell_layer.zero_state(1, dtypes.float32)) |
| self.assertTrue(save_cell_layer.variables) |
| expected_values = [] |
| np.random.seed(10) |
| for variable in save_cell_layer.variables: |
| value = np.random.normal(size=variable.shape) |
| expected_values.append(value) |
| self.evaluate(variable.assign(value)) |
| save_checkpoint = checkpointable_utils.Checkpoint(cell=save_cell_layer) |
| checkpoint_directory = self.get_temp_dir() |
| checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") |
| first_save_path = save_checkpoint.save(checkpoint_prefix) |
| self._VerifyCheckpoint( |
| checkpoint_path=first_save_path, |
| compatible_cell_fn= |
| lambda: rnn_cell_impl.MultiRNNCell([single_cell_fn()]), |
| cudnn_cell_fn=cudnn_cell_fn, |
| num_layers=1, |
| expected_variable_values=expected_values, |
| input_size=input_size) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| @test_util.run_in_graph_and_eager_modes |
| def testLSTMCheckpointableSingleLayer(self): |
| num_units = 2 |
| direction = CUDNN_RNN_UNIDIRECTION |
| self._CheckpointableSingleCellUnidirectionalTestTemplate( |
| single_cell_fn=functools.partial( |
| cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units), |
| cudnn_cell_fn=functools.partial( |
| cudnn_rnn.CudnnLSTM, num_layers=1, num_units=num_units, |
| direction=direction, name="awesome_lstm")) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| @test_util.run_in_graph_and_eager_modes |
| def testGRUCheckpointableSingleLayer(self): |
| num_units = 2 |
| direction = CUDNN_RNN_UNIDIRECTION |
| with self.assertRaises(NotImplementedError): |
| # TODO(allenl): Implement object-based saving for GRUs and other cells. |
| self._CheckpointableSingleCellUnidirectionalTestTemplate( |
| single_cell_fn=functools.partial( |
| cudnn_rnn_ops.CudnnCompatibleGRUCell, num_units=num_units), |
| cudnn_cell_fn=functools.partial( |
| cudnn_rnn.CudnnGRU, num_layers=1, num_units=num_units, |
| direction=direction, name="awesome_gru")) |
| |
| def _CheckpointableMultiLayerTestTemplate( |
| self, single_cell_fn, cudnn_cell_fn, num_layers): |
| |
| def _MultiCellFn(): |
| return rnn_cell_impl.MultiRNNCell( |
| [single_cell_fn() for _ in range(num_layers)]) |
| input_size = 3 |
| save_graph = ops.Graph() |
| with save_graph.as_default(), self.session(graph=save_graph): |
| save_layer = _MultiCellFn() |
| save_layer(inputs=array_ops.ones([1, input_size]), |
| state=save_layer.zero_state(1, dtypes.float32)) |
| self.assertTrue(save_layer.variables) |
| expected_values = [] |
| np.random.seed(10) |
| for variable in save_layer.variables: |
| value = np.random.normal(size=variable.shape) |
| expected_values.append(value) |
| self.evaluate(variable.assign(value)) |
| save_checkpoint = checkpointable_utils.Checkpoint(cell=save_layer) |
| checkpoint_directory = self.get_temp_dir() |
| checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") |
| first_save_path = save_checkpoint.save(checkpoint_prefix) |
| self._VerifyCheckpoint( |
| checkpoint_path=first_save_path, |
| compatible_cell_fn=_MultiCellFn, cudnn_cell_fn=cudnn_cell_fn, |
| num_layers=num_layers, |
| expected_variable_values=expected_values, |
| input_size=input_size) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| @test_util.run_in_graph_and_eager_modes |
  def testCudnnCompatibleLSTMCheckpointableMultiLayer(self):
| num_units = 2 |
| num_layers = 3 |
| direction = CUDNN_RNN_UNIDIRECTION |
| self._CheckpointableMultiLayerTestTemplate( |
| single_cell_fn=functools.partial( |
| cudnn_rnn_ops.CudnnCompatibleLSTMCell, num_units=num_units), |
| cudnn_cell_fn=functools.partial( |
| cudnn_rnn.CudnnLSTM, num_layers=num_layers, num_units=num_units, |
| direction=direction, name="awesome_lstm"), |
| num_layers=num_layers) |
| |
| |
| # TODO(jamesqin): Transform to parameterized test after it is included in the |
| # TF open source codebase. |
| class CudnnRNNTestCompatibleRNNCells(test_util.TensorFlowTestCase): |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testCudnnCompatibleLSTM(self): |
| self._TestCudnnCompatibleRnnCellsHelper(CUDNN_LSTM) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testCudnnCompatibleGRU(self): |
| self._TestCudnnCompatibleRnnCellsHelper(CUDNN_GRU) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testCudnnCompatibleRNNTanh(self): |
| self._TestCudnnCompatibleRnnCellsHelper(CUDNN_RNN_TANH) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testCudnnCompatibleRNNRelu(self): |
| self._TestCudnnCompatibleRnnCellsHelper(CUDNN_RNN_RELU) |
| |
| def _TestCudnnCompatibleRnnCellsHelper(self, rnn_mode): |
| configs = [ |
| { |
| "num_layers": 1, |
| "seq_length": 3, |
| "num_units": 4, |
| "input_size": 5, |
| "batch_size": 6, |
| }, |
| { |
| "num_layers": 2, |
| "seq_length": 8, |
| "num_units": 4, |
| "input_size": 8, |
| "batch_size": 16, |
| }, |
| { |
| "num_layers": 2, |
| "seq_length": 3, |
| "num_units": 4, |
| "input_size": 5, |
| "batch_size": 6, |
| }, |
| { |
| "num_layers": 1, |
| "seq_length": 2, |
| "num_units": 2, |
| "input_size": 4, |
| "batch_size": 1, |
| }, |
| ] |
| directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] |
    for cfg, direction in itertools.product(configs, directions):
| self._TestCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"], |
| cfg["num_units"], cfg["input_size"], |
| cfg["batch_size"], rnn_mode, direction) |
| |
| def _TestCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units, |
| input_size, batch_size, rnn_mode, direction): |
| dtype = dtypes.float32 |
| # Train graph |
| with ops.Graph().as_default() as g: |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype, |
| training=True) |
| target_output = array_ops.placeholder(dtype=dtype) |
| loss_op = losses.log_loss( |
| labels=target_output, predictions=model.total_sum) |
| optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2) |
| train_op = optimizer.minimize(loss_op) |
| |
| saver = saver_lib.Saver() |
| |
| # Train Cudnn model |
| seed = 0 |
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| # Train 128 steps |
| num_steps = 128 |
| for _ in range(num_steps): |
| inputs, _ = model.SynthesizeInput(seq_length, batch_size, seed) |
| targets = np.random.rand() |
| sess.run( |
| train_op, |
| feed_dict={ |
| model.inputs: inputs, |
| model.initial_state: model.ZeroState(batch_size), |
| target_output: targets |
| }) |
| seed += 1 |
| |
| save_path = os.path.join(self.get_temp_dir(), |
| ("cudnn-rnn-%s-test" % rnn_mode)) |
| save_v = saver.save(sess, save_path) |
| self.assertEqual(save_path, save_v) |
| |
| # Cudnn inference graph |
| with ops.Graph().as_default() as g: |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dtype=dtype, |
| training=False) |
| rnn = model.rnn |
| saver = saver_lib.Saver() |
| |
| inference_input = np.random.rand(seq_length, batch_size, |
| input_size).astype(np.float32) |
| with self.test_session(use_gpu=True, graph=g) as sess: |
| sess.run(variables.global_variables_initializer()) |
| saver.restore(sess, save_path) |
| |
| # Cudnn inference |
| cudnn_outputs_v, cudnn_output_states_v = model.Feed( |
| sess, inference_input, return_sum=False) |
| |
| # Canonical RNN inference graph |
| with ops.Graph().as_default() as g: |
| cell_inputs = array_ops.placeholder( |
| dtype, shape=[seq_length, batch_size, input_size]) |
| if direction == CUDNN_RNN_UNIDIRECTION: |
        # outputs is one tensor; states has num_layers entries, each an
        # LSTMStateTuple (h, c) for LSTM or a single tensor otherwise.
| (outputs, states) = _CreateCudnnCompatibleCanonicalRNN(rnn, cell_inputs) |
| if rnn_mode == CUDNN_LSTM: |
| output_h = array_ops.stack([s.h for s in states]) |
| output_c = array_ops.stack([s.c for s in states]) |
| else: |
          output_state = array_ops.stack(states)
| else: |
| # outputs is one tensor. |
| # states is a tuple of 2 tuples: |
| # each sub tuple is num_layer tuples, each with 2 tensors. |
| (outputs, states) = _CreateCudnnCompatibleCanonicalRNN( |
| rnn, cell_inputs, is_bidi=True) |
| output_state_fw, output_state_bw = states |
| if rnn_mode == CUDNN_LSTM: |
| output_h, output_c = [], [] |
| for s_fw, s_bw in zip(output_state_fw, output_state_bw): |
| output_h.append(array_ops.stack([s_fw.h, s_bw.h])) |
| output_c.append(array_ops.stack([s_fw.c, s_bw.c])) |
| output_h = array_ops.concat(output_h, axis=0) |
| output_c = array_ops.concat(output_c, axis=0) |
| else: |
| output_state = [] |
| for s_fw, s_bw in zip(output_state_fw, output_state_bw): |
| output_state.append(array_ops.stack([s_fw, s_bw])) |
| output_state = array_ops.concat(output_state, axis=0) |
| saver = saver_lib.Saver() |
| |
| with self.test_session(use_gpu=True, graph=g) as sess: |
| saver.restore(sess, save_path) |
| |
        # Canonical-cell inference
| if rnn_mode == CUDNN_LSTM: |
| outputs_v, output_h_v, output_c_v = sess.run( |
| [outputs, output_h, output_c], |
| feed_dict={cell_inputs: inference_input}) |
| self.assertAllClose(cudnn_outputs_v, outputs_v) |
| cudnn_output_h_v, cudnn_output_c_v = cudnn_output_states_v |
| self.assertAllClose(cudnn_output_h_v, output_h_v) |
| self.assertAllClose(cudnn_output_c_v, output_c_v) |
| else: |
| outputs_v, output_state_v = sess.run( |
| [outputs, output_state], |
| feed_dict={cell_inputs: inference_input}) |
| self.assertAllClose(cudnn_outputs_v, outputs_v, atol=2e-5, rtol=2e-5) |
| (cudnn_output_h_v,) = cudnn_output_states_v |
| self.assertAllClose(cudnn_output_h_v, output_state_v, atol=2e-5, |
| rtol=2e-5) |
| |
| |
| class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase): |
| |
| def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size, |
| dtype, direction): |
| logging.info("Testing one lstm param size with config: %s", locals()) |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| dtype=dtype, |
| direction=direction) |
| rnn = model.rnn |
| |
| # Min param size estimate = sum(weights.size) + sum(biases.size) |
    min_params_size = (
        np.sum([np.prod(shape) for shape in rnn.canonical_weight_shapes]) +
        np.sum([shape[0] for shape in rnn.canonical_bias_shapes]))
| |
| opaque_params = rnn.trainable_variables[0] |
| with self.test_session(use_gpu=True, graph=ops.get_default_graph()): |
| variables.global_variables_initializer().run() |
| opaque_params_size_v = opaque_params.eval().size |
| self.assertLessEqual(min_params_size, opaque_params_size_v) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testOpaqueParamsSize(self): |
| test_configs = [ |
| [4, 200, 200], |
| [4, 200, 300], |
| [4, 200, 100], |
| [1, 100, 200], |
| [2, 200, 100], |
| [3, 200, 400], |
| ] |
| directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] |
| dtype_list = [dtypes.float16, dtypes.float32, dtypes.float64] |
| rnns = [CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH] |
| for (rnn, config, dtype, direction) in itertools.product( |
| rnns, test_configs, dtype_list, directions): |
| num_layers, num_units, input_size = config |
| with ops.Graph().as_default(): |
| self._TestOpaqueParamsSize(rnn, num_layers, num_units, input_size, |
| dtype, direction) |
| |
| |
| class CudnnRNNTestTraining(test_util.TensorFlowTestCase): |
| |
| def setUp(self): |
| super(CudnnRNNTestTraining, self).setUp() |
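    # Snapshot the env vars that the training tests mutate so tearDown can
    # restore them afterwards.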
| self._reset_rnd_gen_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", |
| str(False)) |
| self._rnn_use_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") |
| |
| def tearDown(self): |
| super(CudnnRNNTestTraining, self).tearDown() |
| os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = self._reset_rnd_gen_state |
| os.environ["TF_CUDNN_RNN_USE_V2"] = self._rnn_use_v2 |
| |
| def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1): |
| """Compute the numeric gradient of y wrt to x. |
| |
| Args: |
| sess: The TF session constructed with a graph containing x and y. |
| y: A scalar TF Tensor in the graph constructed in sess. |
| x: A TF Tensor in the graph constructed in sess. |
| delta: Gradient checker's small perturbation of x[i]. |
| step: Only compute numerical gradients for a subset of x values. |
| I.e. dy/dx[i] is computed if i % step == 0. |
    Returns:
      An array of the same shape and dtype as x. If x[i] is not chosen
      to compute the numerical gradient dy/dx[i], the corresponding
      value is set to 0.
| """ |
| |
| x_data = sess.run(x) |
| x_size = x_data.size |
| x_shape = x_data.shape |
| |
| numeric_grad = np.zeros(x_size, dtype=x_data.dtype) |
| |
| for i in range(0, x_size, step): |
| x_pos = x_data.copy() |
| if x_size == 1: |
| x_pos += delta |
| else: |
| x_pos.flat[i] += delta |
      y_pos_feed_dict = {x.name: x_pos}
| y_pos = sess.run(y, feed_dict=y_pos_feed_dict) |
| |
| x_neg = x_data.copy() |
| if x_size == 1: |
| x_neg -= delta |
| else: |
| x_neg.flat[i] -= delta |
      y_neg_feed_dict = {x.name: x_neg}
| y_neg = sess.run(y, feed_dict=y_neg_feed_dict) |
| numeric_grad[i] = (y_pos - y_neg) / (2 * delta) |
| return numeric_grad.reshape(x_shape) |
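
  # A toy, self-contained illustration of the central-difference scheme used
  # above (illustrative only; no test calls it): for f(x) = x**2 at x = 3,
  # (f(3 + d) - f(3 - d)) / (2 * d) recovers f'(3) = 6.
  def _NumericGradToyExample(self, delta=1e-4):
    f = lambda x: x * x
    approx = (f(3. + delta) - f(3. - delta)) / (2 * delta)
    # Central differences are exact for quadratics up to float rounding.
    assert abs(approx - 6.) < 1e-9
    return approx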
| |
| def _GetShape(self, sess, inputs): |
| if not isinstance(inputs, collections.Iterable): |
| return sess.run(array_ops.shape(inputs)) |
| else: |
| return sess.run([array_ops.shape(x) for x in inputs]) |
| |
| def _GradientCheckFp16(self, sess, y, xs, num_samples, |
| tolerance=1e-6, delta=1e-4): |
| """Gradient check for Fp16. |
| |
    Fp16 numerical gradients end up being zeros, so use a different way to
    check gradients:

    Given a multivariate function:
      y = f(x1, x2, ..., xn)
      delta_y = f(x1 + delta_x1, x2 + delta_x2, ..., xn + delta_xn) -
                f(x1, x2, ..., xn)
              = f'(x1) * delta_x1 + f'(x2) * delta_x2 + ... + f'(xn) * delta_xn
    where:
      delta_xi is a very small perturbation of xi.
      f'(xi) is the gradient of y w.r.t. xi.
| |
    The gradient check verifies that the expected delta_y calculated by the
    above equation is close to the actual delta_y.

    Args:
| sess: tf.Session object. |
| y: output tensor. |
| xs: a tensor or a list of input tensors. |
| num_samples: number of test samples to run. |
| tolerance: error tolerance. |
      delta: the order of magnitude of the input perturbation applied to
        calculate the output change w.r.t. the inputs.
| """ |
| sym_grads = self._ComputeSymGrads(sess, y, xs) |
| xs_shapes = self._GetShape(sess, xs) |
| |
| x_vals = [sess.run(x) for x in xs] |
| for _ in range(num_samples): |
| delta_xs = [delta * np.random.rand(*shape.tolist()) |
| for shape in xs_shapes] |
| |
| feed_dict = {} |
| for x, x_val, delta_x in zip(xs, x_vals, delta_xs): |
| feed_dict[x] = x_val + delta_x |
| actual_delta_y = (float(sess.run(y, feed_dict=feed_dict)) - |
| float(sess.run(y))) |
| |
| expected_delta_y = 0. |
| for sym_grad, delta_x in zip(sym_grads, delta_xs): |
| expected_delta_y += np.dot( |
| sym_grad.astype(np.float32).flatten(), |
| delta_x.astype(np.float32).flatten()) |
| self.assertAllClose(expected_delta_y, actual_delta_y, |
| atol=tolerance, rtol=tolerance) |
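
  # A toy illustration of the first-order check above (illustrative only; no
  # test calls it): for y = x1 * x2 the gradients are (x2, x1), so delta_y
  # should be close to x2 * delta_x1 + x1 * delta_x2.
  def _Fp16CheckToyExample(self):
    x1, x2 = 2., 5.
    d1, d2 = 1e-3, 2e-3
    actual_delta_y = (x1 + d1) * (x2 + d2) - x1 * x2
    expected_delta_y = x2 * d1 + x1 * d2  # f'(x1)*d1 + f'(x2)*d2
    assert abs(actual_delta_y - expected_delta_y) < 1e-5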
| |
| def _GradientCheck(self, sess, y, xs, tolerance=1e-6, delta=1e-4): |
| sym_grads = self._ComputeSymGrads(sess, y, xs) |
| |
| num_grads = [self._ComputeNumericGrad(sess, y, x, delta) for x in xs] |
| self.assertEqual(len(sym_grads), len(num_grads)) |
| for sym, num in zip(sym_grads, num_grads): |
| self.assertFalse(np.any(np.isnan(sym))) |
| self.assertFalse(np.any(np.isnan(num))) |
| self.assertAllClose(sym, num, atol=tolerance, rtol=tolerance) |
| |
| def _ComputeSymGrads(self, sess, y, xs): |
| sym_grads_t = gradients.gradients(y, xs) |
| return sess.run(sym_grads_t) |
| |
| def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, |
| batch_size, seq_length, dir_count, dropout, dtype, |
| use_v2, delta, tolerance): |
    # Gradient checking runs two forward passes with almost the same input;
    # the dropout patterns across the two runs must therefore be the same.
| logging.info("Training test with config: %s", locals()) |
| os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) |
| |
| np.random.seed(1234) |
| random_seed.set_random_seed(5678) |
| has_input_c = (rnn_mode == CUDNN_LSTM) |
| direction = (CUDNN_RNN_UNIDIRECTION |
| if dir_count == 1 else CUDNN_RNN_BIDIRECTION) |
| if use_v2: |
| os.environ["TF_CUDNN_RNN_USE_V2"] = "1" |
| else: |
| os.environ["TF_CUDNN_RNN_USE_V2"] = "0" |
| model = CudnnTestModel( |
| rnn_mode, |
| num_layers, |
| num_units, |
| input_size, |
| direction=direction, |
| dropout=dropout, |
| dtype=dtype, |
| training=True, |
| bias_initializer=init_ops.random_normal_initializer( |
| mean=1., dtype=dtype)) |
| rnn = model.rnn |
| params = rnn.trainable_variables[0] |
| |
| inputs = variables.Variable( |
| random_ops.random_uniform( |
| [seq_length, batch_size, input_size], dtype=dtype), |
| dtype=dtype) |
| input_h = variables.Variable( |
| random_ops.random_uniform( |
| [num_layers * dir_count, batch_size, num_units], dtype=dtype), |
| dtype=dtype) |
| if has_input_c: |
| input_c = variables.Variable( |
| random_ops.random_uniform( |
| [num_layers * dir_count, batch_size, num_units], dtype=dtype), |
| dtype=dtype) |
| initial_state = (input_h, input_c) |
| else: |
| initial_state = (input_h,) |
| total_sum = model.FProp(inputs, initial_state, training=True) |
| |
| with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: |
| sess.run(variables.global_variables_initializer()) |
| all_inputs = [inputs, params] |
| for s in initial_state: |
| all_inputs.append(s) |
| if dtype == dtypes.float16: |
| self._GradientCheckFp16( |
| sess, total_sum, all_inputs, |
| num_samples=FLAGS.grad_check_num_samples, |
| tolerance=tolerance, delta=delta) |
| else: |
| for _ in range(FLAGS.grad_check_num_samples): |
| # Each time choose a different set of inputs. |
| sess.run(variables.global_variables_initializer()) |
| self._GradientCheck( |
| sess, total_sum, all_inputs, |
| tolerance=tolerance, delta=delta) |
| |
| def _TestSimpleTrainingHelper(self, rnn_mode, test_configs): |
| dropouts = [0, 0.5, 1.] |
    v2_options = [False, True]
| for config, dropout, use_v2 in itertools.product(test_configs, dropouts, |
| v2_options): |
| dtype = config.get("dtype", dtypes.float32) |
| delta = config.get("delta", 1e-4) |
| tolerance = config.get("tolerance", 1e-6) |
| dir_count = config.get("dir_count", 1) |
| shape = config["shape"] |
| with ops.Graph().as_default(): |
| self._TestOneSimpleTraining( |
| rnn_mode, shape["num_layers"], shape["num_units"], |
| shape["input_size"], shape["batch_size"], shape["seq_length"], |
| dir_count, dropout, dtype, use_v2, delta, tolerance) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingLSTMFp64(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float64, |
| "tolerance": 5e-6, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingLSTMFp32(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float32, |
| "delta": 1e-4, |
| "tolerance": 9e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingLSTMFp16(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float16, |
| "delta": 1e-3, |
| "tolerance": 9e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| { |
| "dtype": dtypes.float16, |
| "delta": 1e-2, |
| "tolerance": 9e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 6, |
| "input_size": 8, |
| "batch_size": 6, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingGRUFp64(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float64, |
| "tolerance": 5e-6, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| } |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingGRUFp32(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float32, |
| "delta": 1e-3, |
| "tolerance": 4e-3, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingGRUFp16(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float16, |
| "delta": 2e-3, |
| "tolerance": 6e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNTanhFp64(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float64, |
| "tolerance": 5e-6, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNTanhFp32(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float32, |
| "delta": 1e-3, |
| "tolerance": 5e-3, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNTanhFp16(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float16, |
| "delta": 1e-3, |
| "tolerance": 5e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNReluFp64(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float64, |
| "tolerance": 5e-6, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNReluFp32(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float32, |
| "delta": 1e-4, |
| "tolerance": 3e-1, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs) |
| |
| @unittest.skipUnless(test.is_built_with_cuda(), |
| "Test only applicable when running on GPUs") |
| def testSimpleTrainingRNNReluFp16(self): |
| test_configs = [ |
| { |
| "dtype": dtypes.float16, |
| "delta": 1e-3, |
| "tolerance": 7e-2, |
| "shape": { |
| "num_layers": 2, |
| "num_units": 3, |
| "input_size": 4, |
| "batch_size": 3, |
| "seq_length": 4, |
| }, |
| }, |
| ] |
| self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs) |
| |
| |
| if __name__ == "__main__": |
| argv0 = sys.argv[0] |
| parser = argparse.ArgumentParser() |
| parser.add_argument( |
| "--grad_check_num_samples", |
| type=int, |
| default=5, |
| help="Number of samples to run for gradient check.") |
| FLAGS, unparsed = parser.parse_known_args() |
| sys.argv = [argv0] + unparsed |
| googletest.main() |