| import math |
| import torch |
| import warnings |
| import numbers |
| |
| from .module import Module |
| from ..parameter import Parameter |
| from ..utils.rnn import PackedSequence |
| from .. import init |
| from .. import _VF |
| |
| _rnn_impls = { |
| 'RNN_TANH': _VF.rnn_tanh, |
| 'RNN_RELU': _VF.rnn_relu, |
| } |
| |
| |
| def apply_permutation(tensor, permutation, dim=1): |
| # type: (Tensor, Tensor, int) -> Tensor |
| return tensor.index_select(dim, permutation) |
| |
| |
| class RNNBase(Module): |
| __constants__ = ['mode', 'input_size', 'hidden_size', 'num_layers', 'bias', |
| 'batch_first', 'dropout', 'bidirectional'] |
| |
| def __init__(self, mode, input_size, hidden_size, |
| num_layers=1, bias=True, batch_first=False, |
| dropout=0., bidirectional=False): |
| super(RNNBase, self).__init__() |
| self.mode = mode |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.num_layers = num_layers |
| self.bias = bias |
| self.batch_first = batch_first |
| self.dropout = float(dropout) |
| self.bidirectional = bidirectional |
| num_directions = 2 if bidirectional else 1 |
| |
| if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \ |
| isinstance(dropout, bool): |
| raise ValueError("dropout should be a number in range [0, 1] " |
| "representing the probability of an element being " |
| "zeroed") |
| if dropout > 0 and num_layers == 1: |
| warnings.warn("dropout option adds dropout after all but last " |
| "recurrent layer, so non-zero dropout expects " |
| "num_layers greater than 1, but got dropout={} and " |
| "num_layers={}".format(dropout, num_layers)) |
| |
| if mode == 'LSTM': |
| gate_size = 4 * hidden_size |
| elif mode == 'GRU': |
| gate_size = 3 * hidden_size |
| elif mode == 'RNN_TANH': |
| gate_size = hidden_size |
| elif mode == 'RNN_RELU': |
| gate_size = hidden_size |
| else: |
| raise ValueError("Unrecognized RNN mode: " + mode) |
| |
| self._flat_weights_names = [] |
| self._all_weights = [] |
| for layer in range(num_layers): |
| for direction in range(num_directions): |
| layer_input_size = input_size if layer == 0 else hidden_size * num_directions |
| |
| w_ih = Parameter(torch.Tensor(gate_size, layer_input_size)) |
| w_hh = Parameter(torch.Tensor(gate_size, hidden_size)) |
| b_ih = Parameter(torch.Tensor(gate_size)) |
                # The second bias vector is included for cuDNN compatibility; only one
                # bias vector is needed in the standard definition.
| b_hh = Parameter(torch.Tensor(gate_size)) |
| layer_params = (w_ih, w_hh, b_ih, b_hh) |
| |
| suffix = '_reverse' if direction == 1 else '' |
| param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] |
| if bias: |
| param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] |
| param_names = [x.format(layer, suffix) for x in param_names] |
| |
| for name, param in zip(param_names, layer_params): |
| setattr(self, name, param) |
| self._flat_weights_names.extend(param_names) |
| self._all_weights.append(param_names) |
| |
| self._flat_weights = [getattr(self, weight) for weight in self._flat_weights_names if hasattr(self, weight)] |
| self.flatten_parameters() |
| self.reset_parameters() |
| |
| def __setattr__(self, attr, value): |
| if hasattr(self, "_flat_weights_names") and attr in self._flat_weights_names: |
| # keep self._flat_weights up to date if you do self.weight = ... |
| idx = self._flat_weights_names.index(attr) |
| self._flat_weights[idx] = value |
| super(RNNBase, self).__setattr__(attr, value) |
| |
| def flatten_parameters(self): |
| """Resets parameter data pointer so that they can use faster code paths. |
| |
| Right now, this works only if the module is on the GPU and cuDNN is enabled. |
| Otherwise, it's a no-op. |
| """ |
| # Short-circuits if _flat_weights is only partially instantiated |
| if len(self._flat_weights) != len(self._flat_weights_names): |
| return |
| |
| # Short-circuits if any tensor in self._flat_weights is not acceptable to cuDNN |
| # or the tensors in _flat_weights are of different dtypes |
| first_fw = self._flat_weights[0].data |
| if not torch.is_tensor(first_fw): |
| return |
| dtype = first_fw.dtype |
| for fw in self._flat_weights: |
| if (not torch.is_tensor(fw.data) or not (fw.data.dtype == dtype) or |
| not fw.data.is_cuda or |
| not torch.backends.cudnn.is_acceptable(fw.data)): |
| return |
| |
| # If any parameters alias, we fall back to the slower, copying code path. This is |
| # a sufficient check, because overlapping parameter buffers that don't completely |
| # alias would break the assumptions of the uniqueness check in |
| # Module.named_parameters(). |
| unique_data_ptrs = set(p.data_ptr() for p in self._flat_weights) |
| if len(unique_data_ptrs) != len(self._flat_weights): |
| return |
| |
| with torch.cuda.device_of(first_fw): |
| import torch.backends.cudnn.rnn as rnn |
| |
| # Note: no_grad() is necessary since _cudnn_rnn_flatten_weight is |
| # an inplace operation on self._flat_weights |
| with torch.no_grad(): |
| torch._cudnn_rnn_flatten_weight( |
| self._flat_weights, (4 if self.bias else 2), |
| self.input_size, rnn.get_cudnn_mode(self.mode), self.hidden_size, self.num_layers, |
| self.batch_first, bool(self.bidirectional)) |
| |
| def _apply(self, fn): |
| ret = super(RNNBase, self)._apply(fn) |
| |
| # Resets _flat_weights |
        # Note: be very careful before removing this, as 3rd-party device types
| # likely rely on this behavior to properly .to() modules like LSTM. |
| self._flat_weights = [getattr(self, weight) for weight in self._flat_weights_names if hasattr(self, weight)] |
| |
| # Flattens params (on CUDA) |
| self.flatten_parameters() |
| |
| return ret |
| |
| def reset_parameters(self): |
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| init.uniform_(weight, -stdv, stdv) |
| |
| def check_input(self, input, batch_sizes): |
| # type: (Tensor, Optional[Tensor]) -> None |
| expected_input_dim = 2 if batch_sizes is not None else 3 |
| if input.dim() != expected_input_dim: |
| raise RuntimeError( |
| 'input must have {} dimensions, got {}'.format( |
| expected_input_dim, input.dim())) |
| if self.input_size != input.size(-1): |
| raise RuntimeError( |
| 'input.size(-1) must be equal to input_size. Expected {}, got {}'.format( |
| self.input_size, input.size(-1))) |
| |
| def get_expected_hidden_size(self, input, batch_sizes): |
| # type: (Tensor, Optional[Tensor]) -> Tuple[int, int, int] |
| if batch_sizes is not None: |
| mini_batch = batch_sizes[0] |
| mini_batch = int(mini_batch) |
| else: |
| mini_batch = input.size(0) if self.batch_first else input.size(1) |
| num_directions = 2 if self.bidirectional else 1 |
| expected_hidden_size = (self.num_layers * num_directions, |
| mini_batch, self.hidden_size) |
| return expected_hidden_size |
| |
| def check_hidden_size(self, hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): |
| # type: (Tensor, Tuple[int, int, int], str) -> None |
| if hx.size() != expected_hidden_size: |
| raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size()))) |
| |
| def check_forward_args(self, input, hidden, batch_sizes): |
| # type: (Tensor, Tensor, Optional[Tensor]) -> None |
| self.check_input(input, batch_sizes) |
| expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) |
| |
| self.check_hidden_size(hidden, expected_hidden_size) |
| |
| def permute_hidden(self, hx, permutation): |
| # type: (Tensor, Optional[Tensor]) -> Tensor |
| if permutation is None: |
| return hx |
| return apply_permutation(hx, permutation) |
| |
| def forward(self, input, hx=None): |
| is_packed = isinstance(input, PackedSequence) |
| if is_packed: |
| input, batch_sizes, sorted_indices, unsorted_indices = input |
| max_batch_size = batch_sizes[0] |
| max_batch_size = int(max_batch_size) |
| else: |
| batch_sizes = None |
| max_batch_size = input.size(0) if self.batch_first else input.size(1) |
| sorted_indices = None |
| unsorted_indices = None |
| |
| if hx is None: |
| num_directions = 2 if self.bidirectional else 1 |
| hx = torch.zeros(self.num_layers * num_directions, |
| max_batch_size, self.hidden_size, |
| dtype=input.dtype, device=input.device) |
| else: |
| # Each batch of the hidden state should match the input sequence that |
            # the user believes they are passing in.
| hx = self.permute_hidden(hx, sorted_indices) |
| |
| self.check_forward_args(input, hx, batch_sizes) |
| _impl = _rnn_impls[self.mode] |
| if batch_sizes is None: |
| result = _impl(input, hx, self._flat_weights, self.bias, self.num_layers, |
| self.dropout, self.training, self.bidirectional, self.batch_first) |
| else: |
| result = _impl(input, batch_sizes, hx, self._flat_weights, self.bias, |
| self.num_layers, self.dropout, self.training, self.bidirectional) |
| output = result[0] |
| hidden = result[1] |
| |
| if is_packed: |
| output = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices) |
| return output, self.permute_hidden(hidden, unsorted_indices) |
| |
| def extra_repr(self): |
| s = '{input_size}, {hidden_size}' |
| if self.num_layers != 1: |
| s += ', num_layers={num_layers}' |
| if self.bias is not True: |
| s += ', bias={bias}' |
| if self.batch_first is not False: |
| s += ', batch_first={batch_first}' |
| if self.dropout != 0: |
| s += ', dropout={dropout}' |
| if self.bidirectional is not False: |
| s += ', bidirectional={bidirectional}' |
| return s.format(**self.__dict__) |
| |
| def __setstate__(self, d): |
| super(RNNBase, self).__setstate__(d) |
| if 'all_weights' in d: |
| self._all_weights = d['all_weights'] |
| |
| if isinstance(self._all_weights[0][0], str): |
| return |
| num_layers = self.num_layers |
| num_directions = 2 if self.bidirectional else 1 |
| self._flat_weights_names = [] |
| self._all_weights = [] |
| for layer in range(num_layers): |
| for direction in range(num_directions): |
| suffix = '_reverse' if direction == 1 else '' |
| weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}'] |
| weights = [x.format(layer, suffix) for x in weights] |
| if self.bias: |
| self._all_weights += [weights] |
| self._flat_weights_names.extend(weights) |
| else: |
| self._all_weights += [weights[:2]] |
| self._flat_weights_names.extend(weights[:2]) |
| self._flat_weights = [getattr(self, weight) for weight in self._flat_weights_names if hasattr(self, weight)] |
| |
| @property |
| def all_weights(self): |
| return [[getattr(self, weight) for weight in weights] for weights in self._all_weights] |
| |
| def _replicate_for_data_parallel(self): |
| replica = super(RNNBase, self)._replicate_for_data_parallel() |
| # Need to copy these caches, otherwise the replica will share the same |
| # flat weights list. |
| replica._flat_weights = replica._flat_weights[:] |
| replica._flat_weights_names = replica._flat_weights_names[:] |
| return replica |
| |
| |
| class RNN(RNNBase): |
| r"""Applies a multi-layer Elman RNN with :math:`tanh` or :math:`ReLU` non-linearity to an |
| input sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| h_t = \text{tanh}(W_{ih} x_t + b_{ih} + W_{hh} h_{(t-1)} + b_{hh}) |
| |
| where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is |
| the input at time `t`, and :math:`h_{(t-1)}` is the hidden state of the |
| previous layer at time `t-1` or the initial hidden state at time `0`. |
| If :attr:`nonlinearity` is ``'relu'``, then `ReLU` is used instead of `tanh`. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` |
| would mean stacking two RNNs together to form a `stacked RNN`, |
| with the second RNN taking in outputs of the first RNN and |
| computing the final results. Default: 1 |
| nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as `(batch, seq, feature)`. Default: ``False`` |
| dropout: If non-zero, introduces a `Dropout` layer on the outputs of each |
| RNN layer except the last layer, with dropout probability equal to |
| :attr:`dropout`. Default: 0 |
| bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` |
| |
| Inputs: input, h_0 |
| - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features |
| of the input sequence. The input can also be a packed variable length |
| sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` |
| or :func:`torch.nn.utils.rnn.pack_sequence` |
| for details. |
| - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the initial hidden state for each element in the batch. |
| Defaults to zero if not provided. If the RNN is bidirectional, |
| num_directions should be 2, else it should be 1. |
| |
| Outputs: output, h_n |
| - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor |
| containing the output features (`h_t`) from the last layer of the RNN, |
| for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has |
| been given as the input, the output will also be a packed sequence. |
| |
| For the unpacked case, the directions can be separated |
| using ``output.view(seq_len, batch, num_directions, hidden_size)``, |
| with forward and backward being direction `0` and `1` respectively. |
| Similarly, the directions can be separated in the packed case. |
| - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the hidden state for `t = seq_len`. |
| |
| Like *output*, the layers can be separated using |
| ``h_n.view(num_layers, num_directions, batch, hidden_size)``. |
| |
| Shape: |
| - Input1: :math:`(L, N, H_{in})` tensor containing input features where |
| :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length. |
        - Input2: :math:`(S, N, H_{out})` tensor containing the initial hidden
          state for each element in the batch, where
          :math:`H_{out}=\text{hidden\_size}` and
          :math:`S=\text{num\_layers} * \text{num\_directions}`.
          If the RNN is bidirectional, num_directions should be 2, else it should be 1.
          Defaults to zero if not provided.
| - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}` |
| - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, |
| of shape `(hidden_size, input_size)` for `k = 0`. Otherwise, the shape is |
| `(hidden_size, num_directions * hidden_size)` |
| weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, |
| of shape `(hidden_size, hidden_size)` |
| bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, |
| of shape `(hidden_size)` |
| bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, |
| of shape `(hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| .. include:: cudnn_persistent_rnn.rst |
| |
| Examples:: |
| |
| >>> rnn = nn.RNN(10, 20, 2) |
| >>> input = torch.randn(5, 3, 10) |
| >>> h0 = torch.randn(2, 3, 20) |
| >>> output, hn = rnn(input, h0) |
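
        >>> # A minimal sketch (illustrative names and sizes, assuming a
        >>> # bidirectional RNN) of separating the directions described above:
        >>> birnn = nn.RNN(10, 20, 2, bidirectional=True)
        >>> bi_output, bi_hn = birnn(torch.randn(5, 3, 10))
        >>> bi_dirs = bi_output.view(5, 3, 2, 20)  # (seq_len, batch, num_directions, hidden_size)
        >>> bi_layers = bi_hn.view(2, 2, 3, 20)    # (num_layers, num_directions, batch, hidden_size)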
| """ |
| |
| def __init__(self, *args, **kwargs): |
| self.nonlinearity = kwargs.pop('nonlinearity', 'tanh') |
| if self.nonlinearity == 'tanh': |
| mode = 'RNN_TANH' |
| elif self.nonlinearity == 'relu': |
| mode = 'RNN_RELU' |
| else: |
| raise ValueError("Unknown nonlinearity '{}'".format(self.nonlinearity)) |
| super(RNN, self).__init__(mode, *args, **kwargs) |
| |
| |
# XXX: The LSTM and GRU implementations differ from RNNBase because:
# 1. we want to support nn.LSTM and nn.GRU in TorchScript, and TorchScript in
#    its current state cannot support the Python Union or Any types
# 2. TorchScript static typing does not allow a Function or Callable type in
#    Dict values, so we have to call _VF directly instead of using _rnn_impls
# 3. this is a temporary, transitional state, kept only so the change lands in
#    time for the release
#
# See https://github.com/pytorch/pytorch/pull/23266 for more discussion.
#
# TODO: remove the overriding implementations for LSTM and GRU once TorchScript
# supports expressing these two modules generally.
| class LSTM(RNNBase): |
| r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input |
| sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| \begin{array}{ll} \\ |
| i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ |
| f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ |
| g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\ |
| o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ |
| c_t = f_t * c_{(t-1)} + i_t * g_t \\ |
| h_t = o_t * \tanh(c_t) \\ |
| \end{array} |
| |
| where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell |
| state at time `t`, :math:`x_t` is the input at time `t`, :math:`h_{(t-1)}` |
| is the hidden state of the layer at time `t-1` or the initial hidden |
| state at time `0`, and :math:`i_t`, :math:`f_t`, :math:`g_t`, |
| :math:`o_t` are the input, forget, cell, and output gates, respectively. |
| :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. |
| |
| In a multilayer LSTM, the input :math:`x^{(l)}_t` of the :math:`l` -th layer |
    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
| dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random |
| variable which is :math:`0` with probability :attr:`dropout`. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` |
| would mean stacking two LSTMs together to form a `stacked LSTM`, |
| with the second LSTM taking in outputs of the first LSTM and |
| computing the final results. Default: 1 |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as (batch, seq, feature). Default: ``False`` |
| dropout: If non-zero, introduces a `Dropout` layer on the outputs of each |
| LSTM layer except the last layer, with dropout probability equal to |
| :attr:`dropout`. Default: 0 |
| bidirectional: If ``True``, becomes a bidirectional LSTM. Default: ``False`` |
| |
| Inputs: input, (h_0, c_0) |
| - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features |
| of the input sequence. |
| The input can also be a packed variable length sequence. |
| See :func:`torch.nn.utils.rnn.pack_padded_sequence` or |
| :func:`torch.nn.utils.rnn.pack_sequence` for details. |
| - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the initial hidden state for each element in the batch. |
| If the LSTM is bidirectional, num_directions should be 2, else it should be 1. |
| - **c_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the initial cell state for each element in the batch. |
| |
| If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. |
| |
| |
| Outputs: output, (h_n, c_n) |
| - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor |
| containing the output features `(h_t)` from the last layer of the LSTM, |
| for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been |
| given as the input, the output will also be a packed sequence. |
| |
| For the unpacked case, the directions can be separated |
| using ``output.view(seq_len, batch, num_directions, hidden_size)``, |
| with forward and backward being direction `0` and `1` respectively. |
| Similarly, the directions can be separated in the packed case. |
| - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the hidden state for `t = seq_len`. |
| |
| Like *output*, the layers can be separated using |
| ``h_n.view(num_layers, num_directions, batch, hidden_size)`` and similarly for *c_n*. |
| - **c_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the cell state for `t = seq_len`. |
| |
| Attributes: |
| weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer |
| `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size, input_size)` for `k = 0`. |
| Otherwise, the shape is `(4*hidden_size, num_directions * hidden_size)` |
| weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer |
| `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size, hidden_size)` |
| bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer |
| `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)` |
| bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer |
| `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| .. include:: cudnn_persistent_rnn.rst |
| |
| Examples:: |
| |
| >>> rnn = nn.LSTM(10, 20, 2) |
| >>> input = torch.randn(5, 3, 10) |
| >>> h0 = torch.randn(2, 3, 20) |
| >>> c0 = torch.randn(2, 3, 20) |
| >>> output, (hn, cn) = rnn(input, (h0, c0)) |
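
        >>> # A minimal sketch (illustrative lengths) of running the same LSTM over a
        >>> # packed variable-length batch; the output is then also a PackedSequence:
        >>> from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
        >>> packed = pack_padded_sequence(input, torch.tensor([5, 3, 2]))
        >>> packed_output, (hn, cn) = rnn(packed)
        >>> padded_output, lengths = pad_packed_sequence(packed_output)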
| """ |
| def __init__(self, *args, **kwargs): |
| super(LSTM, self).__init__('LSTM', *args, **kwargs) |
| |
| def check_forward_args(self, input, hidden, batch_sizes): |
| # type: (Tensor, Tuple[Tensor, Tensor], Optional[Tensor]) -> None |
| self.check_input(input, batch_sizes) |
| expected_hidden_size = self.get_expected_hidden_size(input, batch_sizes) |
| |
| self.check_hidden_size(hidden[0], expected_hidden_size, |
| 'Expected hidden[0] size {}, got {}') |
| self.check_hidden_size(hidden[1], expected_hidden_size, |
| 'Expected hidden[1] size {}, got {}') |
| |
| def permute_hidden(self, hx, permutation): |
| # type: (Tuple[Tensor, Tensor], Optional[Tensor]) -> Tuple[Tensor, Tensor] |
| if permutation is None: |
| return hx |
| return apply_permutation(hx[0], permutation), apply_permutation(hx[1], permutation) |
| |
| @torch._jit_internal._overload_method # noqa: F811 |
| def forward(self, input, hx=None): # noqa: F811 |
| # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tuple[Tensor, Tensor]] |
| pass |
| |
| @torch._jit_internal._overload_method # noqa: F811 |
| def forward(self, input, hx=None): # noqa: F811 |
| # type: (PackedSequence, Optional[Tuple[Tensor, Tensor]]) -> Tuple[PackedSequence, Tuple[Tensor, Tensor]] # noqa |
| pass |
| |
| def forward(self, input, hx=None): # noqa: F811 |
| orig_input = input |
| # xxx: isinstance check needs to be in conditional for TorchScript to compile |
| if isinstance(orig_input, PackedSequence): |
| input, batch_sizes, sorted_indices, unsorted_indices = input |
| max_batch_size = batch_sizes[0] |
| max_batch_size = int(max_batch_size) |
| else: |
| batch_sizes = None |
| max_batch_size = input.size(0) if self.batch_first else input.size(1) |
| sorted_indices = None |
| unsorted_indices = None |
| |
| if hx is None: |
| num_directions = 2 if self.bidirectional else 1 |
| zeros = torch.zeros(self.num_layers * num_directions, |
| max_batch_size, self.hidden_size, |
| dtype=input.dtype, device=input.device) |
| hx = (zeros, zeros) |
| else: |
| # Each batch of the hidden state should match the input sequence that |
            # the user believes they are passing in.
| hx = self.permute_hidden(hx, sorted_indices) |
| |
| self.check_forward_args(input, hx, batch_sizes) |
| if batch_sizes is None: |
| result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers, |
| self.dropout, self.training, self.bidirectional, self.batch_first) |
| else: |
| result = _VF.lstm(input, batch_sizes, hx, self._flat_weights, self.bias, |
| self.num_layers, self.dropout, self.training, self.bidirectional) |
| output = result[0] |
| hidden = result[1:] |
| # xxx: isinstance check needs to be in conditional for TorchScript to compile |
| if isinstance(orig_input, PackedSequence): |
| output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices) |
| return output_packed, self.permute_hidden(hidden, unsorted_indices) |
| else: |
| return output, self.permute_hidden(hidden, unsorted_indices) |
| |
| |
| class GRU(RNNBase): |
| r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| \begin{array}{ll} |
| r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ |
| z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ |
| n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ |
| h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} |
| \end{array} |
| |
| where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the input |
| at time `t`, :math:`h_{(t-1)}` is the hidden state of the layer |
| at time `t-1` or the initial hidden state at time `0`, and :math:`r_t`, |
| :math:`z_t`, :math:`n_t` are the reset, update, and new gates, respectively. |
| :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. |
| |
| In a multilayer GRU, the input :math:`x^{(l)}_t` of the :math:`l` -th layer |
    (:math:`l \ge 2`) is the hidden state :math:`h^{(l-1)}_t` of the previous layer multiplied by
| dropout :math:`\delta^{(l-1)}_t` where each :math:`\delta^{(l-1)}_t` is a Bernoulli random |
| variable which is :math:`0` with probability :attr:`dropout`. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| num_layers: Number of recurrent layers. E.g., setting ``num_layers=2`` |
| would mean stacking two GRUs together to form a `stacked GRU`, |
| with the second GRU taking in outputs of the first GRU and |
| computing the final results. Default: 1 |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as (batch, seq, feature). Default: ``False`` |
| dropout: If non-zero, introduces a `Dropout` layer on the outputs of each |
| GRU layer except the last layer, with dropout probability equal to |
| :attr:`dropout`. Default: 0 |
| bidirectional: If ``True``, becomes a bidirectional GRU. Default: ``False`` |
| |
| Inputs: input, h_0 |
| - **input** of shape `(seq_len, batch, input_size)`: tensor containing the features |
| of the input sequence. The input can also be a packed variable length |
| sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` |
| for details. |
| - **h_0** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the initial hidden state for each element in the batch. |
| Defaults to zero if not provided. If the RNN is bidirectional, |
| num_directions should be 2, else it should be 1. |
| |
| Outputs: output, h_n |
| - **output** of shape `(seq_len, batch, num_directions * hidden_size)`: tensor |
| containing the output features h_t from the last layer of the GRU, |
| for each `t`. If a :class:`torch.nn.utils.rnn.PackedSequence` has been |
          given as the input, the output will also be a packed sequence.

          For the unpacked case, the directions can be separated
          using ``output.view(seq_len, batch, num_directions, hidden_size)``,
          with forward and backward being direction `0` and `1` respectively.
          Similarly, the directions can be separated in the packed case.
| - **h_n** of shape `(num_layers * num_directions, batch, hidden_size)`: tensor |
| containing the hidden state for `t = seq_len` |
| |
| Like *output*, the layers can be separated using |
| ``h_n.view(num_layers, num_directions, batch, hidden_size)``. |
| |
| Shape: |
| - Input1: :math:`(L, N, H_{in})` tensor containing input features where |
| :math:`H_{in}=\text{input\_size}` and `L` represents a sequence length. |
        - Input2: :math:`(S, N, H_{out})` tensor containing the initial hidden
          state for each element in the batch, where
          :math:`H_{out}=\text{hidden\_size}` and
          :math:`S=\text{num\_layers} * \text{num\_directions}`.
          If the RNN is bidirectional, num_directions should be 2, else it should be 1.
          Defaults to zero if not provided.
| - Output1: :math:`(L, N, H_{all})` where :math:`H_{all}=\text{num\_directions} * \text{hidden\_size}` |
| - Output2: :math:`(S, N, H_{out})` tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih_l[k] : the learnable input-hidden weights of the :math:`\text{k}^{th}` layer |
| (W_ir|W_iz|W_in), of shape `(3*hidden_size, input_size)` for `k = 0`. |
| Otherwise, the shape is `(3*hidden_size, num_directions * hidden_size)` |
| weight_hh_l[k] : the learnable hidden-hidden weights of the :math:`\text{k}^{th}` layer |
| (W_hr|W_hz|W_hn), of shape `(3*hidden_size, hidden_size)` |
| bias_ih_l[k] : the learnable input-hidden bias of the :math:`\text{k}^{th}` layer |
| (b_ir|b_iz|b_in), of shape `(3*hidden_size)` |
| bias_hh_l[k] : the learnable hidden-hidden bias of the :math:`\text{k}^{th}` layer |
| (b_hr|b_hz|b_hn), of shape `(3*hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| .. include:: cudnn_persistent_rnn.rst |
| |
| Examples:: |
| |
| >>> rnn = nn.GRU(10, 20, 2) |
| >>> input = torch.randn(5, 3, 10) |
| >>> h0 = torch.randn(2, 3, 20) |
| >>> output, hn = rnn(input, h0) |
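
        >>> # A minimal sketch (illustrative names and sizes) of the batch_first layout,
        >>> # where input and output are provided as (batch, seq, feature):
        >>> rnn_bf = nn.GRU(10, 20, 2, batch_first=True)
        >>> output_bf, hn_bf = rnn_bf(torch.randn(3, 5, 10))  # output_bf: (3, 5, 20)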
| """ |
| def __init__(self, *args, **kwargs): |
| super(GRU, self).__init__('GRU', *args, **kwargs) |
| |
| @torch._jit_internal._overload_method # noqa: F811 |
| def forward(self, input, hx=None): # noqa: F811 |
| # type: (Tensor, Optional[Tensor]) -> Tuple[Tensor, Tensor] |
| pass |
| |
| @torch._jit_internal._overload_method # noqa: F811 |
| def forward(self, input, hx=None): # noqa: F811 |
| # type: (PackedSequence, Optional[Tensor]) -> Tuple[PackedSequence, Tensor] |
| pass |
| |
| def forward(self, input, hx=None): # noqa: F811 |
| orig_input = input |
| # xxx: isinstance check needs to be in conditional for TorchScript to compile |
| if isinstance(orig_input, PackedSequence): |
| input, batch_sizes, sorted_indices, unsorted_indices = input |
| max_batch_size = batch_sizes[0] |
| max_batch_size = int(max_batch_size) |
| else: |
| batch_sizes = None |
| max_batch_size = input.size(0) if self.batch_first else input.size(1) |
| sorted_indices = None |
| unsorted_indices = None |
| |
| if hx is None: |
| num_directions = 2 if self.bidirectional else 1 |
| hx = torch.zeros(self.num_layers * num_directions, |
| max_batch_size, self.hidden_size, |
| dtype=input.dtype, device=input.device) |
| else: |
| # Each batch of the hidden state should match the input sequence that |
            # the user believes they are passing in.
| hx = self.permute_hidden(hx, sorted_indices) |
| |
| self.check_forward_args(input, hx, batch_sizes) |
| if batch_sizes is None: |
| result = _VF.gru(input, hx, self._flat_weights, self.bias, self.num_layers, |
| self.dropout, self.training, self.bidirectional, self.batch_first) |
| else: |
| result = _VF.gru(input, batch_sizes, hx, self._flat_weights, self.bias, |
| self.num_layers, self.dropout, self.training, self.bidirectional) |
| output = result[0] |
| hidden = result[1] |
| |
| # xxx: isinstance check needs to be in conditional for TorchScript to compile |
| if isinstance(orig_input, PackedSequence): |
| output_packed = PackedSequence(output, batch_sizes, sorted_indices, unsorted_indices) |
| return output_packed, self.permute_hidden(hidden, unsorted_indices) |
| else: |
| return output, self.permute_hidden(hidden, unsorted_indices) |
| |
| |
| class RNNCellBase(Module): |
| __constants__ = ['input_size', 'hidden_size', 'bias'] |
| |
| def __init__(self, input_size, hidden_size, bias, num_chunks): |
| super(RNNCellBase, self).__init__() |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.bias = bias |
| self.weight_ih = Parameter(torch.Tensor(num_chunks * hidden_size, input_size)) |
| self.weight_hh = Parameter(torch.Tensor(num_chunks * hidden_size, hidden_size)) |
| if bias: |
| self.bias_ih = Parameter(torch.Tensor(num_chunks * hidden_size)) |
| self.bias_hh = Parameter(torch.Tensor(num_chunks * hidden_size)) |
| else: |
| self.register_parameter('bias_ih', None) |
| self.register_parameter('bias_hh', None) |
| self.reset_parameters() |
| |
| def extra_repr(self): |
| s = '{input_size}, {hidden_size}' |
| if 'bias' in self.__dict__ and self.bias is not True: |
| s += ', bias={bias}' |
| if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": |
| s += ', nonlinearity={nonlinearity}' |
| return s.format(**self.__dict__) |
| |
| def check_forward_input(self, input): |
| if input.size(1) != self.input_size: |
| raise RuntimeError( |
| "input has inconsistent input_size: got {}, expected {}".format( |
| input.size(1), self.input_size)) |
| |
| def check_forward_hidden(self, input, hx, hidden_label=''): |
| # type: (Tensor, Tensor, str) -> None |
| if input.size(0) != hx.size(0): |
| raise RuntimeError( |
| "Input batch size {} doesn't match hidden{} batch size {}".format( |
| input.size(0), hidden_label, hx.size(0))) |
| |
| if hx.size(1) != self.hidden_size: |
| raise RuntimeError( |
| "hidden{} has inconsistent hidden_size: got {}, expected {}".format( |
| hidden_label, hx.size(1), self.hidden_size)) |
| |
| def reset_parameters(self): |
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| init.uniform_(weight, -stdv, stdv) |
| |
| |
| class RNNCell(RNNCellBase): |
| r"""An Elman RNN cell with tanh or ReLU non-linearity. |
| |
| .. math:: |
| |
| h' = \tanh(W_{ih} x + b_{ih} + W_{hh} h + b_{hh}) |
| |
| If :attr:`nonlinearity` is `'relu'`, then ReLU is used in place of tanh. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and `b_hh`. |
| Default: ``True`` |
| nonlinearity: The non-linearity to use. Can be either ``'tanh'`` or ``'relu'``. Default: ``'tanh'`` |
| |
| Inputs: input, hidden |
| - **input** of shape `(batch, input_size)`: tensor containing input features |
| - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden |
| state for each element in the batch. |
| Defaults to zero if not provided. |
| |
| Outputs: h' |
| - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state |
| for each element in the batch |
| |
| Shape: |
| - Input1: :math:`(N, H_{in})` tensor containing input features where |
| :math:`H_{in}` = `input_size` |
| - Input2: :math:`(N, H_{out})` tensor containing the initial hidden |
| state for each element in the batch where :math:`H_{out}` = `hidden_size` |
| Defaults to zero if not provided. |
| - Output: :math:`(N, H_{out})` tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(hidden_size, input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(hidden_size, hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| Examples:: |
| |
| >>> rnn = nn.RNNCell(10, 20) |
| >>> input = torch.randn(6, 3, 10) |
| >>> hx = torch.randn(3, 20) |
| >>> output = [] |
| >>> for i in range(6): |
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
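        >>> # An illustrative follow-up (not part of the original example): stacking
        >>> # the collected steps gives a (seq_len, batch, hidden_size) tensor.
        >>> out_seq = torch.stack(output, dim=0)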
| """ |
| __constants__ = ['input_size', 'hidden_size', 'bias', 'nonlinearity'] |
| |
| def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): |
| super(RNNCell, self).__init__(input_size, hidden_size, bias, num_chunks=1) |
| self.nonlinearity = nonlinearity |
| |
| def forward(self, input, hx=None): |
| # type: (Tensor, Optional[Tensor]) -> Tensor |
| self.check_forward_input(input) |
| if hx is None: |
| hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) |
| self.check_forward_hidden(input, hx, '') |
| if self.nonlinearity == "tanh": |
| ret = _VF.rnn_tanh_cell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |
| elif self.nonlinearity == "relu": |
| ret = _VF.rnn_relu_cell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |
| else: |
| ret = input # TODO: remove when jit supports exception flow |
| raise RuntimeError( |
| "Unknown nonlinearity: {}".format(self.nonlinearity)) |
| return ret |
| |
| |
| class LSTMCell(RNNCellBase): |
| r"""A long short-term memory (LSTM) cell. |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ |
| f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ |
| g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ |
| o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ |
| c' = f * c + i * g \\ |
| h' = o * \tanh(c') \\ |
| \end{array} |
| |
| where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and |
| `b_hh`. Default: ``True`` |
| |
| Inputs: input, (h_0, c_0) |
| - **input** of shape `(batch, input_size)`: tensor containing input features |
| - **h_0** of shape `(batch, hidden_size)`: tensor containing the initial hidden |
| state for each element in the batch. |
| - **c_0** of shape `(batch, hidden_size)`: tensor containing the initial cell state |
| for each element in the batch. |
| |
| If `(h_0, c_0)` is not provided, both **h_0** and **c_0** default to zero. |
| |
| Outputs: (h_1, c_1) |
| - **h_1** of shape `(batch, hidden_size)`: tensor containing the next hidden state |
| for each element in the batch |
| - **c_1** of shape `(batch, hidden_size)`: tensor containing the next cell state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(4*hidden_size, input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(4*hidden_size, hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| Examples:: |
| |
| >>> rnn = nn.LSTMCell(10, 20) |
| >>> input = torch.randn(6, 3, 10) |
| >>> hx = torch.randn(3, 20) |
| >>> cx = torch.randn(3, 20) |
| >>> output = [] |
| >>> for i in range(6): |
        ...     hx, cx = rnn(input[i], (hx, cx))
        ...     output.append(hx)
| """ |
| |
| def __init__(self, input_size, hidden_size, bias=True): |
| super(LSTMCell, self).__init__(input_size, hidden_size, bias, num_chunks=4) |
| |
| def forward(self, input, hx=None): |
| # type: (Tensor, Optional[Tuple[Tensor, Tensor]]) -> Tuple[Tensor, Tensor] |
| self.check_forward_input(input) |
| if hx is None: |
| zeros = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) |
| hx = (zeros, zeros) |
| self.check_forward_hidden(input, hx[0], '[0]') |
| self.check_forward_hidden(input, hx[1], '[1]') |
| return _VF.lstm_cell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |
| |
| |
| class GRUCell(RNNCellBase): |
| r"""A gated recurrent unit (GRU) cell |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\ |
| z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\ |
| n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\ |
| h' = (1 - z) * n + z * h |
| \end{array} |
| |
| where :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. |
| |
| Args: |
| input_size: The number of expected features in the input `x` |
| hidden_size: The number of features in the hidden state `h` |
| bias: If ``False``, then the layer does not use bias weights `b_ih` and |
| `b_hh`. Default: ``True`` |
| |
| Inputs: input, hidden |
| - **input** of shape `(batch, input_size)`: tensor containing input features |
| - **hidden** of shape `(batch, hidden_size)`: tensor containing the initial hidden |
| state for each element in the batch. |
| Defaults to zero if not provided. |
| |
| Outputs: h' |
| - **h'** of shape `(batch, hidden_size)`: tensor containing the next hidden state |
| for each element in the batch |
| |
| Shape: |
| - Input1: :math:`(N, H_{in})` tensor containing input features where |
| :math:`H_{in}` = `input_size` |
| - Input2: :math:`(N, H_{out})` tensor containing the initial hidden |
| state for each element in the batch where :math:`H_{out}` = `hidden_size` |
| Defaults to zero if not provided. |
| - Output: :math:`(N, H_{out})` tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(3*hidden_size, input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(3*hidden_size, hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)` |
| |
| .. note:: |
| All the weights and biases are initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` |
| where :math:`k = \frac{1}{\text{hidden\_size}}` |
| |
| Examples:: |
| |
| >>> rnn = nn.GRUCell(10, 20) |
| >>> input = torch.randn(6, 3, 10) |
| >>> hx = torch.randn(3, 20) |
| >>> output = [] |
| >>> for i in range(6): |
        ...     hx = rnn(input[i], hx)
        ...     output.append(hx)
| """ |
| |
| def __init__(self, input_size, hidden_size, bias=True): |
| super(GRUCell, self).__init__(input_size, hidden_size, bias, num_chunks=3) |
| |
| def forward(self, input, hx=None): |
| # type: (Tensor, Optional[Tensor]) -> Tensor |
| self.check_forward_input(input) |
| if hx is None: |
| hx = torch.zeros(input.size(0), self.hidden_size, dtype=input.dtype, device=input.device) |
| self.check_forward_hidden(input, hx, '') |
| return _VF.gru_cell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |