| import math |
| import torch |
| import warnings |
| |
| from .module import Module |
| from ..parameter import Parameter |
| from ..utils.rnn import PackedSequence |
| |
| |
| class RNNBase(Module): |
| |
| def __init__(self, mode, input_size, hidden_size, |
| num_layers=1, bias=True, batch_first=False, |
| dropout=0, bidirectional=False): |
| super(RNNBase, self).__init__() |
| self.mode = mode |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.num_layers = num_layers |
| self.bias = bias |
| self.batch_first = batch_first |
| self.dropout = dropout |
| self.dropout_state = {} |
| self.bidirectional = bidirectional |
| num_directions = 2 if bidirectional else 1 |
| |
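| # Each layer has one weight block per gate: an LSTM uses 4 gates (input, |
| # forget, cell, output), a GRU uses 3 (reset, update, new), and a plain |
| # RNN has a single candidate transformation, hence the gate_size below. |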
| if mode == 'LSTM': |
| gate_size = 4 * hidden_size |
| elif mode == 'GRU': |
| gate_size = 3 * hidden_size |
| else: |
| gate_size = hidden_size |
| |
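| # Register per-layer, per-direction parameters under the conventional |
| # names weight_ih_l{k}, weight_hh_l{k}, bias_ih_l{k}, bias_hh_l{k}; the |
| # reverse direction of a bidirectional RNN gets a '_reverse' suffix. |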
| self._all_weights = [] |
| for layer in range(num_layers): |
| for direction in range(num_directions): |
| layer_input_size = input_size if layer == 0 else hidden_size * num_directions |
| |
| w_ih = Parameter(torch.Tensor(gate_size, layer_input_size)) |
| w_hh = Parameter(torch.Tensor(gate_size, hidden_size)) |
| b_ih = Parameter(torch.Tensor(gate_size)) |
| b_hh = Parameter(torch.Tensor(gate_size)) |
| layer_params = (w_ih, w_hh, b_ih, b_hh) |
| |
| suffix = '_reverse' if direction == 1 else '' |
| param_names = ['weight_ih_l{}{}', 'weight_hh_l{}{}'] |
| if bias: |
| param_names += ['bias_ih_l{}{}', 'bias_hh_l{}{}'] |
| param_names = [x.format(layer, suffix) for x in param_names] |
| |
| for name, param in zip(param_names, layer_params): |
| setattr(self, name, param) |
| self._all_weights.append(param_names) |
| |
| self.flatten_parameters() |
| self.reset_parameters() |
| |
| def flatten_parameters(self): |
| """Resets parameter data pointers so that they can use faster code paths. |
| |
| Right now, this works only if the module is on the GPU and cuDNN is enabled. |
| Otherwise, it's a no-op. |
| """ |
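| # Illustrative usage (a sketch, assuming a CUDA device and cuDNN are |
| # available -- otherwise this call is a no-op): |
| #   rnn = nn.LSTM(10, 20, 2).cuda() |
| #   rnn.flatten_parameters()  # re-packs all weights into one contiguous buffer |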
| any_param = next(self.parameters()).data |
| if not any_param.is_cuda or not torch.backends.cudnn.is_acceptable(any_param): |
| self._data_ptrs = [] |
| return |
| |
| # If any parameters alias, we fall back to the slower, copying code path. This is |
| # a sufficient check, because overlapping parameter buffers that don't completely |
| # alias would break the assumptions of the uniqueness check in |
| # Module.named_parameters(). |
| unique_data_ptrs = set(p.data_ptr() for l in self.all_weights for p in l) |
| if len(unique_data_ptrs) != sum(len(l) for l in self.all_weights): |
| self._data_ptrs = [] |
| return |
| |
| with torch.cuda.device_of(any_param): |
| # This is quite ugly, but it allows us to reuse the cuDNN code without larger |
| # modifications. It's really a low-level API that doesn't belong in here, but |
| # let's make this exception. |
| from torch.backends.cudnn import rnn |
| from torch.backends import cudnn |
| from torch.nn._functions.rnn import CudnnRNN |
| handle = cudnn.get_handle() |
| with warnings.catch_warnings(record=True): |
| fn = CudnnRNN( |
| self.mode, |
| self.input_size, |
| self.hidden_size, |
| num_layers=self.num_layers, |
| batch_first=self.batch_first, |
| dropout=self.dropout, |
| train=self.training, |
| bidirectional=self.bidirectional, |
| dropout_state=self.dropout_state, |
| ) |
| |
| # Initialize descriptors |
| fn.datatype = cudnn._typemap[any_param.type()] |
| fn.x_descs = cudnn.descriptor(any_param.new(1, self.input_size), 1) |
| fn.rnn_desc = rnn.init_rnn_descriptor(fn, handle) |
| |
| # Allocate buffer to hold the weights |
| self._param_buf_size = rnn.get_num_weights(handle, fn.rnn_desc, fn.x_descs[0], fn.datatype) |
| fn.weight_buf = any_param.new(self._param_buf_size).zero_() |
| fn.w_desc = rnn.init_weight_descriptor(fn, fn.weight_buf) |
| |
| # Slice off views into weight_buf |
| all_weights = [[p.data for p in l] for l in self.all_weights] |
| params = rnn.get_parameters(fn, handle, fn.weight_buf) |
| |
| # Copy weights and update their storage |
| rnn._copyParams(all_weights, params) |
| for orig_layer_param, new_layer_param in zip(all_weights, params): |
| for orig_param, new_param in zip(orig_layer_param, new_layer_param): |
| orig_param.set_(new_param.view_as(orig_param)) |
| |
| self._data_ptrs = list(p.data.data_ptr() for p in self.parameters()) |
| |
| def _apply(self, fn): |
| ret = super(RNNBase, self)._apply(fn) |
| self.flatten_parameters() |
| return ret |
| |
| def reset_parameters(self): |
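| # All weights and biases are drawn uniformly from |
| # [-1/sqrt(hidden_size), 1/sqrt(hidden_size)]. |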
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| weight.data.uniform_(-stdv, stdv) |
| |
| def check_forward_args(self, input, hidden, batch_sizes): |
| is_input_packed = batch_sizes is not None |
| expected_input_dim = 2 if is_input_packed else 3 |
| if input.dim() != expected_input_dim: |
| raise RuntimeError( |
| 'input must have {} dimensions, got {}'.format( |
| expected_input_dim, input.dim())) |
| if self.input_size != input.size(-1): |
| raise RuntimeError( |
| 'input.size(-1) must be equal to input_size. Expected {}, got {}'.format( |
| self.input_size, input.size(-1))) |
| |
| if is_input_packed: |
| mini_batch = batch_sizes[0] |
| else: |
| mini_batch = input.size(0) if self.batch_first else input.size(1) |
| |
| num_directions = 2 if self.bidirectional else 1 |
| expected_hidden_size = (self.num_layers * num_directions, |
| mini_batch, self.hidden_size) |
| |
| def check_hidden_size(hx, expected_hidden_size, msg='Expected hidden size {}, got {}'): |
| if tuple(hx.size()) != expected_hidden_size: |
| raise RuntimeError(msg.format(expected_hidden_size, tuple(hx.size()))) |
| |
| if self.mode == 'LSTM': |
| check_hidden_size(hidden[0], expected_hidden_size, |
| 'Expected hidden[0] size {}, got {}') |
| check_hidden_size(hidden[1], expected_hidden_size, |
| 'Expected hidden[1] size {}, got {}') |
| else: |
| check_hidden_size(hidden, expected_hidden_size) |
| |
| def forward(self, input, hx=None): |
| is_packed = isinstance(input, PackedSequence) |
| if is_packed: |
| input, batch_sizes = input |
| max_batch_size = batch_sizes[0] |
| else: |
| batch_sizes = None |
| max_batch_size = input.size(0) if self.batch_first else input.size(1) |
| |
| if hx is None: |
| num_directions = 2 if self.bidirectional else 1 |
| hx = torch.autograd.Variable(input.data.new(self.num_layers * |
| num_directions, |
| max_batch_size, |
| self.hidden_size).zero_(), requires_grad=False) |
| if self.mode == 'LSTM': |
| hx = (hx, hx) |
| |
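| # Take the fused cuDNN path only if every parameter still aliases the |
| # flattened buffer created by flatten_parameters(); otherwise pass the |
| # individual weight tensors. |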
| has_flat_weights = list(p.data.data_ptr() for p in self.parameters()) == self._data_ptrs |
| if has_flat_weights: |
| first_data = next(self.parameters()).data |
| assert first_data.storage().size() == self._param_buf_size |
| flat_weight = first_data.new().set_(first_data.storage(), 0, torch.Size([self._param_buf_size])) |
| else: |
| flat_weight = None |
| |
| self.check_forward_args(input, hx, batch_sizes) |
| func = self._backend.RNN( |
| self.mode, |
| self.input_size, |
| self.hidden_size, |
| num_layers=self.num_layers, |
| batch_first=self.batch_first, |
| dropout=self.dropout, |
| train=self.training, |
| bidirectional=self.bidirectional, |
| batch_sizes=batch_sizes, |
| dropout_state=self.dropout_state, |
| flat_weight=flat_weight |
| ) |
| output, hidden = func(input, self.all_weights, hx) |
| if is_packed: |
| output = PackedSequence(output, batch_sizes) |
| return output, hidden |
| |
| def __repr__(self): |
| s = '{name}({input_size}, {hidden_size}' |
| if self.num_layers != 1: |
| s += ', num_layers={num_layers}' |
| if self.bias is not True: |
| s += ', bias={bias}' |
| if self.batch_first is not False: |
| s += ', batch_first={batch_first}' |
| if self.dropout != 0: |
| s += ', dropout={dropout}' |
| if self.bidirectional is not False: |
| s += ', bidirectional={bidirectional}' |
| s += ')' |
| return s.format(name=self.__class__.__name__, **self.__dict__) |
| |
| def __setstate__(self, d): |
| super(RNNBase, self).__setstate__(d) |
| self.__dict__.setdefault('_data_ptrs', []) |
| if 'all_weights' in d: |
| self._all_weights = d['all_weights'] |
| if isinstance(self._all_weights[0][0], str): |
| return |
| num_layers = self.num_layers |
| num_directions = 2 if self.bidirectional else 1 |
| self._all_weights = [] |
| for layer in range(num_layers): |
| for direction in range(num_directions): |
| suffix = '_reverse' if direction == 1 else '' |
| weights = ['weight_ih_l{}{}', 'weight_hh_l{}{}', 'bias_ih_l{}{}', 'bias_hh_l{}{}'] |
| weights = [x.format(layer, suffix) for x in weights] |
| if self.bias: |
| self._all_weights += [weights] |
| else: |
| self._all_weights += [weights[:2]] |
| |
| @property |
| def all_weights(self): |
| return [[getattr(self, weight) for weight in weights] for weights in self._all_weights] |
| |
| |
| class RNN(RNNBase): |
| r"""Applies a multi-layer Elman RNN with tanh or ReLU non-linearity to an |
| input sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| |
| h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) |
| |
| where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is |
| the hidden state of the previous layer at time `t` or :math:`input_t` |
| for the first layer. If nonlinearity='relu', then `ReLU` is used instead |
| of `tanh`. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| num_layers: Number of recurrent layers. |
| nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' |
| bias: If ``False``, then the layer does not use bias weights b_ih and b_hh. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as (batch, seq, feature) |
| dropout: If non-zero, introduces a dropout layer on the outputs of each |
| RNN layer except the last layer |
| bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` |
| |
| Inputs: input, h_0 |
| - **input** (seq_len, batch, input_size): tensor containing the features |
| of the input sequence. The input can also be a packed variable length |
| sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` |
| for details. |
| - **h_0** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the initial hidden state for each element in the batch. |
| Defaults to zero if not provided. |
| |
| Outputs: output, h_n |
| - **output** (seq_len, batch, hidden_size * num_directions): tensor |
| containing the output features (h_k) from the last layer of the RNN, |
| for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has |
| been given as the input, the output will also be a packed sequence. |
| - **h_n** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the hidden state for k=seq_len. |
| |
| Attributes: |
| weight_ih_l[k]: the learnable input-hidden weights of the k-th layer, |
| of shape `(hidden_size x input_size)` |
| weight_hh_l[k]: the learnable hidden-hidden weights of the k-th layer, |
| of shape `(hidden_size x hidden_size)` |
| bias_ih_l[k]: the learnable input-hidden bias of the k-th layer, |
| of shape `(hidden_size)` |
| bias_hh_l[k]: the learnable hidden-hidden bias of the k-th layer, |
| of shape `(hidden_size)` |
| |
| Examples:: |
| |
| >>> rnn = nn.RNN(10, 20, 2) |
| >>> input = Variable(torch.randn(5, 3, 10)) |
| >>> h0 = Variable(torch.randn(2, 3, 20)) |
| >>> output, hn = rnn(input, h0) |
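| >>> # Illustrative: h_0 may be omitted, in which case it defaults to zeros |
| >>> output, hn = rnn(input) |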
| """ |
| |
| def __init__(self, *args, **kwargs): |
| if 'nonlinearity' in kwargs: |
| if kwargs['nonlinearity'] == 'tanh': |
| mode = 'RNN_TANH' |
| elif kwargs['nonlinearity'] == 'relu': |
| mode = 'RNN_RELU' |
| else: |
| raise ValueError("Unknown nonlinearity '{}'".format( |
| kwargs['nonlinearity'])) |
| del kwargs['nonlinearity'] |
| else: |
| mode = 'RNN_TANH' |
| |
| super(RNN, self).__init__(mode, *args, **kwargs) |
| |
| |
| class LSTM(RNNBase): |
| r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input |
| sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| i_t = \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ |
| f_t = \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ |
| g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\ |
| o_t = \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ |
| c_t = f_t * c_{(t-1)} + i_t * g_t \\ |
| h_t = o_t * \tanh(c_t) |
| \end{array} |
| |
| where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell |
| state at time `t`, :math:`x_t` is the hidden state of the previous layer at |
| time `t` or :math:`input_t` for the first layer, and :math:`i_t`, |
| :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, |
| and output gates, respectively. :math:`\sigma` is the sigmoid function. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| num_layers: Number of recurrent layers. |
| bias: If ``False``, then the layer does not use bias weights b_ih and b_hh. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as (batch, seq, feature) |
| dropout: If non-zero, introduces a dropout layer on the outputs of each |
| RNN layer except the last layer |
| bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` |
| |
| Inputs: input, (h_0, c_0) |
| - **input** (seq_len, batch, input_size): tensor containing the features |
| of the input sequence. |
| The input can also be a packed variable length sequence. |
| See :func:`torch.nn.utils.rnn.pack_padded_sequence` for details. |
| - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor |
| containing the initial hidden state for each element in the batch. |
| - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor |
| containing the initial cell state for each element in the batch. |
| |
| If (h_0, c_0) is not provided, both **h_0** and **c_0** default to zero. |
| |
| |
| Outputs: output, (h_n, c_n) |
| - **output** (seq_len, batch, hidden_size * num_directions): tensor |
| containing the output features `(h_t)` from the last layer of the RNN, |
| for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been |
| given as the input, the output will also be a packed sequence. |
| - **h_n** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the hidden state for t=seq_len |
| - **c_n** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the cell state for t=seq_len |
| |
| Attributes: |
| weight_ih_l[k] : the learnable input-hidden weights of the k-th layer |
| `(W_ii|W_if|W_ig|W_io)`, of shape `(4*hidden_size x input_size)` |
| weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer |
| `(W_hi|W_hf|W_hg|W_ho)`, of shape `(4*hidden_size x hidden_size)` |
| bias_ih_l[k] : the learnable input-hidden bias of the k-th layer |
| `(b_ii|b_if|b_ig|b_io)`, of shape `(4*hidden_size)` |
| bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer |
| `(b_hi|b_hf|b_hg|b_ho)`, of shape `(4*hidden_size)` |
| |
| Examples:: |
| |
| >>> rnn = nn.LSTM(10, 20, 2) |
| >>> input = Variable(torch.randn(5, 3, 10)) |
| >>> h0 = Variable(torch.randn(2, 3, 20)) |
| >>> c0 = Variable(torch.randn(2, 3, 20)) |
| >>> output, (hn, cn) = rnn(input, (h0, c0)) |
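| >>> # Illustrative: omitting (h0, c0) initializes both states to zeros |
| >>> output, (hn, cn) = rnn(input) |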
| """ |
| |
| def __init__(self, *args, **kwargs): |
| super(LSTM, self).__init__('LSTM', *args, **kwargs) |
| |
| |
| class GRU(RNNBase): |
| r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. |
| |
| |
| For each element in the input sequence, each layer computes the following |
| function: |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| r_t = \sigma(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ |
| z_t = \sigma(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ |
| n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ |
| h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ |
| \end{array} |
| |
| where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden |
| state of the previous layer at time `t` or :math:`input_t` for the first |
| layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, update, |
| and new gates, respectively. :math:`\sigma` is the sigmoid function. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| num_layers: Number of recurrent layers. |
| bias: If ``False``, then the layer does not use bias weights b_ih and b_hh. |
| Default: ``True`` |
| batch_first: If ``True``, then the input and output tensors are provided |
| as (batch, seq, feature) |
| dropout: If non-zero, introduces a dropout layer on the outputs of each |
| RNN layer except the last layer |
| bidirectional: If ``True``, becomes a bidirectional RNN. Default: ``False`` |
| |
| Inputs: input, h_0 |
| - **input** (seq_len, batch, input_size): tensor containing the features |
| of the input sequence. The input can also be a packed variable length |
| sequence. See :func:`torch.nn.utils.rnn.pack_padded_sequence` |
| for details. |
| - **h_0** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the initial hidden state for each element in the batch. |
| Defaults to zero if not provided. |
| |
| Outputs: output, h_n |
| - **output** (seq_len, batch, hidden_size * num_directions): tensor |
| containing the output features h_t from the last layer of the RNN, |
| for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been |
| given as the input, the output will also be a packed sequence. |
| - **h_n** (num_layers * num_directions, batch, hidden_size): tensor |
| containing the hidden state for t=seq_len |
| |
| Attributes: |
| weight_ih_l[k] : the learnable input-hidden weights of the k-th layer |
| (W_ir|W_iz|W_in), of shape `(3*hidden_size x input_size)` |
| weight_hh_l[k] : the learnable hidden-hidden weights of the k-th layer |
| (W_hr|W_hz|W_hn), of shape `(3*hidden_size x hidden_size)` |
| bias_ih_l[k] : the learnable input-hidden bias of the k-th layer |
| (b_ir|b_iz|b_in), of shape `(3*hidden_size)` |
| bias_hh_l[k] : the learnable hidden-hidden bias of the k-th layer |
| (b_hr|b_hz|b_hn), of shape `(3*hidden_size)` |
|
| Examples:: |
| |
| >>> rnn = nn.GRU(10, 20, 2) |
| >>> input = Variable(torch.randn(5, 3, 10)) |
| >>> h0 = Variable(torch.randn(2, 3, 20)) |
| >>> output, hn = rnn(input, h0) |
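| >>> # Illustrative sketch: a bidirectional GRU concatenates both directions, |
| >>> # so output has 2 * hidden_size features and h_n has |
| >>> # num_layers * num_directions entries in its first dimension |
| >>> birnn = nn.GRU(10, 20, 2, bidirectional=True) |
| >>> h0 = Variable(torch.randn(4, 3, 20)) |
| >>> output, hn = birnn(input, h0) |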
| """ |
| |
| def __init__(self, *args, **kwargs): |
| super(GRU, self).__init__('GRU', *args, **kwargs) |
| |
| |
| class RNNCellBase(Module): |
| |
| def __repr__(self): |
| s = '{name}({input_size}, {hidden_size}' |
| if 'bias' in self.__dict__ and self.bias is not True: |
| s += ', bias={bias}' |
| if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": |
| s += ', nonlinearity={nonlinearity}' |
| s += ')' |
| return s.format(name=self.__class__.__name__, **self.__dict__) |
| |
| def check_forward_input(self, input): |
| if input.size(1) != self.input_size: |
| raise RuntimeError( |
| "input has inconsistent input_size: got {}, expected {}".format( |
| input.size(1), self.input_size)) |
| |
| def check_forward_hidden(self, input, hx, hidden_label=''): |
| if input.size(0) != hx.size(0): |
| raise RuntimeError( |
| "Input batch size {} doesn't match hidden{} batch size {}".format( |
| input.size(0), hidden_label, hx.size(0))) |
| |
| if hx.size(1) != self.hidden_size: |
| raise RuntimeError( |
| "hidden{} has inconsistent hidden_size: got {}, expected {}".format( |
| hidden_label, hx.size(1), self.hidden_size)) |
| |
| |
| class RNNCell(RNNCellBase): |
| r"""An Elman RNN cell with tanh or ReLU non-linearity. |
| |
| .. math:: |
| |
| h' = \tanh(w_{ih} * x + b_{ih} + w_{hh} * h + b_{hh}) |
| |
| If nonlinearity='relu', then ReLU is used in place of tanh. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| bias: If ``False``, then the layer does not use bias weights b_ih and b_hh. |
| Default: ``True`` |
| nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' |
| |
| Inputs: input, hidden |
| - **input** (batch, input_size): tensor containing input features |
| - **hidden** (batch, hidden_size): tensor containing the initial hidden |
| state for each element in the batch. |
| |
| Outputs: h' |
| - **h'** (batch, hidden_size): tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(hidden_size x input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(hidden_size x hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(hidden_size)` |
| |
| Examples:: |
| |
| >>> rnn = nn.RNNCell(10, 20) |
| >>> input = Variable(torch.randn(6, 3, 10)) |
| >>> hx = Variable(torch.randn(3, 20)) |
| >>> output = [] |
| >>> for i in range(6): |
| ... hx = rnn(input[i], hx) |
| ... output.append(hx) |
| """ |
| |
| def __init__(self, input_size, hidden_size, bias=True, nonlinearity="tanh"): |
| super(RNNCell, self).__init__() |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.bias = bias |
| self.nonlinearity = nonlinearity |
| self.weight_ih = Parameter(torch.Tensor(hidden_size, input_size)) |
| self.weight_hh = Parameter(torch.Tensor(hidden_size, hidden_size)) |
| if bias: |
| self.bias_ih = Parameter(torch.Tensor(hidden_size)) |
| self.bias_hh = Parameter(torch.Tensor(hidden_size)) |
| else: |
| self.register_parameter('bias_ih', None) |
| self.register_parameter('bias_hh', None) |
| self.reset_parameters() |
| |
| def reset_parameters(self): |
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| weight.data.uniform_(-stdv, stdv) |
| |
| def forward(self, input, hx): |
| self.check_forward_input(input) |
| self.check_forward_hidden(input, hx) |
| if self.nonlinearity == "tanh": |
| func = self._backend.RNNTanhCell |
| elif self.nonlinearity == "relu": |
| func = self._backend.RNNReLUCell |
| else: |
| raise RuntimeError( |
| "Unknown nonlinearity: {}".format(self.nonlinearity)) |
| |
| return func( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |
| |
| |
| class LSTMCell(RNNCellBase): |
| r"""A long short-term memory (LSTM) cell. |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| i = \sigma(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ |
| f = \sigma(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ |
| g = \tanh(W_{ig} x + b_{ig} + W_{hg} h + b_{hg}) \\ |
| o = \sigma(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ |
| c' = f * c + i * g \\ |
| h' = o * \tanh(c') \\ |
| \end{array} |
| |
| where :math:`\sigma` is the sigmoid function. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| bias: If `False`, then the layer does not use bias weights `b_ih` and |
| `b_hh`. Default: ``True`` |
| |
| Inputs: input, (h_0, c_0) |
| - **input** (batch, input_size): tensor containing input features |
| - **h_0** (batch, hidden_size): tensor containing the initial hidden |
| state for each element in the batch. |
| - **c_0** (batch, hidden_size): tensor containing the initial cell state |
| for each element in the batch. |
| |
| Outputs: h_1, c_1 |
| - **h_1** (batch, hidden_size): tensor containing the next hidden state |
| for each element in the batch |
| - **c_1** (batch, hidden_size): tensor containing the next cell state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(4*hidden_size x input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(4*hidden_size x hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(4*hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(4*hidden_size)` |
| |
| Examples:: |
| |
| >>> rnn = nn.LSTMCell(10, 20) |
| >>> input = Variable(torch.randn(6, 3, 10)) |
| >>> hx = Variable(torch.randn(3, 20)) |
| >>> cx = Variable(torch.randn(3, 20)) |
| >>> output = [] |
| >>> for i in range(6): |
| ... hx, cx = rnn(input[i], (hx, cx)) |
| ... output.append(hx) |
| """ |
| |
| def __init__(self, input_size, hidden_size, bias=True): |
| super(LSTMCell, self).__init__() |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.bias = bias |
| self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) |
| self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) |
| if bias: |
| self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) |
| self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) |
| else: |
| self.register_parameter('bias_ih', None) |
| self.register_parameter('bias_hh', None) |
| self.reset_parameters() |
| |
| def reset_parameters(self): |
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| weight.data.uniform_(-stdv, stdv) |
| |
| def forward(self, input, hx): |
| self.check_forward_input(input) |
| self.check_forward_hidden(input, hx[0], '[0]') |
| self.check_forward_hidden(input, hx[1], '[1]') |
| return self._backend.LSTMCell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |
| |
| |
| class GRUCell(RNNCellBase): |
| r"""A gated recurrent unit (GRU) cell |
| |
| .. math:: |
| |
| \begin{array}{ll} |
| r = \sigma(W_{ir} x + b_{ir} + W_{hr} h + b_{hr}) \\ |
| z = \sigma(W_{iz} x + b_{iz} + W_{hz} h + b_{hz}) \\ |
| n = \tanh(W_{in} x + b_{in} + r * (W_{hn} h + b_{hn})) \\ |
| h' = (1 - z) * n + z * h |
| \end{array} |
| |
| where :math:`\sigma` is the sigmoid function. |
| |
| Args: |
| input_size: The number of expected features in the input x |
| hidden_size: The number of features in the hidden state h |
| bias: If `False`, then the layer does not use bias weights `b_ih` and |
| `b_hh`. Default: `True` |
| |
| Inputs: input, hidden |
| - **input** (batch, input_size): tensor containing input features |
| - **hidden** (batch, hidden_size): tensor containing the initial hidden |
| state for each element in the batch. |
| |
| Outputs: h' |
| - **h'**: (batch, hidden_size): tensor containing the next hidden state |
| for each element in the batch |
| |
| Attributes: |
| weight_ih: the learnable input-hidden weights, of shape |
| `(3*hidden_size x input_size)` |
| weight_hh: the learnable hidden-hidden weights, of shape |
| `(3*hidden_size x hidden_size)` |
| bias_ih: the learnable input-hidden bias, of shape `(3*hidden_size)` |
| bias_hh: the learnable hidden-hidden bias, of shape `(3*hidden_size)` |
| |
| Examples:: |
| |
| >>> rnn = nn.GRUCell(10, 20) |
| >>> input = Variable(torch.randn(6, 3, 10)) |
| >>> hx = Variable(torch.randn(3, 20)) |
| >>> output = [] |
| >>> for i in range(6): |
| ... hx = rnn(input[i], hx) |
| ... output.append(hx) |
| """ |
| |
| def __init__(self, input_size, hidden_size, bias=True): |
| super(GRUCell, self).__init__() |
| self.input_size = input_size |
| self.hidden_size = hidden_size |
| self.bias = bias |
| self.weight_ih = Parameter(torch.Tensor(3 * hidden_size, input_size)) |
| self.weight_hh = Parameter(torch.Tensor(3 * hidden_size, hidden_size)) |
| if bias: |
| self.bias_ih = Parameter(torch.Tensor(3 * hidden_size)) |
| self.bias_hh = Parameter(torch.Tensor(3 * hidden_size)) |
| else: |
| self.register_parameter('bias_ih', None) |
| self.register_parameter('bias_hh', None) |
| self.reset_parameters() |
| |
| def reset_parameters(self): |
| stdv = 1.0 / math.sqrt(self.hidden_size) |
| for weight in self.parameters(): |
| weight.data.uniform_(-stdv, stdv) |
| |
| def forward(self, input, hx): |
| self.check_forward_input(input) |
| self.check_forward_hidden(input, hx) |
| return self._backend.GRUCell( |
| input, hx, |
| self.weight_ih, self.weight_hh, |
| self.bias_ih, self.bias_hh, |
| ) |