import sys
import tempfile
import unittest
from copy import deepcopy
from itertools import product
import torch
import torch.cuda
from torch.autograd import Variable, variable
from common import TestCase, to_gpu, freeze_rng_state, is_iterable
from torch.autograd.gradcheck import get_numerical_jacobian, iter_tensors, contiguous
import torch.backends.cudnn
# The tarfile module tries to obtain a file object's name in Python 3.3.
if sys.version_info[:2] == (3, 3):
TemporaryFile = tempfile.NamedTemporaryFile
else:
TemporaryFile = tempfile.TemporaryFile
TEST_CUDA = torch.cuda.is_available()
TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2
TEST_CUDNN = TEST_CUDA and torch.backends.cudnn.is_acceptable(torch.cuda.FloatTensor(1))
TEST_CUDNN_VERSION = TEST_CUDNN and torch.backends.cudnn.version()
PRECISION = 1e-5
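# Helpers for reading loss-module options that may use either the new-style
# attribute names ('size_average', 'weight') or the legacy spellings
# ('sizeAverage', 'weights').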
def get_size_average(m):
return getattr(m, 'size_average', False) or getattr(m, 'sizeAverage', False)
def get_weight(m):
result = getattr(m, 'weight', None)
if result is not None:
return result
return getattr(m, 'weights', None)
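# Each entry below describes one nn module test case. Recognized keys include
# 'module_name', 'constructor_args', 'input_size' (or 'input_fn'), an optional
# 'reference_fn' computing the expected output from (input, parameters), plus
# flags such as 'check_inplace', 'test_cuda', and 'check_gradgrad', and a
# 'desc' to disambiguate multiple tests of the same module.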
module_tests = [
dict(
module_name='Linear',
constructor_args=(10, 8),
input_size=(4, 10),
reference_fn=lambda i, p: torch.mm(i, p[0].t()) + p[1].view(1, -1).expand(4, 8)
),
dict(
module_name='Linear',
constructor_args=(10, 8, False),
input_size=(4, 10),
desc='no_bias',
reference_fn=lambda i, p: torch.mm(i, p[0].t())
),
dict(
module_name='Threshold',
constructor_args=(2, 1),
input_size=(2, 3, 4, 5),
check_inplace=True,
desc='threshold_value'
),
dict(
module_name='Threshold',
constructor_args=(2, 10),
input_size=(2, 3, 4, 5),
desc='large_value'
),
dict(
module_name='ReLU',
input_size=(2, 3, 4, 5),
check_inplace=True,
),
dict(
module_name='ReLU6',
input_size=(2, 3, 4, 5),
check_inplace=True,
),
dict(
module_name='RReLU',
input_size=(1, 2, 2),
test_cuda=False,
),
dict(
module_name='RReLU',
constructor_args=(0.1, 0.9),
input_size=(4, 4, 5),
desc='with_up_down',
test_cuda=False,
),
dict(
module_name='Hardtanh',
input_size=(3, 2, 5),
reference_fn=lambda i, _: i.clamp(-1, 1),
),
dict(
module_name='Sigmoid',
input_size=(2, 3, 4, 5)
),
dict(
module_name='Tanh',
input_size=(2, 3, 4, 5)
),
dict(
module_name='Softmax',
constructor_args=(1,),
input_size=(10, 20),
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, True).expand(10, 20)),
),
dict(
module_name='Softmax2d',
input_size=(1, 3, 10, 20),
reference_fn=lambda i, _: torch.exp(i).div(torch.exp(i).sum(1, False)),
),
dict(
module_name='LogSoftmax',
constructor_args=(1,),
input_size=(10, 20),
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, True).expand(10, 20)).log_(),
),
dict(
module_name='LogSoftmax',
constructor_args=(1,),
input_size=(1, 3, 10, 20),
reference_fn=lambda i, _: torch.exp(i).div_(torch.exp(i).sum(1, False)).log_(),
desc='multiparam',
),
dict(
module_name='ELU',
constructor_args=(2.,),
input_size=(3, 2, 5),
),
# TODO: reference function
dict(
module_name='Hardshrink',
constructor_args=(2.,),
input_size=(4, 3, 2, 4),
),
dict(
module_name='LeakyReLU',
input_size=(3, 2, 5),
check_inplace=True
),
dict(
module_name='LeakyReLU',
constructor_args=(0.5,),
input_size=(3, 2, 5),
check_inplace=True,
desc='with_negval'
),
dict(
module_name='LogSigmoid',
input_size=(2, 3, 4),
reference_fn=lambda i, _: i.sigmoid().log(),
),
dict(
module_name='Softplus',
input_size=(10, 20),
reference_fn=lambda i, _: torch.log(1 + torch.exp(i)),
),
dict(
module_name='Softplus',
constructor_args=(2,),
input_size=(10, 20),
reference_fn=lambda i, _: 1. / 2. * torch.log(1 + torch.exp(2 * i)),
desc='beta',
),
dict(
module_name='Softplus',
constructor_args=(2, -100),
input_size=(10, 20),
reference_fn=(lambda i, _: ((i * 2) > -100).type_as(i) * i +
((i * 2) <= -100).type_as(i) * 1. / 2. * torch.log(1 + torch.exp(2 * i))),
desc='beta_threshold',
),
dict(
module_name='Softshrink',
input_size=(3, 2, 5),
),
dict(
module_name='Softshrink',
constructor_args=(1,),
input_size=(3, 2, 5),
desc='lambda',
),
dict(
module_name='CrossMapLRN2d',
constructor_args=(5, 5e-3, 1e-3, 2),
input_size=(2, 3, 6, 6),
check_gradgrad=False,
),
dict(
module_name='PReLU',
input_size=(2, 3, 4),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='1d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4),
desc='1d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5),
desc='2d',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5),
desc='2d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='PReLU',
input_size=(2, 3, 4, 5, 6),
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
desc='3d',
),
dict(
module_name='PReLU',
constructor_args=(3,),
input_size=(2, 3, 4, 5, 6),
desc='3d_multiparam',
reference_fn=lambda i, p: torch.clamp(i, min=0) + torch.clamp(i, max=0) * p[0][0],
),
dict(
module_name='Softsign',
input_size=(3, 2, 5),
reference_fn=lambda i, _: i.div(1 + torch.abs(i)),
),
dict(
module_name='Softmin',
constructor_args=(1,),
input_size=(10, 20),
),
dict(
module_name='Softmin',
constructor_args=(1,),
input_size=(2, 3, 5, 10),
desc='multidim',
),
dict(
module_name='Tanhshrink',
input_size=(2, 3, 4, 5)
),
]
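# Reference implementations of the losses under test. They share a reduction
# convention: reduce=True with size_average=True averages the per-element
# losses (the NLL variants normalize by the total target weight instead),
# reduce=True alone sums them, and reduce=False returns them unreduced.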
def kldivloss_reference(input, target, size_average=True, reduce=True):
safe_target = target * (target > 0).type_as(target)
safe_target_log = (safe_target + (target <= 0).type_as(target)).log()
result = safe_target * (safe_target_log - input)
if reduce and size_average:
return result.mean()
elif reduce:
return result.sum()
return result
def nlllossNd_reference(input, target, weight=None, ignore_index=-100,
size_average=True, reduce=True):
assert input.dim() >= 3
N = input.size(0)
C = input.size(1)
out_size = (N,) + input.size()[2:]
output = Variable(torch.zeros(out_size).type_as(input))
if weight is None:
weight = Variable(torch.ones(C).type_as(input))
if torch.is_tensor(weight):
# TODO: remove this once Variables and tensors merge
weight = Variable(weight)
total_weight_data = 0
for tup in product(*[range(size) for size in out_size]):
t_nx = target[tup]
norm = 0. if ignore_index == t_nx else weight[t_nx]
input_index = list(tup)
input_index.insert(1, t_nx)
output[tup] = -input[tuple(input_index)] * norm
total_weight_data += norm
if reduce and size_average:
return output.sum() / total_weight_data
elif reduce:
return output.sum()
return output
def nllloss_reference(input, target, weight=None, ignore_index=-100,
size_average=True, reduce=True):
def nll_loss_helper(input, target, weight, ignore_index):
if target == ignore_index:
return (variable(0), variable(0))
norm = 1 if weight is None else weight[target]
result = -input[target] * norm
return (result, norm)
if torch.is_tensor(weight):
# TODO: remove this once Variables and tensors merge
weight = Variable(weight)
losses_and_weights = [nll_loss_helper(i, t, weight, ignore_index)
for i, t in zip(input, target)]
losses, weights = zip(*losses_and_weights)
if isinstance(losses[0], Variable):
losses_tensor = torch.stack(losses).type_as(input)
if not torch._C._with_scalars():
losses_tensor = losses_tensor.squeeze(1)
else:
losses_tensor = torch.Tensor(losses).type_as(input)
if reduce and size_average:
return sum(losses_tensor) / sum(weights)
elif reduce:
return sum(losses_tensor)
else:
return losses_tensor
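# Smooth L1 (Huber-style) loss: 0.5 * d^2 where |d| < 1, and |d| - 0.5 otherwise.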
def smoothl1loss_reference(input, target, size_average=True, reduce=True):
abs_diff = (input - target).abs()
ge_one_mask = (abs_diff >= 1).type_as(abs_diff)
lt_one_mask = (abs_diff < 1).type_as(abs_diff)
output = ge_one_mask * (abs_diff - 0.5) + lt_one_mask * 0.5 * (abs_diff ** 2)
if reduce and size_average:
return output.mean()
elif reduce:
return output.sum()
return output
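# Multi-label margin loss: for every valid target class y (the target list stops
# at the first negative index) and every non-target class i, accumulate
# max(0, 1 - input[y] + input[i]); the result is normalized by the input dimension.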
def _multilabelmarginloss_reference(input, target):
targets = []
for target_index in target:
if target_index < 0:
break
targets.append(target_index)
sum = 0
for target_index in targets:
for i in range(0, len(input)):
if i not in targets:
sum += max(0, 1 - input[target_index] + input[i])
return sum
def multilabelmarginloss_reference(input, target, size_average=True, reduce=True):
if input.dim() == 1:
n = 1
dim = input.size(0)
output = input.new(n).zero_()
output[0] = _multilabelmarginloss_reference(input, target)
else:
n = input.size(0)
dim = input.size(1)
output = input.new(n).zero_()
for i in range(0, n):
output[i] = _multilabelmarginloss_reference(input[i], target[i])
if reduce and size_average:
return output.mean() / dim
elif reduce:
return output.sum() / dim
return output / dim
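# Hinge embedding loss: the input itself where target == 1, and
# clamp(margin - input, min=0) elsewhere.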
def hingeembeddingloss_reference(input, target, margin=1.0, size_average=True, reduce=True):
# needed for legacy tests
if not isinstance(input, Variable):
input = Variable(input)
target = Variable(target)
margin_clamp = (margin - input).clamp(min=0).type_as(input)
output = torch.where(target == 1, input, margin_clamp)
if reduce and size_average:
return output.mean()
elif reduce:
return output.sum()
return output
def softmarginloss_reference(input, target, size_average=True, reduce=True):
output = (1 + (-input * target).exp()).log()
if reduce and size_average:
return output.mean()
elif reduce:
return output.sum()
return output
loss_reference_fns = {
'KLDivLoss': kldivloss_reference,
'NLLLoss': nllloss_reference,
'NLLLossNd': nlllossNd_reference,
'SmoothL1Loss': smoothl1loss_reference,
'MultiLabelMarginLoss': multilabelmarginloss_reference,
'HingeEmbeddingLoss': hingeembeddingloss_reference,
'SoftMarginLoss': softmarginloss_reference,
}
sample_scalar = variable(0)
# TODO: replace this with torch.rand() when Variables and tensors are merged;
# for now, this function correctly handles scalars (i.e. empty tuple sizes).
def torch_rand(sizes, requires_grad=False):
if len(sizes) == 0:
return torch.testing.rand_like(sample_scalar, requires_grad=requires_grad)
else:
return Variable(torch.rand(*sizes), requires_grad=requires_grad)
# TODO: replace this with torch.randn() when Variables and tensors are merged;
# for now, this function correctly handles scalars (i.e. empty tuple sizes).
def torch_randn(sizes, requires_grad=False):
if len(sizes) == 0:
return torch.testing.randn_like(sample_scalar, requires_grad=requires_grad)
else:
return Variable(torch.randn(*sizes), requires_grad=requires_grad)
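# Criterion (loss) test cases. On top of the module-test keys, each entry
# supplies a target via 'target_size' or 'target_fn', and its 'reference_fn'
# takes (input, target, module); flags such as 'check_no_size_average' and
# 'check_gradgrad' select additional checks in the test harness.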
criterion_tests = [
dict(
module_name='L1Loss',
input_size=(2, 3, 4),
target_size=(2, 3, 4),
reference_fn=lambda i, t, _: 1. / i.numel() *
sum((a - b).abs().sum() for a, b in zip(i, t)),
),
dict(
module_name='NLLLoss',
input_fn=lambda: torch.rand(15, 10).log(),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
reference_fn=lambda i, t, m:
nllloss_reference(i, t, size_average=get_size_average(m)),
check_no_size_average=True
),
dict(
module_name='NLLLoss',
constructor_args=(None, True, 2),
input_fn=lambda: torch.rand(15, 10).log(),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
reference_fn=lambda i, t, _: nllloss_reference(i, t, ignore_index=2),
desc='ignore_index'
),
dict(
module_name='NLLLoss',
constructor_args_fn=lambda: (torch.rand(10),),
input_fn=lambda: torch.rand(15, 10).add(1e-2).log(),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
reference_fn=lambda i, t, m:
nllloss_reference(i, t, weight=get_weight(m)),
desc='weights',
),
dict(
module_name='NLLLoss',
constructor_args_fn=lambda: (torch.rand(10), True, 2),
input_fn=lambda: torch.rand(15, 10).add(1e-2).log(),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
reference_fn=lambda i, t, m:
nllloss_reference(i, t, weight=get_weight(m), ignore_index=2),
desc='weights_ignore_index'
),
dict(
module_name='NLLLoss',
constructor_args_fn=lambda: (torch.rand(10), True, -1),
input_fn=lambda: torch.rand(15, 10).add(1e-2).log(),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10 + 1).floor().long() - 1,
reference_fn=lambda i, t, m:
nllloss_reference(i, t, weight=get_weight(m), ignore_index=-1),
desc='weights_ignore_index_neg'
),
dict(
module_name='KLDivLoss',
input_fn=lambda: torch.rand(10, 10).log(),
target_fn=lambda: torch.rand(10, 10),
reference_fn=lambda i, t, m:
kldivloss_reference(i, t, get_size_average(m), reduce=True),
check_no_size_average=True,
),
dict(
module_name='MSELoss',
input_size=(2, 3, 4, 5),
target_size=(2, 3, 4, 5),
reference_fn=lambda i, t, m: (i - t).abs().pow(2).sum() / (i.numel() if get_size_average(m) else 1),
check_no_size_average=True,
),
dict(
module_name='BCELoss',
input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2),
target_fn=lambda: torch.randn(15, 10).gt(0).double(),
reference_fn=lambda i, t, m: -(t * i.log() + (1 - t) * (1 - i).log()).sum() /
(i.numel() if get_size_average(m) else 1),
check_gradgrad=False,
),
dict(
module_name='BCELoss',
constructor_args_fn=lambda: (torch.rand(10),),
input_fn=lambda: torch.rand(15, 10).clamp_(1e-2, 1 - 1e-2),
target_fn=lambda: torch.randn(15, 10).gt(0).double(),
reference_fn=lambda i, t, m: -((t * i.log() + (1 - t) * (1 - i).log()) * get_weight(m)).sum() /
(i.numel() if get_size_average(m) else 1),
desc='weights',
check_gradgrad=False,
),
dict(
module_name='CrossEntropyLoss',
input_size=(15, 10),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
),
dict(
module_name='CrossEntropyLoss',
constructor_args_fn=lambda: (torch.rand(10),),
input_size=(15, 10),
target_fn=lambda: torch.Tensor(15).uniform_().mul(10).floor().long(),
desc='weights',
),
dict(
module_name='HingeEmbeddingLoss',
input_size=(10,),
target_fn=lambda: torch.randn(10).gt(0).double().mul_(2).sub(1),
reference_fn=lambda i, t, m:
hingeembeddingloss_reference(i, t, size_average=get_size_average(m)),
check_no_size_average=True,
),
dict(
module_name='HingeEmbeddingLoss',
constructor_args=(0.5,),
input_size=(10,),
target_fn=lambda: torch.randn(10).gt(0).double().mul_(2).sub(1),
reference_fn=lambda i, t, m:
hingeembeddingloss_reference(i, t, margin=0.5, size_average=get_size_average(m)),
desc='margin',
check_no_size_average=True,
),
dict(
module_name='MultiLabelMarginLoss',
input_size=(10,),
target_fn=lambda: torch.rand(10).mul(10).floor().long(),
reference_fn=lambda i, t, m:
multilabelmarginloss_reference(i, t, size_average=get_size_average(m)),
desc="1d",
check_no_size_average=True,
check_gradgrad=False,
),
dict(
module_name='MultiLabelMarginLoss',
input_size=(5, 10),
target_fn=lambda: torch.rand(5, 10).mul(10).floor().long(),
reference_fn=lambda i, t, m:
multilabelmarginloss_reference(i, t, size_average=get_size_average(m)),
check_no_size_average=True,
check_gradgrad=False,
),
dict(
module_name='MultiLabelSoftMarginLoss',
input_size=(5, 10),
target_fn=lambda: torch.rand(5, 10).mul(2).floor(),
check_gradgrad=False,
),
dict(
module_name='MultiLabelSoftMarginLoss',
constructor_args_fn=lambda: (torch.rand(10),),
input_size=(5, 10),
target_fn=lambda: torch.rand(5, 10).mul(2).floor(),
desc='weights',
check_gradgrad=False,
),
dict(
module_name='MultiMarginLoss',
input_size=(5, 10),
target_fn=lambda: torch.rand(5).mul(8).floor().long(),
check_gradgrad=False,
),
dict(
module_name='SmoothL1Loss',
input_size=(5, 10),
target_size=(5, 10),
check_no_size_average=True,
reference_fn=lambda i, t, m:
smoothl1loss_reference(i, t, size_average=get_size_average(m)),
),
dict(
module_name='SoftMarginLoss',
input_size=(5, 5),
target_fn=lambda: torch.randn(5, 5).sign(),
reference_fn=lambda i, t, m:
softmarginloss_reference(i, t, size_average=get_size_average(m)),
check_no_size_average=True,
),
dict(
module_name='CosineEmbeddingLoss',
input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)),
target_fn=lambda: torch.randn(15).sign(),
check_gradgrad=False,
),
dict(
module_name='CosineEmbeddingLoss',
constructor_args=(0.7,),
input_fn=lambda: (torch.rand(15, 10), torch.rand(15, 10)),
target_fn=lambda: torch.randn(15).sign(),
desc='margin',
check_gradgrad=False,
),
dict(
module_name='MarginRankingLoss',
input_fn=lambda: (torch.randn(50).mul(10), torch.randn(50).mul(10)),
target_fn=lambda: torch.randn(50).sign(),
check_no_size_average=True,
),
dict(
module_name='MarginRankingLoss',
constructor_args=(2,),
input_fn=lambda: (torch.randn(50).mul(10), torch.randn(50).mul(10)),
target_fn=lambda: torch.randn(50).sign(),
desc='margin',
check_no_size_average=True,
),
]
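# Base TestCase with Jacobian-based gradient checks: the analytical Jacobian
# obtained from backward() is compared element-wise against a finite-difference
# estimate, and the maximum discrepancy must stay within PRECISION.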
class NNTestCase(TestCase):
def _jacobian(self, input, num_out):
if isinstance(input, tuple):
return tuple(self._jacobian(elem, num_out) for elem in input)
elif isinstance(input, list):
return [self._jacobian(elem, num_out) for elem in input]
else:
return torch.zeros(input.nelement(), num_out)
def _flatten_tensors(self, x):
if torch.is_tensor(x):
if x.is_sparse:
return x.to_dense().view(-1)
else:
return x.view(-1)
elif isinstance(x, Variable):
return self._flatten_tensors(x.data)
else:
return tuple(self._flatten_tensors(a) for a in x)
def _zero_grad_input(self, input):
if isinstance(input, Variable):
if input.requires_grad and input.grad is not None:
input.grad.data.zero_()
input.grad.detach_()
elif torch.is_tensor(input):
return
else:
for i in input:
self._zero_grad_input(i)
def _analytical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True):
output = self._forward(module, input)
output_size = output.nelement()
if jacobian_input:
jacobian_inp = self._jacobian(input, output_size)
flat_jacobian_input = list(iter_tensors(jacobian_inp))
if jacobian_parameters:
num_param = sum(p.numel() for p in self._get_parameters(module)[0])
jacobian_param = torch.zeros(num_param, output_size)
for i in range(output_size):
_, d_param = self._get_parameters(module)
d_out = torch.zeros_like(output)
flat_d_out = d_out.view(-1)
flat_d_out[i] = 1
if jacobian_parameters:
self._zero_grad_parameters(module)
# Variables accumulate gradients across steps, so zero them between iterations
if jacobian_input:
self._zero_grad_input(input)
d_input = self._backward(module, input, output, d_out)
if jacobian_input:
for jacobian_x, d_x in zip(flat_jacobian_input, iter_tensors(d_input)):
jacobian_x[:, i] = d_x
if jacobian_parameters:
jacobian_param[:, i] = torch.cat(self._flatten_tensors(d_param), 0)
res = tuple()
if jacobian_input:
res += jacobian_inp,
if jacobian_parameters:
res += jacobian_param,
return res
def _numerical_jacobian(self, module, input, jacobian_input=True, jacobian_parameters=True):
def fw(input):
out = self._forward(module, input)
if isinstance(out, Variable):
return out.data
return out
res = tuple()
input = contiguous(input)
if jacobian_input:
res += get_numerical_jacobian(fw, input, input, eps=1e-6),
if jacobian_parameters:
param, _ = self._get_parameters(module)
res += torch.cat(list(get_numerical_jacobian(fw, input, p, eps=1e-6) for p in param), 0),
return res
def check_jacobian(self, module, input, jacobian_input=True):
jacobian_parameters = bool(self._get_parameters(module)[0])
analytical = self._analytical_jacobian(module, input, jacobian_input, jacobian_parameters)
numerical = self._numerical_jacobian(module, input, jacobian_input, jacobian_parameters)
analytical_t = iter_tensors(analytical)
numerical_t = iter_tensors(numerical)
# TODO: compare structure
self.assertLessEqual(
max(a.add(-1, n).abs().max() for a, n in zip(analytical_t, numerical_t)),
PRECISION
)
def check_criterion_jacobian(self, criterion, input, target):
eps = 1e-6
self._forward_criterion(criterion, input, target)
analytical_d_x = self._backward_criterion(criterion, input, target)
numerical_d_x = deepcopy(analytical_d_x)
input_t = iter_tensors(input)
numerical_t = iter_tensors(numerical_d_x)
for x, d_x in zip(input_t, numerical_t):
x = x.view(-1)
d_x = d_x.view(-1)
for i in range(x.nelement()):
original = x[i]
x[i] = original + eps
fx1 = self._forward_criterion(criterion, input, target)
x[i] = original - eps
fx2 = self._forward_criterion(criterion, input, target)
deriv = (fx1 - fx2) / (2. * eps)
d_x[i] = float(deriv)
x[i] = original
# TODO: check structure
analytical_t = iter_tensors(analytical_d_x)
numerical_t = iter_tensors(numerical_d_x)
self.assertLessEqual(
max(a.add(-1, n).abs().max() for a, n in zip(analytical_t, numerical_t)),
PRECISION
)
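# Generic wrapper around a single test specification. It builds a test name
# from the constructor and 'desc'/'fullname', and lazily resolves each required
# argument from a literal value, a '<name>_fn' factory, or a '<name>_size'
# shape, caching the result so every access sees the same data.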
class TestBase(object):
_required_arg_names = {'constructor_args', 'input'}
def __init__(self, constructor, desc='', reference_fn=None, fullname=None, **kwargs):
self.desc = desc
self.fullname = fullname
self.constructor = constructor
self.reference_fn = reference_fn
for name in self._required_arg_names:
if name not in kwargs and name + '_fn' not in kwargs and name + '_size' not in kwargs:
if name == 'constructor_args':
kwargs['constructor_args'] = tuple()
else:
raise ValueError("{}: Specify {} by a value, a function to generate it, or its size!"
.format(self.get_name(), name))
self._extra_kwargs = kwargs
self._arg_cache = {}
def get_name(self):
if self.fullname is not None:
return 'test_' + self.fullname
test_name = 'test_' + self.constructor.__name__
if self.desc:
test_name += '_' + self.desc
return test_name
def _unpack(self, value):
if isinstance(value, Variable):
return value.data
elif torch.is_tensor(value):
return value
elif is_iterable(value):
return type(value)(self._unpack(v) for v in value)
else:
return value
@property
def constructor_args(self):
return self._get_arg('constructor_args', True)
def _get_arg(self, name, unpack):
assert name in self._required_arg_names
if name not in self._arg_cache:
fn_name = name + '_fn'
size_name = name + '_size'
def convert_tensors_to_vars(args):
def tensor_to_var(t):
return Variable(t) if torch.is_tensor(t) else t
if isinstance(args, tuple):
return tuple(tensor_to_var(x) for x in args)
else:
return tensor_to_var(args)
if name in self._extra_kwargs:
self._arg_cache[name] = convert_tensors_to_vars(self._extra_kwargs[name])
elif fn_name in self._extra_kwargs:
self._arg_cache[name] = convert_tensors_to_vars(self._extra_kwargs[fn_name]())
else:
assert size_name in self._extra_kwargs
def map_tensor_sizes(sizes):
if isinstance(sizes, list):
return [map_tensor_sizes(s) for s in sizes]
elif torch.is_tensor(sizes):
return Variable(sizes.double())
else:
return torch_randn(sizes)
self._arg_cache[name] = map_tensor_sizes(self._extra_kwargs[size_name])
return self._unpack(self._arg_cache[name]) if unpack else self._arg_cache[name]
def _get_input(self, unpack=True):
return self._get_arg('input', unpack)
def __call__(self, test_case):
raise NotImplementedError
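# Runs a single nn.Module test case: checks the forward output against
# 'reference_fn', round-trips the module through torch.save/torch.load,
# exercises non-contiguous inputs, and (when CUDA is available) compares
# CPU and GPU forward, backward, and double-backward results.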
class ModuleTest(TestBase):
def __init__(self, *args, **kwargs):
super(ModuleTest, self).__init__(*args, **kwargs)
self.jacobian_input = kwargs.get('jacobian_input', True)
self.should_test_cuda = kwargs.get('test_cuda', True)
self.should_test_pickle = kwargs.get('pickle', True)
self.check_gradgrad = kwargs.get('check_gradgrad', True)
self.FIXME_no_cuda_gradgrad_comparison = \
kwargs.get('FIXME_no_cuda_gradgrad_comparison', False)
self.precision = kwargs.get('precision', 2e-4)
def __call__(self, test_case):
module = self.constructor(*self.constructor_args)
input = self._get_input()
if self.reference_fn is not None:
out = test_case._forward(module, input)
ref_input = deepcopy(input)
expected_out = self.reference_fn(ref_input, test_case._get_parameters(module)[0])
test_case.assertEqual(out, expected_out)
self.test_noncontig(test_case, module, input)
if self.should_test_pickle:
# TODO: do this with in-memory files as soon as torch.save supports it
with TemporaryFile() as f:
test_case._forward(module, input)
torch.save(module, f)
f.seek(0)
module_copy = torch.load(f)
test_case.assertEqual(test_case._forward(module, input), test_case._forward(module_copy, input))
self._do_test(test_case, module, input)
def noncontiguize(self, obj):
if isinstance(obj, list):
return [self.noncontiguize(o) for o in obj]
tensor = obj.data if isinstance(obj, Variable) else obj
ndim = tensor.dim()
noncontig = torch.stack([tensor.clone().zero_(), tensor], ndim).select(ndim, 1)
assert noncontig.numel() == 1 or not noncontig.is_contiguous()
if isinstance(obj, Variable):
return Variable(noncontig, requires_grad=obj.requires_grad)
return noncontig
def test_noncontig(self, test_case, module, input):
# scalar inputs can't be made non-contiguous, so skip them
if isinstance(input, Variable) and input.dim() == 0:
return
if any(i.dim() == 0 for i in input if isinstance(i, Variable)):
return
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(input)
with freeze_rng_state():
output = test_case._forward(module, input)
grad_output = output
if isinstance(grad_output, Variable):
grad_output = grad_output.data.clone()
else:
grad_output = grad_output.clone()
output = output.clone()
grad_output.normal_()
d_input = deepcopy(test_case._backward(module, input, output, grad_output))
d_param = deepcopy(test_case._get_parameters(module)[1])
nc_input = self.noncontiguize(input)
nc_grad_output = self.noncontiguize(grad_output)
for contig_i, contig_g in product((True, False), repeat=2):
i = input if contig_i else nc_input
go = grad_output if contig_g else nc_grad_output
test_case._zero_grad_parameters(module)
test_case._zero_grad_input(i)
with freeze_rng_state():
try:
out = test_case._forward(module, i)
except Exception:
# Some modules will fail because of non-contiguous inputs, and we're OK with that
continue
grad = test_case._backward(module, i, out, go)
test_case.assertEqual(out, output)
test_case.assertEqual(grad, d_input, 1e-4)
test_case.assertEqual(test_case._get_parameters(module)[1], d_param)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
try:
cpu_input = self._get_input()
type_map = {torch.DoubleTensor: torch.cuda.FloatTensor}
gpu_input = to_gpu(cpu_input, type_map=type_map)
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
cpu_param = test_case._get_parameters(cpu_module)
gpu_param = test_case._get_parameters(gpu_module)
for cpu_p, gpu_p in zip(cpu_param[0], gpu_param[0]):
if isinstance(cpu_p, Variable):
cpu_p = cpu_p.data
if isinstance(gpu_p, Variable):
gpu_p = gpu_p.data
gpu_p.copy_(cpu_p)
test_case._zero_grad_input(cpu_input)
test_case._zero_grad_input(gpu_input)
test_case._zero_grad_parameters(cpu_module)
test_case._zero_grad_parameters(gpu_module)
cpu_output = test_case._forward(cpu_module, cpu_input)
gpu_output = test_case._forward(gpu_module, gpu_input)
test_case.assertEqual(cpu_output, gpu_output, self.precision)
# Run backwards on CPU and GPU and compare results
for i in range(5):
cpu_gradOutput = cpu_output.clone().normal_()
gpu_gradOutput = cpu_gradOutput.type('torch.cuda.FloatTensor')
cpu_gradInput = test_case._backward(cpu_module, cpu_input, cpu_output, cpu_gradOutput)
gpu_gradInput = test_case._backward(gpu_module, gpu_input, gpu_output, gpu_gradOutput)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, self.precision)
for cpu_d_p, gpu_d_p in zip(cpu_param[1], gpu_param[1]):
test_case.assertEqual(cpu_d_p, gpu_d_p, self.precision)
# Run double-backwards on CPU and GPU and compare results
if self.check_gradgrad and not self.FIXME_no_cuda_gradgrad_comparison:
cpu_output = cpu_module(cpu_input)
gpu_output = gpu_module(gpu_input)
cpu_gradOutput = Variable(cpu_output.data.clone().normal_(), requires_grad=True)
gpu_gradOutput = Variable(cpu_gradOutput.data.type_as(gpu_output), requires_grad=True)
cpu_gradInputs = torch.autograd.grad(
cpu_output,
(cpu_input,) + tuple(cpu_module.parameters()),
cpu_gradOutput,
create_graph=True)
gpu_gradInputs = torch.autograd.grad(
gpu_output,
(gpu_input,) + tuple(gpu_module.parameters()),
gpu_gradOutput,
create_graph=True)
for cpu_d_i, gpu_d_i in zip(cpu_gradInputs, gpu_gradInputs):
test_case.assertEqual(cpu_d_i, gpu_d_i, self.precision)
# We mix output into the second backwards computation so that
# torch.autograd.grad doesn't complain that some inputs
# are unreachable (which can happen if you differentiate
# only on the gradient).
cpu_gg = torch.autograd.grad(
cpu_output.sum() + sum(map(lambda x: x.sum(), cpu_gradInputs)),
(cpu_input, cpu_gradOutput) + tuple(cpu_module.parameters()),
retain_graph=True)
gpu_gg = torch.autograd.grad(
gpu_output.sum() + sum(map(lambda x: x.sum(), gpu_gradInputs)),
(gpu_input, gpu_gradOutput) + tuple(gpu_module.parameters()),
retain_graph=True)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, self.precision)
for cpu_d_p, gpu_d_p in zip(cpu_gg, gpu_gg):
test_case.assertEqual(cpu_d_p, gpu_d_p, self.precision)
self.test_noncontig(test_case, gpu_module, gpu_input)
except NotImplementedError:
pass
# TODO: remove this after CUDA scatter_ is implemented
except AttributeError as e:
if len(e.args) == 1 and "'FloatTensor' object has no attribute 'scatter_'" in e.args[0]:
pass
else:
raise
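# Runs a single criterion test case: requires a 'target' argument, checks the
# forward value against 'reference_fn', verifies the criterion's Jacobian
# numerically, and compares CPU and GPU forward/backward results.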
class CriterionTest(TestBase):
_required_arg_names = TestBase._required_arg_names.union({'target'})
def __init__(self, *args, **kwargs):
super(CriterionTest, self).__init__(*args, **kwargs)
self.should_test_cuda = kwargs.get('test_cuda', True)
def _get_target(self):
return self._get_arg('target', True)
def __call__(self, test_case):
module = self.constructor(*self.constructor_args)
input = self._get_input()
# Check that these methods don't raise errors
module.__repr__()
str(module)
target = self._get_target()
if self.reference_fn is not None:
out = test_case._forward_criterion(module, input, target)
expected_out = self.reference_fn(deepcopy(input),
deepcopy(target), module)
test_case.assertEqual(out, expected_out)
test_case.check_criterion_jacobian(module, input, target)
self._do_extra_tests(test_case, module, input, target)
def test_cuda(self, test_case):
if not TEST_CUDA or not self.should_test_cuda:
raise unittest.SkipTest('Excluded from CUDA tests')
try:
cpu_input = self._get_input()
type_map = {
torch.DoubleTensor: torch.cuda.FloatTensor,
}
gpu_input = to_gpu(cpu_input, type_map=type_map)
cpu_target = self._get_target()
gpu_target = to_gpu(cpu_target, type_map=type_map)
cpu_module = self.constructor(*self.constructor_args)
gpu_module = self.constructor(*self.constructor_args).float().cuda()
cpu_output = test_case._forward_criterion(cpu_module, cpu_input, cpu_target)
gpu_output = test_case._forward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_output, gpu_output, 4e-4)
cpu_gradInput = test_case._backward_criterion(cpu_module, cpu_input, cpu_target)
gpu_gradInput = test_case._backward_criterion(gpu_module, gpu_input, gpu_target)
test_case.assertEqual(cpu_gradInput, gpu_gradInput, 4e-4)
except NotImplementedError:
pass
def _do_extra_tests(self, test_case, module, input, target):
pass