test/test_cuda.py - platform/external/pytorch - Git at Google

 import math
 import tempfile
 import re
 import unittest
 from itertools import repeat

 import torch
 import torch.cuda
 import torch.cuda.comm as comm

 from test_torch import TestTorch
 from common import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, IS_WINDOWS

 HAS_CUDA = True
 if not torch.cuda.is_available():
     print('CUDA not available, skipping tests')
     TestCase = object  # noqa: F811
     HAS_CUDA = False

 HAS_MAGMA = HAS_CUDA
 if HAS_CUDA:
     torch.ones(1).cuda()  # has_magma shows up after cuda is initialized
     HAS_MAGMA = torch.cuda.has_magma


 def is_floating(t):
     return type(t) in [torch.FloatTensor, torch.DoubleTensor,
                        torch.cuda.FloatTensor, torch.cuda.DoubleTensor]

 types = [
     torch.FloatTensor,
     torch.DoubleTensor,
     torch.LongTensor,
     torch.IntTensor,
     torch.ShortTensor,
     torch.CharTensor,
     torch.ByteTensor,
 ]

 float_types = [
     torch.FloatTensor,
     torch.DoubleTensor
 ]  # TODO: add half...


 def number(floating, integer, t):
     name = type(t).__name__
     if 'Double' in name or 'Float' in name or 'Half' in name:
         return floating
     else:
         return integer
 # TODO: check HalfTensor

 S = 10
 M = 50


 def make_tensor(t, *sizes):
     return t(*sizes).copy_(torch.randn(*sizes))


 def make_sparse_tensor(t, n, *sizes):
     assert t.is_sparse
     tensor = t()
     i = tensor._indices()
     i = i.new(len(sizes), n).copy_(
         torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0))
     v = tensor._values()
     v = v.new(n).copy_(torch.randn(n))
     return t(i, v, torch.Size(sizes))


 def small_2d(t):
     return make_tensor(t, S, S)


 def small_2d_scaled(t, scale=10):
     return make_tensor(t, S, S).mul(scale)


 def small_2d_oneish(t):
     if is_floating(t):
         return make_tensor(t, S, S).clamp(min=0.99, max=1.01)
     else:
         return t(S, S).fill_(1)


 def small_3d(t):
     return make_tensor(t, S, S, S)


 def medium_1d(t):
     return make_tensor(t, M)


 def medium_2d(t):
     return make_tensor(t, M, M)


 def medium_2d_scaled(t, scale=10):
     return make_tensor(t, M, M).mul(scale)


 def small_3d_ones(t):
     return t(S, S, S).copy_(torch.ones(S, S, S))


 def small_3d_positive(t):
     min_val = 1e-3 if is_floating(t) else 2
     return make_tensor(t, S, S, S).clamp_(min_val, 120)


 def small_3d_unique(t):
     return t(S, S, S).copy_(torch.arange(1, S * S * S + 1).view(S, S, S))


 def small_1d_lapack(t):
     return t(1, 3).copy_(torch.arange(1, 4).view(3))


 def small_2d_lapack(t):
     return t(3, 3).copy_(torch.arange(1, 10).view(3, 3))


 def small_2d_lapack_skinny(t):
     return t(3, 4).copy_(torch.arange(1, 13).view(3, 4))


 def small_2d_lapack_fat(t):
     return t(4, 3).copy_(torch.arange(1, 13).view(4, 3))


 def large_2d_lapack(t):
     return t(1000, 1000).normal_()


 def long_type(t):
     return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor


 def new_t(*sizes):
     def tmp(t):
         return t(*sizes).copy_(torch.randn(*sizes))
     return tmp

 tests = [
     ('add', small_3d, lambda t: [number(3.14, 3, t)]),
     ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'),
     ('sub', small_3d, lambda t: [number(3.14, 3, t)],),
     ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('mul', small_3d, lambda t: [number(3.14, 3, t)],),
     ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('div', small_3d, lambda t: [number(3.14, 3, t)],),
     ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types),
     ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1', float_types),
     ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2', float_types),
     ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3', float_types),
     ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types),
     ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types),
     ('pow', small_3d, lambda t: [small_3d(t).abs_()], 'tensor', float_types),
     ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types),
     ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
     ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
     ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],),
     ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
     ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'),
     ('addcdiv', small_2d_lapack, lambda t: [small_2d_lapack(t).mul(2), small_2d_lapack(t)],),
     ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t),
                                             small_2d_lapack(t).mul(2), small_2d_lapack(t)], 'scalar'),
     ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)],),
     ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'),
     ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)],),
     ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'),
     ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'),
     ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],),
     ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'),
     ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'),
     ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)],),
     ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'),
     ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'),
     ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]),
     ('fmod', small_3d, lambda t: [3], 'value'),
     ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('chunk', medium_2d, lambda t: [4],),
     ('chunk', medium_2d, lambda t: [4, 1], 'dim'),
     ('chunk', medium_2d, lambda t: [4, -2], 'neg_dim'),
     ('clamp', medium_2d_scaled, lambda t: [-1, 5],),
     ('clone', medium_2d, lambda t: [],),
     ('contiguous', medium_2d, lambda t: [],),
     ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],),
     ('cumprod', small_3d, lambda t: [1],),
     ('cumprod', small_3d, lambda t: [-1], 'neg_dim'),
     ('cumsum', small_3d, lambda t: [1],),
     ('cumsum', small_3d, lambda t: [-1], 'neg_dim'),
     ('dim', small_3d, lambda t: [],),
     ('dist', small_2d, lambda t: [small_2d(t)],),
     ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'),
     ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'),
     ('dot', medium_1d, lambda t: [medium_1d(t)],),
     ('element_size', medium_1d, lambda t: [],),
     ('eq', small_3d_ones, lambda t: [small_3d(t)],),
     ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
     ('ne', small_3d_ones, lambda t: [small_3d(t)],),
     ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
     ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'),
     ('equal', small_3d_ones, lambda t: [small_3d(t)],),
     ('expand', new_t(M, 1, M), lambda t: [M, 4, M],),
     ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],),
     ('fill', medium_2d, lambda t: [number(3.14, 3, t)],),
     ('ge', medium_2d, lambda t: [medium_2d(t)],),
     ('le', medium_2d, lambda t: [medium_2d(t)],),
     ('gt', medium_2d, lambda t: [medium_2d(t)],),
     ('lt', medium_2d, lambda t: [medium_2d(t)],),
     ('is_contiguous', medium_2d, lambda t: [],),
     # TODO: can't check negative case - GPU copy will be contiguous
     ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'),
     ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'),
     ('is_set_to', medium_2d, lambda t: [medium_2d(t)],),
     # TODO: positive case
     ('kthvalue', small_3d_unique, lambda t: [3],),
     ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'),
     ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'),
     ('lerp', small_3d, lambda t: [small_3d(t), 0.3],),
     ('max', small_3d_unique, lambda t: [],),
     ('max', small_3d_unique, lambda t: [1], 'dim'),
     ('max', small_3d_unique, lambda t: [-1], 'neg_dim'),
     ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
     ('min', small_3d_unique, lambda t: [],),
     ('min', small_3d_unique, lambda t: [1], 'dim'),
     ('min', small_3d_unique, lambda t: [-1], 'neg_dim'),
     ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'),
     ('mean', small_3d, lambda t: [],),
     ('mean', small_3d, lambda t: [-1], 'neg_dim'),
     ('mean', small_3d, lambda t: [1], 'dim'),
     ('mode', small_3d, lambda t: [],),
     ('mode', small_3d, lambda t: [1], 'dim'),
     ('mode', small_3d, lambda t: [-1], 'neg_dim'),
     ('remainder', small_3d, lambda t: [3], 'value'),
     ('remainder', small_3d, lambda t: [-3], 'negative_value'),
     ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'),
     ('remainder', small_3d, lambda t: [0 - small_3d_positive(t)], 'negative_tensor'),
     ('std', small_3d, lambda t: [],),
     ('std', small_3d, lambda t: [1], 'dim'),
     ('std', small_3d, lambda t: [-1], 'neg_dim'),
     ('var', small_3d, lambda t: [],),
     ('var', small_3d, lambda t: [1], 'dim'),
     ('var', small_3d, lambda t: [-1], 'neg_dim'),
     ('ndimension', small_3d, lambda t: [],),
     ('nelement', small_3d, lambda t: [],),
     ('numel', small_3d, lambda t: [],),
     ('narrow', small_3d, lambda t: [1, 3, 2],),
     ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'),
     ('nonzero', small_3d, lambda t: [],),
     ('norm', small_3d, lambda t: [],),
     ('norm', small_3d, lambda t: [3], '3_norm'),
     ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'),
     ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'),
     ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],),
     ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],),
     ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])],),
     ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'),
     ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'),
     ('prod', small_2d_oneish, lambda t: [],),
     ('prod', small_3d, lambda t: [1], 'dim'),
     ('prod', small_3d, lambda t: [-1], 'neg_dim'),
     ('sum', small_2d, lambda t: [],),
     ('sum', small_3d, lambda t: [1], 'dim'),
     ('sum', small_3d, lambda t: [-1], 'neg_dim'),
     ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'),
     ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'),
     ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'),
     ('repeat', small_2d, lambda t: [2, 2, 2],),
     ('size', new_t(1, 2, 3, 4), lambda t: [],),
     ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'),
     ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'),
     ('sort', small_3d_unique, lambda t: [],),
     ('sort', small_3d_unique, lambda t: [1], 'dim'),
     ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'),
     ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'),
     ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'),
     ('split', small_3d, lambda t: [2],),
     ('split', small_3d, lambda t: [2, 1], 'dim'),
     ('split', small_3d, lambda t: [2, -3], 'neg_dim'),
     ('squeeze', new_t(1, 2, 1, 4), lambda t: [],),
     ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'),
     ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'),
     ('t', new_t(1, 2), lambda t: [],),
     ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])],),
     ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],),
     ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'),
     ('to_list', small_3d, lambda t: [],),
     ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort'),
     ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort'),
     ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort'),
     ('trace', medium_2d, lambda t: [],),
     ('tril', medium_2d, lambda t: [],),
     ('tril', medium_2d, lambda t: [2], 'positive'),
     ('tril', medium_2d, lambda t: [-2], 'negative'),
     ('triu', medium_2d, lambda t: [],),
     ('triu', medium_2d, lambda t: [2], 'positive'),
     ('triu', medium_2d, lambda t: [-2], 'negative'),
     ('unsqueeze', new_t(2, 3, 4), lambda t: [2],),
     ('unsqueeze', new_t(2, 3, 4), lambda t: [-2], 'neg_dim'),
     ('view', small_3d, lambda t: [100, 10], 'contiguous'),
     ('view_as', small_3d, lambda t: [t(100, 10)],),
     ('zero', small_3d, lambda t: [],),
     ('zeros', small_3d, lambda t: [1, 2, 3, 4],),
     ('eye', small_2d, lambda t: [3, 4],),
     ('rsqrt', lambda t: small_3d(t) + 1, lambda t: [], None, float_types),
     ('sinh', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
     ('tan', lambda t: small_3d(t).clamp(-1, 1), lambda t: [], None, float_types),
     # lapack tests
     ('qr', small_2d_lapack, lambda t: [], 'square', float_types),
     ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types),
     ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types),
     ('qr', large_2d_lapack, lambda t: [], 'big', float_types),
     ('inverse', new_t(20, 20), lambda t: [], None, float_types),
     ('geqrf', new_t(20, 20), lambda t: [], None, float_types),
     # TODO: add det to here once Variable and Tensor are the same thing
 ]

 # TODO: random functions, cat, gather, scatter, index*, masked*,
 #       resize, resizeAs, storage_offset, storage, stride, unfold

 custom_precision = {
     'addbmm': 1e-4,
     'addmm': 1e-4,
     'addmv': 1e-4,
     'addr': 1e-4,
     'baddbmm': 1e-4,
     'rsqrt': 1e-4,
     'cumprod': 1e-4,
     'qr': 3e-4,
     'digamma': 1e0,  # large values lead to large absolute error but small relative error
 }

 simple_pointwise = [
     'abs',
     'sign',
 ]
 for fn in simple_pointwise:
     tests.append((fn, small_3d, lambda t: []))

 simple_pointwise_float = [
     'log',
     'log1p',
     'sigmoid',
     'sin',
     'sqrt',
     'tanh',
     'acos',
     'asin',
     'atan',
     'cos',
     'cosh',
     'erf',
     'erfinv',
     'exp',
     'expm1',
     'reciprocal',
     'floor',
     'frac',
     'neg',
     'round',
     'trunc',
     'ceil',
     'lgamma',
     'digamma',
     'trigamma',
 ]

 for fn in simple_pointwise_float:
     tests.append((fn, small_3d, lambda t: [], None, float_types))

 _cycles_per_ms = None


 def get_cycles_per_ms():
     """Approximate number of cycles per millisecond for torch.cuda._sleep"""
     global _cycles_per_ms
     if _cycles_per_ms is None:
         start = torch.cuda.Event(enable_timing=True)
         end = torch.cuda.Event(enable_timing=True)
         start.record()
         torch.cuda._sleep(1000000)
         end.record()
         end.synchronize()
         _cycles_per_ms = 1000000 / start.elapsed_time(end)
     return _cycles_per_ms


 def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5, force_gpu_half=False):
     def tmp(self):
         cpu_tensor = tensor_constructor(t)
         type_map = {}
         if force_gpu_half:
             type_map = {
                 'torch.FloatTensor': 'torch.cuda.HalfTensor',
                 'torch.DoubleTensor': 'torch.cuda.HalfTensor',
             }
         gpu_tensor = to_gpu(cpu_tensor, type_map)
         cpu_args = arg_constructor(t)
         gpu_args = [to_gpu(arg, type_map) for arg in cpu_args]
         cpu_result = getattr(cpu_tensor, fn)(*cpu_args)
         try:
             gpu_result = getattr(gpu_tensor, fn)(*gpu_args)
         except RuntimeError as e:
             reason = e.args[0]
             if 'only supports floating-point types' in reason or 'unimplemented data type' in reason:
                 raise unittest.SkipTest('unimplemented data type')
             raise
         except AttributeError as e:
             reason = e.args[0]
             if 'object has no attribute' in reason:
                 raise unittest.SkipTest('unimplemented data type')
             raise
         # If one changes, another should change as well
         self.assertEqual(cpu_tensor, gpu_tensor, precision)
         self.assertEqual(cpu_args, gpu_args, precision)
         # Compare results
         self.assertEqual(cpu_result, gpu_result, precision)
     return tmp


 class TestCuda(TestCase):

     @staticmethod
     def _test_memory_stats_generator(self, device=None, N=35):
         if device is None:
             device = torch.cuda.current_device()

         m0 = torch.cuda.memory_allocated(device)
         last_m_arr = [torch.cuda.memory_allocated(device)]
         max_m_arr = [torch.cuda.max_memory_allocated(device)]
         last_c_arr = [torch.cuda.memory_cached(device)]
         max_c_arr = [torch.cuda.max_memory_cached(device)]

         def alloc(*size):
             with torch.cuda.device(device):
                 # NOTE: do **not** use methods that can have additional
                 #       memory overhead, e.g., inplace random sampling methods.
                 #       they can leave some memory occupied even after being
                 #       deallocated, e.g., initialized RNG state, causing some
                 #       memory checks below to fail.
                 return torch.cuda.FloatTensor(*size)

         def assert_change(comp=1, empty_cache=False):
             # comp > 0: increased
             # comp = 0: equal
             # comp < 0: decreased
             new_m = torch.cuda.memory_allocated(device)
             new_max_m = torch.cuda.max_memory_allocated(device)
             if comp > 0:
                 self.assertGreater(new_m, last_m_arr[0])
             elif comp < 0:
                 self.assertLess(new_m, last_m_arr[0])
             else:
                 self.assertEqual(new_m, last_m_arr[0])
             self.assertLessEqual(new_m, new_max_m)
             self.assertGreaterEqual(new_max_m, max_m_arr[0])
             last_m_arr[0] = new_m
             max_m_arr[0] = new_max_m

             new_c = torch.cuda.memory_cached(device)
             new_max_c = torch.cuda.max_memory_cached(device)
             # emptying cache may happen (due to allocation or empty_cache), so
             # we can't assert new_c >= last_c
             self.assertLessEqual(new_c, new_max_c)
             self.assertGreaterEqual(new_max_c, max_c_arr[0])
             last_c_arr[0] = new_c
             max_c_arr[0] = new_max_c

             if empty_cache:
                 torch.cuda.empty_cache()
                 new_c = torch.cuda.memory_cached(device)
                 new_max_c = torch.cuda.max_memory_cached(device)
                 self.assertLessEqual(new_c, last_c_arr[0])
                 self.assertLessEqual(new_c, new_max_c)
                 self.assertEqual(new_max_c, max_c_arr[0])
                 last_c_arr[0] = new_c

         assert_change(0)
         assert_change(0)
         yield

         tensors1 = [alloc(1), alloc(10, 20), alloc(200, 300, 2000)]
         m1 = torch.cuda.memory_allocated(device)
         assert_change(1)
         yield

         tensors2 = []

         for i in range(1, int(N / 2) + 1):
             # small ones
             tensors2.append(alloc(i, i * 4))
             assert_change(1)
             yield

         for i in range(5, int(N / 2) + 5):
             # large ones
             tensors2.append(alloc(i, i * 7, i * 9, i * 11))
             assert_change(1)
             yield

         tensors2.append(alloc(0, 0, 0))
         assert_change(0)
         yield

         permute = []
         for i in torch.randperm(len(tensors2)):
             permute.append(tensors2[i])
             assert_change(0)
             yield

         del tensors2
         assert_change(0)
         yield
         tensors2 = permute
         assert_change(0)
         yield
         del permute
         assert_change(0)
         yield

         for i in range(int(N / 2)):
             x = tensors2[i].numel()
             del tensors2[i]
             assert_change(-x)  # in case that tensors2[i] is empty
             yield

         for i in range(2, int(2 * N / 3) + 2):
             tensors2.append(alloc(i, i * 3, i * 8))
             assert_change(1)
             yield

         del tensors2
         assert_change(-1)
         assert_change(0)
         self.assertEqual(torch.cuda.memory_allocated(device), m1)
         yield True

         del tensors1
         assert_change(-1)
         self.assertEqual(torch.cuda.memory_allocated(device), m0)

         # test empty_cache
         assert_change(0, empty_cache=True)

     def test_memory_stats(self):
         torch.cuda.empty_cache()
         for _ in self._test_memory_stats_generator(self):
             pass

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_memory_stats_multigpu(self):
         # advance a generator with a end flag
         def advance(gen, end):
             if not end:
                 try:
                     next(gen)
                 except StopIteration:
                     end = True
             return end

         # interlace
         torch.cuda.empty_cache()
         gen0 = self._test_memory_stats_generator(self, device=0, N=35)
         gen1 = self._test_memory_stats_generator(self, device=1, N=35)
         end0 = end1 = False
         while not (end0 and end1):
             end0 = advance(gen0, end0)
             end1 = advance(gen1, end1)

         # semi-random order
         torch.cuda.empty_cache()
         gen0 = self._test_memory_stats_generator(self, device=0, N=35)
         gen1 = self._test_memory_stats_generator(self, device=1, N=35)
         end0 = end1 = False

         while not (end0 and end1):
             end0 = advance(gen0, end0)
             if not end0:
                 gen1_max_times = torch.LongTensor(1).random_(0, 3)[0]
             else:
                 gen1_max_times = float('inf')
             t = 0
             while t < gen1_max_times and not end1:
                 end1 = advance(gen1, end1)
                 t += 1

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def _test_autogpu(self, TensorCtor):
         x = TensorCtor().cuda()
         y = TensorCtor().cuda()
         self.assertEqual(x.get_device(), 0)
         self.assertEqual(x.get_device(), 0)
         with torch.cuda.device(1):
             z = TensorCtor().cuda()
             self.assertEqual(z.get_device(), 1)
             q = x.add(y)
             self.assertEqual(q.get_device(), 0)
             w = TensorCtor().cuda()
             self.assertEqual(w.get_device(), 1)
             self.assertEqual(y.cuda().get_device(), 1)
             self.assertEqual(y.cuda(-1).get_device(), 1)
         z = z.cuda()
         self.assertEqual(z.get_device(), 0)

     def test_autogpu(self):
         # TODO: clean-up and merge with above code after Variable and Tensor
         # are merged
         self._test_autogpu(lambda: torch.randn(5, 5))
         self._test_autogpu(lambda: torch.autograd.Variable(torch.randn(5, 5)))

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_new(self):
         x = torch.autograd.Variable(torch.randn(3, 3).cuda())
         self.assertEqual(x.new([0, 1, 2]).get_device(), 0)
         self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1)
         with torch.cuda.device(1):
             self.assertEqual(x.new([0, 1, 2]).get_device(), 0)
             self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_copy_device(self):
         x = torch.randn(5, 5).cuda()
         with torch.cuda.device(1):
             y = x.cuda()
             self.assertEqual(y.get_device(), 1)
             self.assertIs(y.cuda(), y)
             z = y.cuda(0)
             self.assertEqual(z.get_device(), 0)
             self.assertIs(z.cuda(0), z)

         x = torch.randn(5, 5)
         with torch.cuda.device(1):
             y = x.cuda()
             self.assertEqual(y.get_device(), 1)
             self.assertIs(y.cuda(), y)
             z = y.cuda(0)
             self.assertEqual(z.get_device(), 0)
             self.assertIs(z.cuda(0), z)

     def test_serialization_array_with_storage(self):
         x = torch.randn(5, 5).cuda()
         y = torch.IntTensor(2, 5).fill_(0).cuda()
         q = [x, y, x, y.storage()]
         with tempfile.NamedTemporaryFile() as f:
             torch.save(q, f)
             f.seek(0)
             q_copy = torch.load(f)
         self.assertEqual(q_copy, q, 0)
         q_copy[0].fill_(5)
         self.assertEqual(q_copy[0], q_copy[2], 0)
         self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor))
         self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor))
         self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor))
         self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage))
         q_copy[1].fill_(10)
         self.assertTrue(q_copy[3], torch.cuda.IntStorage(10).fill_(10))

     def test_type_conversions(self):
         x = torch.randn(5, 5)
         self.assertIs(type(x.float()), torch.FloatTensor)
         self.assertIs(type(x.cuda()), torch.cuda.DoubleTensor)
         self.assertIs(type(x.cuda().float()), torch.cuda.FloatTensor)
         self.assertIs(type(x.cuda().float().cpu()), torch.FloatTensor)
         self.assertIs(type(x.cuda().float().cpu().int()), torch.IntTensor)

         y = x.storage()
         self.assertIs(type(y.float()), torch.FloatStorage)
         self.assertIs(type(y.cuda()), torch.cuda.DoubleStorage)
         self.assertIs(type(y.cuda().float()), torch.cuda.FloatStorage)
         self.assertIs(type(y.cuda().float().cpu()), torch.FloatStorage)
         self.assertIs(type(y.cuda().float().cpu().int()), torch.IntStorage)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_type_conversions_same_gpu(self):
         x = torch.randn(5, 5).cuda(1)
         self.assertEqual(x.int().get_device(), 1)

     def test_neg(self):
         TestTorch._test_neg(self, lambda t: t.cuda())

     def _test_broadcast(self, input):
         if torch.cuda.device_count() < 2:
             raise unittest.SkipTest("only one GPU detected")
         result = comm.broadcast(input, (0, 1))
         for i, t in enumerate(result):
             self.assertEqual(t.get_device(), i)
             self.assertEqual(t, input)

     def test_broadcast_cpu(self):
         self._test_broadcast(torch.randn(5, 5))

     def test_broadcast_gpu(self):
         self._test_broadcast(torch.randn(5, 5))

     @staticmethod
     def _test_broadcast_coalesced(self, tensors, buffer_size):
         b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors]
         for (_, bt), t in zip(b_tensors, tensors):
             self.assertEqual(bt.get_device(), 1)
             self.assertEqual(bt, t)
             self.assertIsInstance(bt, type(t))

         bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=buffer_size)
         bc_tensors_t = list(zip(*bc_tensors))
         self.assertEqual(b_tensors, bc_tensors_t)
         for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t):
             self.assertEqual(bt.get_device(), bct.get_device())
             self.assertIsInstance(bct, type(bt))

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_broadcast_coalesced(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3),
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3),
             make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3),
             make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
             make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7),
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
         self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_broadcast_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
         self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_reduce_add(self):
         x = torch.randn(5, 5)
         y = torch.randn(5, 5)
         x_cuda = x.cuda(0)
         y_cuda = y.cuda(1)
         result = comm.reduce_add((x_cuda, y_cuda))
         self.assertEqual(result.get_device(), 0)
         self.assertEqual(result.cpu(), x + y)

     @staticmethod
     def _test_reduce_add_coalesced(self, tensors, buffer_size):
         dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]

         r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
         for r, t in zip(r_tensors, tensors):
             self.assertEqual(r.get_device(), t.get_device())
             self.assertEqual(r, t * 2)
             self.assertIsInstance(r, type(t))

         rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size)
         self.assertEqual(r_tensors, rc_tensors)
         for r, rc in zip(r_tensors, rc_tensors):
             self.assertEqual(rc.get_device(), r.get_device())
             self.assertIsInstance(rc, type(r))

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_reduce_add_coalesced(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3),
             make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3),
             make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3),
             make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
             make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7),
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
         self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_reduce_add_coalesced_dense_only(self):
         numel = 5
         num_bytes = numel * 8
         tensors = [
             torch.randn(numel).long().cuda(),
             torch.randn(numel).cuda(),
             torch.randn(numel).long().cuda(),
             torch.randn(numel).long().cuda(),
             torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
             torch.randn(numel).cuda(),
         ]
         self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2)

     def _test_scatter(self, input, chunk_sizes=None, dim=0):
         if torch.cuda.device_count() < 2:
             raise unittest.SkipTest("only one GPU detected")
         result = comm.scatter(input, (0, 1), chunk_sizes, dim)
         self.assertEqual(len(result), 2)
         if chunk_sizes is None:
             chunk_sizes = tuple(repeat(input.size(dim) // 2, 2))
         chunk_start = 0
         for i, r in enumerate(result):
             chunk_end = chunk_start + chunk_sizes[i]
             index = [slice(None, None), slice(None, None)]
             index[dim] = slice(chunk_start, chunk_end)
             self.assertEqual(r, input[tuple(index)], 0)
             chunk_start = chunk_end

     def test_scatter_cpu(self):
         self._test_scatter(torch.randn(4, 4), dim=0)

     def test_scatter_cpu_dim(self):
         self._test_scatter(torch.randn(4, 4), dim=1)

     def test_scatter_cpu_neg_dim(self):
         self._test_scatter(torch.randn(4, 4), dim=-2)

     def test_scatter_cpu_sizes(self):
         self._test_scatter(torch.randn(6, 4), chunk_sizes=(2, 4))

     def test_scatter_gpu(self):
         self._test_scatter(torch.randn(4, 4).cuda(), dim=0)

     def test_scatter_gpu_dim(self):
         self._test_scatter(torch.randn(4, 4).cuda(), dim=1)

     def test_scatter_gpu_neg_dim(self):
         self._test_scatter(torch.randn(4, 4).cuda(), dim=-2)

     def test_scatter_gpu_sizes(self):
         self._test_scatter(torch.randn(6, 4).cuda(), chunk_sizes=(2, 4))

     def _test_gather(self, dim):
         if torch.cuda.device_count() < 2:
             raise unittest.SkipTest("only one GPU detected")
         x = torch.randn(2, 5).cuda(0)
         y = torch.randn(2, 5).cuda(1)
         result = comm.gather((x, y), dim)

         expected_size = list(x.size())
         expected_size[dim] += y.size(dim)
         expected_size = torch.Size(expected_size)
         self.assertEqual(result.get_device(), 0)
         self.assertEqual(result.size(), expected_size)

         index = [slice(None, None), slice(None, None)]
         index[dim] = slice(0, x.size(dim))
         self.assertEqual(result[tuple(index)], x)
         index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim))
         self.assertEqual(result[tuple(index)], y)

     def test_gather(self):
         self._test_gather(0)

     def test_gather_dim(self):
         self._test_gather(1)

     def test_from_sequence(self):
         seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)]
         reference = torch.arange(0, 20).resize_(5, 4)
         for t in types:
             cuda_type = get_gpu_type(t)
             self.assertEqual(cuda_type(seq), reference)

     def test_torch_manual_seed_seeds_cuda_devices(self):
         with freeze_rng_state():
             x = torch.zeros(4, 4).float().cuda()
             torch.manual_seed(2)
             self.assertEqual(torch.cuda.initial_seed(), 2)
             x.uniform_()
             torch.manual_seed(2)
             y = x.clone().uniform_()
             self.assertEqual(x, y)
             self.assertEqual(torch.cuda.initial_seed(), 2)

     def test_manual_seed(self):
         with freeze_rng_state():
             x = torch.zeros(4, 4).float().cuda()
             torch.cuda.manual_seed(2)
             self.assertEqual(torch.cuda.initial_seed(), 2)
             x.uniform_()
             torch.cuda.manual_seed(2)
             y = x.clone().uniform_()
             self.assertEqual(x, y)
             self.assertEqual(torch.cuda.initial_seed(), 2)

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_cat_autogpu(self):
         x = torch.randn(4, 4).cuda(1)
         y = torch.randn(4, 4).cuda(1)
         z = torch.cat([x, y], 0)
         self.assertEqual(z.get_device(), x.get_device())

     def test_cat(self):
         SIZE = 10
         for dim in range(-3, 3):
             pos_dim = dim if dim >= 0 else 3 + dim
             x = torch.rand(13, SIZE, SIZE).transpose(0, pos_dim).cuda()
             y = torch.rand(17, SIZE, SIZE).transpose(0, pos_dim).cuda()
             z = torch.rand(19, SIZE, SIZE).transpose(0, pos_dim).cuda()

             res1 = torch.cat((x, y, z), dim)
             self.assertEqual(res1.narrow(pos_dim, 0, 13), x, 0)
             self.assertEqual(res1.narrow(pos_dim, 13, 17), y, 0)
             self.assertEqual(res1.narrow(pos_dim, 30, 19), z, 0)

         x = torch.randn(20, SIZE, SIZE).cuda()
         self.assertEqual(torch.cat(torch.split(x, 7)), x)
         self.assertEqual(torch.cat(torch.chunk(x, 7)), x)

         y = torch.randn(1, SIZE, SIZE).cuda()
         z = torch.cat([x, y])
         self.assertEqual(z.size(), (21, SIZE, SIZE))

     def test_cat_bad_input_sizes(self):
         x = torch.randn(2, 1).cuda()
         y = torch.randn(2, 1, 1).cuda()
         z = torch.randn(2, 1, 1).cuda()
         self.assertRaises(RuntimeError, lambda: torch.cat([x, y, z]))

         x = torch.randn(2, 1, 2).cuda()
         y = torch.randn(2, 1, 1).cuda()
         z = torch.randn(2, 2, 1).cuda()
         self.assertRaises(RuntimeError, lambda: torch.cat([x, y, z], dim=1))

     def test_serialization(self):
         x = torch.randn(4, 4).cuda()
         with tempfile.NamedTemporaryFile() as f:
             torch.save(x, f)
             f.seek(0)
             x_copy = torch.load(f)
         self.assertEqual(x_copy, x)
         self.assertIs(type(x_copy), type(x))
         self.assertEqual(x_copy.get_device(), x.get_device())

     def test_serialization_array_with_empty(self):
         x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()]
         with tempfile.NamedTemporaryFile() as f:
             torch.save(x, f)
             f.seek(0)
             x_copy = torch.load(f)
         for original, copy in zip(x, x_copy):
             self.assertEqual(copy, original)
             self.assertIs(type(copy), type(original))
             self.assertEqual(copy.get_device(), original.get_device())

     @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
     def test_multigpu_serialization(self):
         x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
         with tempfile.NamedTemporaryFile() as f:
             torch.save(x, f)
             f.seek(0)
             x_copy = torch.load(f)
         for original, copy in zip(x, x_copy):
             self.assertEqual(copy, original)
             self.assertIs(type(copy), type(original))
             self.assertEqual(copy.get_device(), original.get_device())

     @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
     def test_multigpu_serialization_remap(self):
         x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]

         def gpu_remap(storage, location):
             if location == 'cuda:1':
                 return storage.cuda(0)

         with tempfile.NamedTemporaryFile() as f:
             torch.save(x, f)
             f.seek(0)
             x_copy = torch.load(f, map_location=gpu_remap)

         for original, copy in zip(x, x_copy):
             self.assertEqual(copy, original)
             self.assertIs(type(copy), type(original))
             self.assertEqual(copy.get_device(), 0)

     @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
     def test_multigpu_serialization_remap_dict(self):
         x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
         with tempfile.NamedTemporaryFile() as f:
             torch.save(x, f)
             f.seek(0)
             x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'})
         for original, copy in zip(x, x_copy):
             self.assertEqual(copy, original)
             self.assertIs(type(copy), type(original))
             self.assertEqual(copy.get_device(), 0)

     @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
     def test_cuda_set_device(self):
         x = torch.randn(5, 5)
         with torch.cuda.device(1):
             self.assertEqual(x.cuda().get_device(), 1)
             torch.cuda.set_device(0)
             self.assertEqual(x.cuda().get_device(), 0)
             with torch.cuda.device(1):
                 self.assertEqual(x.cuda().get_device(), 1)
             self.assertEqual(x.cuda().get_device(), 0)
             torch.cuda.set_device(1)
         self.assertEqual(x.cuda().get_device(), 0)

     def test_is_tensor(self):
         for t in types:
             tensor = get_gpu_type(t)()
             self.assertTrue(torch.is_tensor(tensor))
         self.assertTrue(torch.is_tensor(torch.cuda.HalfTensor()))

     def test_cuda_synchronize(self):
         torch.cuda.synchronize()

     def test_streams(self):
         default_stream = torch.cuda.current_stream()
         user_stream = torch.cuda.Stream()
         self.assertEqual(torch.cuda.current_stream(), default_stream)
         self.assertNotEqual(default_stream, user_stream)
         self.assertEqual(default_stream.cuda_stream, 0)
         self.assertNotEqual(user_stream.cuda_stream, 0)
         with torch.cuda.stream(user_stream):
             self.assertEqual(torch.cuda.current_stream(), user_stream)
         self.assertTrue(user_stream.query())
         # copy 10 MB tensor from CPU-GPU which should take some time
         tensor1 = torch.ByteTensor(10000000).pin_memory()
         tensor2 = tensor1.cuda(async=True)
         self.assertFalse(default_stream.query())
         default_stream.synchronize()
         self.assertTrue(default_stream.query())

     @unittest.skipIf(torch.cuda.device_count() < 2, "detected only one GPU")
     def test_streams_multi_gpu(self):
         default_stream = torch.cuda.current_stream()
         self.assertEqual(default_stream.device, 0)
         stream = torch.cuda.Stream(device=1)
         self.assertEqual(stream.device, 1)
         with torch.cuda.device(1):
             self.assertEqual(torch.cuda.current_stream().device, 1)
             self.assertNotEqual(torch.cuda.current_stream(), default_stream)

     @unittest.skipIf(torch.cuda.device_count() < 2, "multi-GPU not supported")
     def test_tensor_device(self):
         self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0)
         self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1)
         with torch.cuda.device(1):
             self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 1)
             self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0)
             self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1)

     def test_events(self):
         stream = torch.cuda.current_stream()
         event = torch.cuda.Event(enable_timing=True)
         self.assertTrue(event.query())
         start_event = torch.cuda.Event(enable_timing=True)
         stream.record_event(start_event)
         torch.cuda._sleep(int(50 * get_cycles_per_ms()))
         stream.record_event(event)
         self.assertFalse(event.query())
         event.synchronize()
         self.assertTrue(event.query())
         self.assertGreater(start_event.elapsed_time(event), 0)

     def test_record_stream(self):
         cycles_per_ms = get_cycles_per_ms()

         t = torch.FloatTensor([1, 2, 3, 4]).pin_memory()
         result = torch.cuda.FloatTensor(t.size())
         stream = torch.cuda.Stream()
         ptr = [None]

         # Performs the CPU->GPU copy in a background stream
         def perform_copy():
             with torch.cuda.stream(stream):
                 tmp = t.cuda(async=True)
                 ptr[0] = tmp.data_ptr()
             torch.cuda.current_stream().wait_stream(stream)
             tmp.record_stream(torch.cuda.current_stream())
             torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
             result.copy_(tmp)

         perform_copy()
         with torch.cuda.stream(stream):
             tmp2 = torch.cuda.FloatTensor(t.size())
             tmp2.zero_()
             self.assertNotEqual(tmp2.data_ptr(), ptr[0], 'allocation re-used to soon')

         self.assertEqual(result.tolist(), [1, 2, 3, 4])

         # Check that the block will be re-used after the main stream finishes
         torch.cuda.current_stream().synchronize()
         with torch.cuda.stream(stream):
             tmp3 = torch.cuda.FloatTensor(t.size())
             self.assertEqual(tmp3.data_ptr(), ptr[0], 'allocation not re-used')

     def test_noncontiguous_pinned_memory(self):
         # See issue #3266
         x = torch.arange(0, 10).view((2, 5))
         self.assertEqual(x.t(), x.t().pin_memory())

     def test_caching_pinned_memory(self):
         cycles_per_ms = get_cycles_per_ms()

         # check that allocations are re-used after deletion
         t = torch.FloatTensor([1]).pin_memory()
         ptr = t.data_ptr()
         del t
         t = torch.FloatTensor([1]).pin_memory()
         self.assertEqual(t.data_ptr(), ptr, 'allocation not reused')

         # check that the allocation is not re-used if it's in-use by a copy
         gpu_tensor = torch.cuda.FloatTensor([0])
         torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
         gpu_tensor.copy_(t, async=True)
         del t
         t = torch.FloatTensor([1]).pin_memory()
         self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')
         self.assertEqual(list(gpu_tensor), [1])

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_caching_pinned_memory_multi_gpu(self):
         # checks that the events preventing pinned memory from being re-used
         # too early are recorded on the correct GPU
         cycles_per_ms = get_cycles_per_ms()

         t = torch.FloatTensor([1]).pin_memory()
         ptr = t.data_ptr()
         gpu_tensor0 = torch.cuda.FloatTensor([0], device=0)
         gpu_tensor1 = torch.cuda.FloatTensor([0], device=1)

         with torch.cuda.device(1):
             torch.cuda._sleep(int(50 * cycles_per_ms))  # delay the copy
             gpu_tensor1.copy_(t, async=True)

         del t
         t = torch.FloatTensor([2]).pin_memory()
         self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon')

         with torch.cuda.device(0):
             gpu_tensor0.copy_(t, async=True)

         self.assertEqual(gpu_tensor1[0], 1)
         self.assertEqual(gpu_tensor0[0], 2)

     @staticmethod
     def _select_broadcastable_dims(dims_full=None):
         return TestTorch._select_broadcastable_dims(dims_full)

     @unittest.skipIf(not HAS_MAGMA, "no MAGMA library detected")
     def test_det(self):
         TestTorch._test_det(self, lambda t: t.cuda())

     def test_view(self):
         TestTorch._test_view(self, lambda t: t.cuda())

     def test_stft(self):
         TestTorch._test_stft(self, lambda t: t.cuda())

     def test_broadcast(self):
         TestTorch._test_broadcast(self, lambda t: t.cuda())

     def test_contiguous(self):
         TestTorch._test_contiguous(self, lambda t: t.cuda())

     def test_broadcast_fallback(self):
         TestTorch._test_broadcast_fallback(self, lambda t: t.cuda())

     def test_broadcast_fused_matmul(self):
         TestTorch._test_broadcast_fused_matmul(self, lambda t: t.cuda())

     def test_broadcast_batched_matmul(self):
         TestTorch._test_broadcast_batched_matmul(self, lambda t: t.cuda())

     def test_index(self):
         TestTorch._test_index(self, lambda t: t.cuda())

     def test_advancedindex(self):
         TestTorch._test_advancedindex(self, lambda t: t.cuda())

     def test_advancedindex_big(self):
         TestTorch._test_advancedindex_big(self, lambda t: t.cuda())

     def test_btrifact(self):
         TestTorch._test_btrifact(self, lambda t: t.cuda())

     def test_btrisolve(self):
         TestTorch._test_btrisolve(self, lambda t: t.cuda())

     def test_dim_reduction(self):
         TestTorch._test_dim_reduction(self, lambda t: t.cuda())

     def test_tensor_gather(self):
         TestTorch._test_gather(self, lambda t: t.cuda(), False)

     def test_tensor_scatter(self):
         TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', test_bounds=False)

     def test_tensor_scatterAdd(self):
         TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_add_', test_bounds=False)

     def test_tensor_scatterFill(self):
         TestTorch._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', True, test_bounds=False)

     def test_var(self):
         cpu_tensor = torch.randn(2, 3, 3)
         gpu_tensor = cpu_tensor.cuda()
         self.assertEqual(gpu_tensor.var(), cpu_tensor.var())
         self.assertEqual(gpu_tensor.var(1), cpu_tensor.var(1))
         self.assertEqual(gpu_tensor.var(2), cpu_tensor.var(2))
         self.assertEqual(gpu_tensor.std(), cpu_tensor.std())
         self.assertEqual(gpu_tensor.std(1), cpu_tensor.std(1))
         self.assertEqual(gpu_tensor.var(2), cpu_tensor.var(2))

         cpu_tensor = torch.randn(100)
         gpu_tensor = cpu_tensor.cuda()
         self.assertEqual(gpu_tensor.var(), cpu_tensor.var())

     def test_var_unbiased(self):
         tensor = torch.randn(100).cuda()
         self.assertEqual(tensor.var(0), tensor.var(0, unbiased=True))
         self.assertEqual(tensor.var(), tensor.var(unbiased=True))
         self.assertEqual(tensor.var(unbiased=False), tensor.var(0, unbiased=False)[0])

         tensor = torch.FloatTensor([1.0, 2.0]).cuda()
         self.assertEqual(tensor.var(unbiased=True), 0.5)
         self.assertEqual(tensor.var(unbiased=False), 0.25)

         tensor = torch.randn(100).cuda()
         self.assertEqual(tensor.std(0), tensor.std(0, unbiased=True))
         self.assertEqual(tensor.std(), tensor.std(unbiased=True))
         self.assertEqual(tensor.std(unbiased=False), tensor.std(0, unbiased=False)[0])

     def test_var_large_input(self):
         # Large, not-nice input
         tensor_cpu = torch.randn(2 * 32 * 1024 + 1, 2, 67)
         tensor_cuda = tensor_cpu.cuda()

         self.assertEqual(tensor_cpu.var(2), tensor_cuda.var(2).cpu())

     def test_var_stability(self):
         tensor = torch.FloatTensor([2281.5, 2281.25]).cuda()

         # Stability for inner dim
         self.assertEqual(tensor.var(0)[0], 0.03125)

         # General stability
         self.assertEqual(tensor.var(), 0.03125)

         # Stability for outer dimensions
         tensor = tensor.unsqueeze(1)
         self.assertEqual(tensor.var(0)[0], 0.03125)

     def test_digamma(self):
         def test(use_double=False):
             cpu_tensor = torch.randn(10, 10, 10)
             gpu_tensor = cpu_tensor.cuda()
             zeros = torch.zeros(10, 10, 10)
             if (use_double):
                 cpu_tensor = cpu_tensor.double()
                 gpu_tensor = gpu_tensor.double()
                 zeros = zeros.double()
             cpu_out = cpu_tensor.digamma()
             gpu_out = gpu_tensor.digamma()
             norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out
             self.assertEqual(norm_errors, zeros)

         test(True)
         test(False)

     def test_polygamma(self):
         def test(use_double=False):
             cpu_tensor = torch.randn(10, 10, 10)
             gpu_tensor = cpu_tensor.cuda()
             zeros = torch.zeros(10, 10, 10)
             if (use_double):
                 cpu_tensor = cpu_tensor.double()
                 gpu_tensor = gpu_tensor.double()
                 zeros = zeros.double()
             for n in [0, 1]:
                 cpu_out = cpu_tensor.polygamma(n)
                 gpu_out = gpu_tensor.polygamma(n)
                 norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out
                 self.assertEqual(norm_errors, zeros)

         test(True)
         test(False)

     @unittest.skipIf(not HAS_MAGMA, "no MAGMA library detected")
     def test_symeig(self):
         # Small case
         tensor = torch.randn(3, 3).cuda()
         tensor = torch.mm(tensor, tensor.t())
         eigval, eigvec = torch.symeig(tensor, eigenvectors=True)
         self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t()))

         # Large case
         tensor = torch.randn(257, 257).cuda()
         tensor = torch.mm(tensor, tensor.t())
         eigval, eigvec = torch.symeig(tensor, eigenvectors=True)
         self.assertEqual(tensor, torch.mm(torch.mm(eigvec, eigval.diag()), eigvec.t()))

     def test_arange(self):
         for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']:
             a = torch.cuda.__dict__[t]()
             torch.arange(0, 10, out=a)
             b = torch.__dict__[t]()
             torch.arange(0, 10, out=b)
             self.assertEqual(a, b.cuda())

     @unittest.skipIf(torch.cuda.device_count() < 2, "only one GPU detected")
     def test_get_set_rng_state_all(self):
         states = torch.cuda.get_rng_state_all()
         before0 = torch.cuda.FloatTensor(100, device=0).normal_()
         before1 = torch.cuda.FloatTensor(100, device=1).normal_()
         torch.cuda.set_rng_state_all(states)
         after0 = torch.cuda.FloatTensor(100, device=0).normal_()
         after1 = torch.cuda.FloatTensor(100, device=1).normal_()
         self.assertEqual(before0, after0, 0)
         self.assertEqual(before1, after1, 0)

     def test_nvtx(self):
         # Just making sure we can see the symbols
         torch.cuda.nvtx.range_push("foo")
         torch.cuda.nvtx.mark("bar")
         torch.cuda.nvtx.range_pop()


 if HAS_CUDA:
     for decl in tests:
         for t in types:
             tensor = t()
             gpu_tensor = get_gpu_type(t)()
             if len(decl) == 3:
                 name, constr, arg_constr = decl
                 desc = ''
             elif len(decl) == 4:
                 name, constr, arg_constr, desc = decl
             elif len(decl) == 5:
                 name, constr, arg_constr, desc, type_subset = decl
                 if t not in type_subset:
                     continue

             precision = custom_precision.get(name, TestCuda.precision)
             for inplace in (True, False):
                 if inplace:
                     name_inner = name + '_'
                 else:
                     name_inner = name
                 if not hasattr(tensor, name_inner):
                     continue
                 if not hasattr(gpu_tensor, name_inner):
                     print("Ignoring {}, because it's not implemented by torch.cuda.{}".format(
                         name_inner, gpu_tensor.__class__.__name__))
                     continue

                 test_name = 'test_' + t.__name__ + '_' + name_inner
                 if desc:
                     test_name += '_' + desc

                 assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name
                 setattr(TestCuda,
                         test_name,
                         compare_cpu_gpu(constr, arg_constr, name_inner, t, precision))
                 if t == torch.FloatTensor and not IS_WINDOWS:  # CUDA HalfTensor currently doesn't work on Windows
                     assert not hasattr(TestCuda, test_name + '_gpu_half'), "Duplicated test name: " + test_name
                     setattr(TestCuda,
                             test_name + '_gpu_half',
                             compare_cpu_gpu(constr, arg_constr, name_inner, t,
                                             precision, force_gpu_half=True))


 if __name__ == '__main__':
     run_tests()