| import io |
| import math |
| import tempfile |
| import re |
| import unittest |
| import sys |
| from itertools import repeat |
| import os |
| from contextlib import contextmanager |
| import threading |
| import queue |
| |
| import torch |
| import torch.cuda |
| import torch.cuda.comm as comm |
| from torch import multiprocessing as mp |
| from torch._six import inf, nan |
| |
| from test_torch import _TestTorchMixin |
| |
| from common_methods_invocations import tri_tests_args, tri_large_tests_args, \ |
| run_additional_tri_tests, _compare_trilu_indices, _compare_large_trilu_indices |
| from common_utils import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ |
| PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, TEST_NUMPY, TEST_WITH_ROCM, load_tests, iter_indices |
| |
| # load_tests from common_utils is used to automatically filter tests for |
| # sharding on sandcastle. The re-assignment below silences flake8's unused-import warning. |
| load_tests = load_tests |
| |
| # We cannot import TEST_CUDA and TEST_MULTIGPU from common_cuda here: doing so would |
| # run common_cuda's TEST_CUDNN check again while this test suite executes, which |
| # causes a CUDA OOM error on Windows. |
| TEST_CUDA = torch.cuda.is_available() |
| TEST_MULTIGPU = TEST_CUDA and torch.cuda.device_count() >= 2 |
| |
| if not TEST_CUDA: |
| print('CUDA not available, skipping tests') |
| TestCase = object # noqa: F811 |
| |
| TEST_MAGMA = TEST_CUDA |
| TEST_LARGE_TENSOR = TEST_CUDA |
| if TEST_CUDA: |
| torch.ones(1).cuda() # initialize CUDA; torch.cuda.has_magma is only populated afterwards |
| TEST_MAGMA = torch.cuda.has_magma |
| TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 9e9 |
| |
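| # Tensor types treated as floating point by the helpers below (CPU and CUDA, including half). |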
| floating_set = {torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor, |
| torch.cuda.DoubleTensor, torch.HalfTensor, torch.cuda.HalfTensor} |
| |
| |
| def is_floating(t): |
| if not isinstance(t, type): |
| raise TypeError('t should be an instance of type') |
| assert t != torch.autograd.Variable |
| return t in floating_set |
| |
| |
| def is_half(t): |
| if isinstance(t, torch.Tensor): |
| return t.dtype == torch.float16 |
| assert isinstance(t, type) |
| assert t != torch.autograd.Variable |
| return t in [torch.HalfTensor, torch.cuda.HalfTensor] |
| |
| |
| types = [ |
| torch.FloatTensor, |
| torch.DoubleTensor, |
| torch.LongTensor, |
| torch.IntTensor, |
| torch.ShortTensor, |
| torch.CharTensor, |
| torch.ByteTensor, |
| torch.HalfTensor, |
| ] |
| |
| signed_types = [ |
| torch.FloatTensor, |
| torch.DoubleTensor, |
| torch.LongTensor, |
| torch.IntTensor, |
| torch.ShortTensor, |
| torch.CharTensor, |
| ] |
| |
| unsigned_types = [ |
| torch.ByteTensor, |
| ] |
| |
| float_types = [ |
| torch.FloatTensor, |
| torch.DoubleTensor, |
| torch.HalfTensor, |
| ] |
| |
| float_types_no_half = [ |
| torch.FloatTensor, |
| torch.DoubleTensor, |
| ] |
| |
| |
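| # Pick the floating-point constant for floating-point tensor types and the integer constant otherwise. |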
| def number(floating, integer, t): |
| return floating if is_floating(t) else integer |
| |
| |
| def cast_tensor(tensor, t): |
| return t(tensor.size()).copy_(tensor) |
| |
| S = 10 |
| M = 50 |
| G = 275000000 |
| |
| |
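| # Build a random tensor of the requested type and size. Half tensors are filled by copying |
| # from a float torch.randn result rather than by sampling in place. |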
| def make_tensor(t, *sizes): |
| if 'Half' in t.__name__: |
| return t(*sizes).copy_(torch.randn(*sizes)) |
| else: |
| tensor = t(*sizes) |
| if tensor.is_floating_point(): |
| return tensor.normal_() |
| else: |
| return tensor.random_(0, 10) |
| |
| |
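| # Construct a sparse tensor of the given type with n values at random indices within sizes. |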
| def make_sparse_tensor(t, n, *sizes): |
| assert t.is_sparse |
| tensor = t() |
| i = tensor._indices() |
| i = i.new(len(sizes), n).copy_( |
| torch.cat([torch.LongTensor(1, n).random_(s) for s in sizes], 0)) |
| v = tensor._values() |
| v = v.new(n).copy_(torch.randn(n)) |
| return t(i, v, torch.Size(sizes)) |
| |
| |
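| # The helpers below compute on a float copy for half tensors and convert the result back to half. |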
| def tensor_clamp(t, min, max): |
| if is_half(t): |
| return t.float().clamp(min, max).half() |
| else: |
| return t.clamp(min, max) |
| |
| |
| def tensor_mul(t, scale): |
| if is_half(t): |
| return t.float().mul(scale).half() |
| else: |
| return t.mul(scale) |
| |
| |
| def tensor_abs_(t): |
| if is_half(t): |
| return t.float().abs_().half() |
| else: |
| return t.abs_() |
| |
| |
| def constant_tensor_sub(a, b): |
| # helper for const - torch.HalfTensor, which fails because HalfTensor lacks |
| # resize_as(); compute in float and convert back |
| if is_half(b): |
| return (a - b.float()).half() |
| else: |
| return a - b |
| |
| |
| def constant_tensor_add(a, b): |
| # helper for const + torch.HalfTensor, which fails because HalfTensor lacks |
| # add(); compute in float and convert back |
| if is_half(b): |
| return (a + b.float()).half() |
| else: |
| return a + b |
| |
| |
| def small_0d(t): |
| return make_tensor(t, (1,)).squeeze() |
| |
| |
| def small_2d(t): |
| return make_tensor(t, S, S) |
| |
| |
| def small_2d_scaled(t, scale=10): |
| return tensor_mul(make_tensor(t, S, S), scale) |
| |
| |
| def small_2d_oneish(t): |
| if is_floating(t): |
| return tensor_clamp(make_tensor(t, S, S), min=0.99, max=1.01) |
| else: |
| return t(S, S).fill_(1) |
| |
| |
| def small_3d(t): |
| return make_tensor(t, S, S, S) |
| |
| |
| def medium_1d(t): |
| return make_tensor(t, M) |
| |
| |
| def medium_2d(t): |
| return make_tensor(t, M, M) |
| |
| |
| def medium_2d_expanded(t): |
| return t(1).expand(M, M) |
| |
| |
| def medium_2d_scaled(t, scale=10): |
| return tensor_mul(make_tensor(t, M, M), scale) |
| |
| |
| def small_3d_ones(t): |
| return t(S, S, S).copy_(torch.ones(S, S, S)) |
| |
| |
| def small_3d_positive(t): |
| # In the div 'tensor' test, half cannot match float precision, so half and integer types are clamped to at least 2 |
| min_val = 1e-3 if is_floating(t) and not is_half(t) else 2 |
| return tensor_clamp(make_tensor(t, S, S, S), min_val, 120) |
| |
| |
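| # All elements are distinct, which keeps sort/topk/kthvalue results deterministic. |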
| def small_3d_unique(t): |
| return t(S, S, S).copy_(torch.arange(1, S * S * S + 1).view(S, S, S)) |
| |
| |
| def small_1d_lapack(t): |
| return t(1, 3).copy_(torch.arange(1, 4).view(3)) |
| |
| |
| def small_2d_lapack(t): |
| return t(3, 3).copy_(torch.arange(1, 10).view(3, 3)) |
| |
| |
| def small_2d_lapack_skinny(t): |
| return t(3, 4).copy_(torch.arange(1, 13).view(3, 4)) |
| |
| |
| def small_2d_lapack_fat(t): |
| return t(4, 3).copy_(torch.arange(1, 13).view(4, 3)) |
| |
| |
| def large_2d_lapack(t): |
| return t(1000, 1000).normal_() |
| |
| |
| def giant_1d_ones(t): |
| return t(G).copy_(torch.ones(G)) |
| |
| |
| def long_type(t): |
| return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor |
| |
| |
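| # Returns a constructor that builds a fresh random tensor of the given fixed size for any tensor type. |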
| def new_t(*sizes): |
| def tmp(t): |
| return t(*sizes).copy_(torch.randn(*sizes)) |
| return tmp |
| |
| # Content of each tuple: |
| # - function name |
| # - constructor for the tensor, signature: fn(tensor_type) -> tensor |
| # - constructor for the arguments, signature: fn(tensor_type) -> list |
| # - postfix name for the test (must be unique for a given function) (default='') |
| # - tensor types to use (default=types) |
| # - disable inplace test: if True, the in-place variant is not tested (default=False) |
| # - decorator, e.g., unittest.skipIf (default is no decorator) |
| tests = [ |
| ('add', small_3d, lambda t: [number(3.14, 3, t)]), |
| ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'), |
| ('sub', small_3d, lambda t: [number(3.14, 3, t)]), |
| ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('mul', small_3d, lambda t: [number(3.14, 3, t)]), |
| ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('mul', small_0d, lambda t: [small_0d(torch.IntTensor)], 'scalar', types, True), |
| ('div', small_3d, lambda t: [number(3.14, 3, t)]), |
| ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), |
| ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'), |
| ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'), |
| ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'), |
| ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), |
| # HalfTensor gives inaccurate results for pow(-2) with data sampled from torch.randn, so half is excluded |
| ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', float_types_no_half, False, |
| "skipIfRocm:FloatTensor"), |
| ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types), |
| ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), |
| ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), |
| ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), |
| ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],), |
| ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), |
| ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), |
| ('bmm', small_3d, lambda t: [small_3d(t)], '', float_types_no_half), |
| ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)]), |
| ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], |
| 'scalar'), |
| ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)]), |
| ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), |
| ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)]), |
| ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'), |
| ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'), |
| ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],), |
| ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'), |
| ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'), |
| ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)]), |
| ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'), |
| ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'), |
| ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]), |
| ('fmod', small_3d, lambda t: [3], 'value',), |
| ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('chunk', medium_2d, lambda t: [4],), |
| ('chunk', medium_2d, lambda t: [4, 1], 'dim'), |
| ('chunk', medium_2d, lambda t: [4, -2], 'neg_dim'), |
| ('clamp', medium_2d_scaled, lambda t: [-1, 5], None, signed_types), |
| ('clamp', medium_2d_scaled, lambda t: [1, 5], None, unsigned_types), |
| ('clone', medium_2d, lambda t: [],), |
| ('contiguous', medium_2d, lambda t: [],), |
| ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],), |
| ('cumprod', small_3d, lambda t: [1]), |
| ('cumprod', small_3d, lambda t: [-1], 'neg_dim'), |
| ('cumsum', small_3d, lambda t: [1]), |
| ('cumsum', small_3d, lambda t: [-1], 'neg_dim'), |
| ('dim', small_3d, lambda t: [],), |
| ('dist', small_2d, lambda t: [small_2d(t)]), |
| ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'), |
| ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'), |
| ('dot', medium_1d, lambda t: [medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), |
| ('element_size', medium_1d, lambda t: [],), |
| ('eq', small_3d_ones, lambda t: [small_3d(t)],), |
| ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), |
| ('ne', small_3d_ones, lambda t: [small_3d(t)],), |
| ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), |
| ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), |
| ('equal', small_3d_ones, lambda t: [small_3d(t)],), |
| ('expand', new_t(M, 1, M), lambda t: [M, 4, M],), |
| ('expand_as', new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],), |
| ('fill', medium_2d, lambda t: [number(3.14, 3, t)]), |
| ('ge', medium_2d, lambda t: [medium_2d(t)],), |
| ('le', medium_2d, lambda t: [medium_2d(t)],), |
| ('gt', medium_2d, lambda t: [medium_2d(t)],), |
| ('lt', medium_2d, lambda t: [medium_2d(t)],), |
| ('is_contiguous', medium_2d, lambda t: [],), |
| # TODO: can't check negative case - GPU copy will be contiguous |
| ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'), |
| ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'), |
| ('is_set_to', medium_2d, lambda t: [medium_2d(t)],), |
| # TODO: add a positive is_set_to case |
| ('kthvalue', small_3d_unique, lambda t: [3],), |
| ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'), |
| ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'), |
| ('lerp', small_3d, lambda t: [small_3d(t), 0.3]), |
| ('max', small_3d_unique, lambda t: []), |
| ('max', small_3d_unique, lambda t: [1], 'dim'), |
| ('max', small_3d_unique, lambda t: [-1], 'neg_dim'), |
| ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), |
| ('min', small_3d_unique, lambda t: []), |
| ('min', small_3d_unique, lambda t: [1], 'dim'), |
| ('min', small_3d_unique, lambda t: [-1], 'neg_dim'), |
| ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), |
| ('mean', small_3d, lambda t: []), |
| ('mean', small_3d, lambda t: [-1], 'neg_dim'), |
| ('mean', small_3d, lambda t: [1], 'dim'), |
| ('mean', giant_1d_ones, lambda t: [], '64bit_indexing', |
| # Double here because otherwise the CPU result will be |
| # wrong. |
| [torch.DoubleTensor]), |
| ('mode', small_3d, lambda t: []), |
| ('mode', small_3d, lambda t: [1], 'dim'), |
| ('mode', small_3d, lambda t: [-1], 'neg_dim'), |
| ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half), |
| ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half), |
| ('remainder', small_3d, lambda t: [3], 'value',), |
| ('remainder', small_3d, lambda t: [-3], 'negative_value', signed_types), |
| ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), |
| ('remainder', small_3d, lambda t: [constant_tensor_sub(0, small_3d_positive(t))], 'negative_tensor', signed_types), |
| ('std', small_3d, lambda t: []), |
| ('std', small_3d, lambda t: [1], 'dim', types, False), |
| ('std', small_3d, lambda t: [-1], 'neg_dim', types, False), |
| ('var', small_3d, lambda t: []), |
| ('var', small_3d, lambda t: [1], 'dim'), |
| ('var', small_3d, lambda t: [-1], 'neg_dim'), |
| ('ndimension', small_3d, lambda t: [],), |
| ('nelement', small_3d, lambda t: [],), |
| ('numel', small_3d, lambda t: [],), |
| ('narrow', small_3d, lambda t: [1, 3, 2],), |
| ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'), |
| ('nonzero', small_3d, lambda t: [], '', types, False), |
| ('norm', small_3d, lambda t: []), |
| ('norm', small_3d, lambda t: [3], '3_norm'), |
| ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'), |
| ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'), |
| ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],), |
| ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],), |
| ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])], '', types, False), |
| ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'), |
| ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'), |
| ('prod', small_2d_oneish, lambda t: []), |
| ('prod', small_3d, lambda t: [1], 'dim'), |
| ('prod', small_3d, lambda t: [-1], 'neg_dim'), |
| ('sum', small_2d, lambda t: []), |
| ('sum', small_3d, lambda t: [1], 'dim'), |
| ('sum', small_3d, lambda t: [-1], 'neg_dim'), |
| ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'), |
| ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'), |
| ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'), |
| ('repeat', small_2d, lambda t: [2, 2, 2],), |
| ('size', new_t(1, 2, 3, 4), lambda t: [],), |
| ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'), |
| ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'), |
| ('sort', small_3d_unique, lambda t: [], ''), |
| ('sort', small_3d_unique, lambda t: [1], 'dim'), |
| ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'), |
| ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), |
| ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'), |
| ('split', small_3d, lambda t: [2],), |
| ('split', small_3d, lambda t: [2, 1], 'dim'), |
| ('split', small_3d, lambda t: [2, -3], 'neg_dim'), |
| ('squeeze', new_t(1, 2, 1, 4), lambda t: [],), |
| ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'), |
| ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'), |
| ('t', new_t(1, 2), lambda t: [],), |
| ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])], '', types, False), |
| ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],), |
| ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'), |
| ('to_list', small_3d, lambda t: [],), |
| ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort',), |
| ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort',), |
| ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort',), |
| ('trace', medium_2d, lambda t: []), |
| ('tril', medium_2d, lambda t: [],), |
| ('tril', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), |
| ('tril', medium_2d, lambda t: [2], 'positive'), |
| ('tril', medium_2d, lambda t: [-2], 'negative'), |
| ('triu', medium_2d, lambda t: [],), |
| ('triu', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), |
| ('triu', medium_2d, lambda t: [2], 'positive'), |
| ('triu', medium_2d, lambda t: [-2], 'negative'), |
| ('unsqueeze', new_t(2, 3, 4), lambda t: [2],), |
| ('unsqueeze', new_t(2, 3, 4), lambda t: [-2], 'neg_dim'), |
| ('view', small_3d, lambda t: [100, 10], 'contiguous'), |
| ('view_as', small_3d, lambda t: [make_tensor(t, 100, 10)],), |
| ('zero', small_3d, lambda t: [],), |
| ('zeros', small_3d, lambda t: [1, 2, 3, 4],), |
| ('eye', small_2d, lambda t: [3, 4],), |
| ('flip', small_3d, lambda t: [0], 'd0', types, True), |
| ('flip', small_3d, lambda t: [0, 1, 2], 'd012', types, True), |
| ('flip', small_3d, lambda t: [0, 2], 'd02', types, True), |
| ('flip', small_3d, lambda t: [2, 0], 'd20', types, True), |
| ('flip', small_3d, lambda t: [-1], 'neg_d', types, True), |
| ('rot90', small_2d, lambda t: [1, [0, 1]], 'k1_d01', types, True), |
| ('rot90', small_3d, lambda t: [1, [1, 2]], 'k1_d12', types, True), |
| ('rot90', small_3d, lambda t: [1, [1, -1]], 'k1_neg_d', types, True), |
| ('rot90', small_3d, lambda t: [], 'default', types, True), |
| ('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types), |
| ('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), |
| ('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), |
| ('__lshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(1, 5), t)), |
| lambda t: [2], None, signed_types), |
| ('__rshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(3, 7), t)), |
| lambda t: [2], None, signed_types), |
| # lapack tests |
| ('qr', small_2d_lapack, lambda t: [], 'square', float_types, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('qr', large_2d_lapack, lambda t: [], 'big', float_types, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('geqrf', new_t(20, 20), lambda t: [], None, float_types, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', new_t(10, 10), lambda t: [], 'square', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', lambda t: new_t(10, 10)(t).t(), lambda t: [True], 'square_col_maj', |
| float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', new_t(20, 5), lambda t: [True], 'tall_some', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', new_t(20, 5), lambda t: [False], 'tall_all', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [True], |
| 'tall_some_col_maj', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [False], |
| 'tall_all_col_maj', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ('eig', new_t(10, 10), lambda t: [True], 'with_eigvec', float_types_no_half, False, |
| unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), |
| ] |
| |
| # TODO: random functions, cat, gather, scatter, index*, masked*, |
| # resize, resizeAs, storage_offset, storage, stride, unfold |
| |
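| # Per-function absolute tolerances used when comparing CPU and GPU results. |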
| custom_precision = { |
| 'addbmm': 1e-4, |
| 'addmm': 1e-4, |
| 'addmv': 1e-4, |
| 'addr': 1e-4, |
| 'baddbmm': 1e-4, |
| 'rsqrt': 1e-4, |
| 'cumprod': 1e-4, |
| 'qr': 3e-4, |
| 'digamma': 1e0, # large values lead to large absolute error but small relative error |
| } |
| |
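| # Looser per-function tolerances applied when the tensor type under test is HalfTensor. |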
| custom_half_precision = { |
| 'add': 1e-2, |
| 'acos': 1e-3, |
| 'addbmm': 1e-1, |
| 'addcdiv': 1e-2, |
| 'addcmul': 1e-2, |
| 'addmm': 1e-1, |
| 'addmv': 1e-2, |
| 'addr': 1e-2, |
| 'asin': 1e-3, |
| 'atan2': 1e-3, |
| 'atan': 1e-3, |
| 'baddbmm': 1e-2, |
| 'cos': 1e-3, |
| 'cosh': 1e-2, |
| 'cross': 1e-2, |
| 'cumprod': 1e-2, |
| 'cumsum': 1e-2, |
| 'dist': 1e-2, |
| 'div': 1e-3, |
| 'dot': 1e-2, |
| 'erf': 1e-3, |
| 'erfc': 1e-3, |
| 'erfinv': 1e-3, |
| 'exp': 1e-2, |
| 'expm1': 1e-2, |
| 'fill': 1e-3, |
| 'lerp': 1e-2, |
| 'lgamma': 1e-2, |
| 'log': 1e-2, |
| 'log10': 1e-2, |
| 'log1p': 1e-3, |
| 'log2': 1e-2, |
| 'mean': 1e-3, |
| 'mul': 1e-2, |
| 'norm': 1e-1, |
| 'pow': 1e-1, |
| 'prod': 1e-3, |
| 'reciprocal': 1e-1, |
| 'remainder': 1e-3, |
| 'renorm': 1e-3, |
| 'rsqrt': 1e-2, |
| 'sigmoid': 1e-3, |
| 'sin': 1e-3, |
| 'sinh': 1e-3, |
| 'sqrt': 1e-3, |
| 'std': 1e-3, |
| 'sub': 1e-2, |
| 'sum': 1e-2, |
| 'tan': 1e-3, |
| 'tanh': 1e-3, |
| 'trace': 1e-3, |
| 'var': 1e-3, |
| '__lshift__': 1e-3, |
| '__rshift__': 1e-3, |
| } |
| |
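| # Argument-free elementwise ops appended to the test list: the first group runs on all |
| # types, the second only on floating-point types. |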
| simple_pointwise = [ |
| 'abs', |
| 'sign', |
| ] |
| for fn in simple_pointwise: |
| tests.append((fn, small_3d, lambda t: [])) |
| |
| simple_pointwise_float = [ |
| 'log', |
| 'log10', |
| 'log1p', |
| 'log2', |
| 'sigmoid', |
| 'sin', |
| 'sqrt', |
| 'tanh', |
| 'acos', |
| 'asin', |
| 'atan', |
| 'cos', |
| 'cosh', |
| 'erf', |
| 'erfc', |
| 'erfinv', |
| 'exp', |
| 'expm1', |
| 'reciprocal', |
| 'floor', |
| 'frac', |
| 'neg', |
| 'round', |
| 'trunc', |
| 'ceil', |
| 'lgamma', |
| 'digamma', |
| 'trigamma', |
| ] |
| |
| for fn in simple_pointwise_float: |
| tests.append((fn, small_3d, lambda t: [], None, float_types)) |
| |
| _cycles_per_ms = None |
| |
| |
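| # A typical use is torch.cuda._sleep(int(n_ms * get_cycles_per_ms())) to busy-wait for |
| # roughly n_ms milliseconds on the GPU. |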
| def get_cycles_per_ms(): |
| """Approximate number of cycles per millisecond for torch.cuda._sleep""" |
| global _cycles_per_ms |
| if _cycles_per_ms is None: |
| start = torch.cuda.Event(enable_timing=True) |
| end = torch.cuda.Event(enable_timing=True) |
| start.record() |
| torch.cuda._sleep(1000000) |
| end.record() |
| end.synchronize() |
| _cycles_per_ms = 1000000 / start.elapsed_time(end) |
| return _cycles_per_ms |
| |
| |
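| # Build a test method that applies fn to a CPU tensor and to its GPU copy with matching |
| # arguments, then checks that the (possibly modified) inputs and the results agree within precision. |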
| def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5): |
| def tmp(self): |
| cpu_tensor = tensor_constructor(t) |
| gpu_tensor = to_gpu(cpu_tensor) |
| cpu_args = arg_constructor(t) |
| gpu_args = [to_gpu(arg) for arg in cpu_args] |
| if is_half(t): |
| cpu_tensor = cpu_tensor.float() |
| cpu_args = [arg.float() if isinstance(arg, torch.Tensor) and is_half(arg) else arg for arg in cpu_args] |
| cpu_result = getattr(cpu_tensor, fn)(*cpu_args) |
| try: |
| gpu_result = getattr(gpu_tensor, fn)(*gpu_args) |
| except RuntimeError as e: |
| reason = e.args[0] |
| if 'only supports floating-point types' in reason or 'unimplemented data type' in reason: |
| raise unittest.SkipTest('unimplemented data type') |
| raise |
| except AttributeError as e: |
| reason = e.args[0] |
| if 'object has no attribute' in reason: |
| raise unittest.SkipTest('unimplemented data type') |
| raise |
| # If the op modified its inputs in place, the CPU and GPU copies must have changed in the same way |
| self.assertEqual(cpu_tensor, gpu_tensor, precision) |
| self.assertEqual(cpu_args, gpu_args, precision) |
| # Compare results |
| if fn == 'element_size' and t.__name__ == 'HalfTensor': |
| # cpu_tensor was upcast to float above, so cpu_result would be 4; check the half GPU result against 2 directly |
| self.assertEqual(2, gpu_result) |
| else: |
| self.assertEqual(cpu_result, gpu_result, precision) |
| return tmp |
| |
| |
| class TestCuda(TestCase): |
| _do_cuda_memory_leak_check = True |
| FIFTY_MIL_CYCLES = 50000000 |
| |
| @staticmethod |
| def _test_memory_stats_generator(self, device=None, N=35): |
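| # Yields after each allocation/free step so that test_memory_stats_multigpu can |
| # interleave two instances running on different devices. |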
| if device is None: |
| device = torch.cuda.current_device() |
| |
| m0 = torch.cuda.memory_allocated(device) |
| last_m_arr = [torch.cuda.memory_allocated(device)] |
| max_m_arr = [torch.cuda.max_memory_allocated(device)] |
| last_c_arr = [torch.cuda.memory_cached(device)] |
| max_c_arr = [torch.cuda.max_memory_cached(device)] |
| |
| def alloc(*size): |
| with torch.cuda.device(device): |
| # NOTE: do **not** use methods that can have additional |
| # memory overhead, e.g., inplace random sampling methods. |
| # they can leave some memory occupied even after being |
| # deallocated, e.g., initialized RNG state, causing some |
| # memory checks below to fail. |
| return torch.cuda.FloatTensor(*size) |
| |
| def assert_change(comp=1, empty_cache=False, reset_max_alloc=False, reset_max_cached=False): |
| # comp > 0: increased |
| # comp = 0: equal |
| # comp < 0: decreased |
| new_m = torch.cuda.memory_allocated(device) |
| new_max_m = torch.cuda.max_memory_allocated(device) |
| if comp > 0: |
| self.assertGreater(new_m, last_m_arr[0]) |
| elif comp < 0: |
| self.assertLess(new_m, last_m_arr[0]) |
| else: |
| self.assertEqual(new_m, last_m_arr[0]) |
| self.assertLessEqual(new_m, new_max_m) |
| self.assertGreaterEqual(new_max_m, max_m_arr[0]) |
| last_m_arr[0] = new_m |
| max_m_arr[0] = new_max_m |
| |
| new_c = torch.cuda.memory_cached(device) |
| new_max_c = torch.cuda.max_memory_cached(device) |
| # the cache can shrink (blocks may be released during allocation or by empty_cache), |
| # so we can't assert new_c >= last_c |
| self.assertLessEqual(new_c, new_max_c) |
| self.assertGreaterEqual(new_max_c, max_c_arr[0]) |
| last_c_arr[0] = new_c |
| max_c_arr[0] = new_max_c |
| |
| if empty_cache: |
| torch.cuda.empty_cache() |
| new_c = torch.cuda.memory_cached(device) |
| new_max_c = torch.cuda.max_memory_cached(device) |
| self.assertLessEqual(new_c, last_c_arr[0]) |
| self.assertLessEqual(new_c, new_max_c) |
| self.assertEqual(new_max_c, max_c_arr[0]) |
| last_c_arr[0] = new_c |
| |
| if reset_max_alloc: |
| torch.cuda.reset_max_memory_allocated(device) |
| self.assertEqual(torch.cuda.memory_allocated(device), last_m_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_allocated(device), last_m_arr[0]) |
| max_m_arr[0] = last_m_arr[0] |
| self.assertEqual(torch.cuda.memory_cached(device), last_c_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_cached(device), max_c_arr[0]) |
| |
| if reset_max_cached: |
| torch.cuda.reset_max_memory_cached(device) |
| self.assertEqual(torch.cuda.memory_allocated(device), last_m_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_allocated(device), max_m_arr[0]) |
| self.assertEqual(torch.cuda.memory_cached(device), last_c_arr[0]) |
| self.assertEqual(torch.cuda.max_memory_cached(device), last_c_arr[0]) |
| max_c_arr[0] = last_c_arr[0] |
| |
| assert_change(0) |
| assert_change(0, reset_max_alloc=True) |
| assert_change(0, empty_cache=True) |
| assert_change(0, reset_max_cached=True) |
| assert_change(0) |
| yield |
| |
| tensors1 = [alloc(1), alloc(10, 20), alloc(200, 300, 2000)] |
| m1 = torch.cuda.memory_allocated(device) |
| assert_change(1) |
| yield |
| |
| tensors2 = [] |
| |
| for i in range(1, int(N / 2) + 1): |
| # small ones |
| tensors2.append(alloc(i, i * 4)) |
| assert_change(1) |
| yield |
| |
| for i in range(5, int(N / 2) + 5): |
| # large ones |
| tensors2.append(alloc(i, i * 7, i * 9, i * 11)) |
| assert_change(1, reset_max_alloc=(i % 2 == 0), reset_max_cached=(i % 2 == 1)) |
| yield |
| |
| tensors2.append(alloc(0, 0, 0)) |
| assert_change(0) |
| yield |
| |
| permute = [] |
| for i in torch.randperm(len(tensors2)): |
| permute.append(tensors2[i]) |
| assert_change(0) |
| yield |
| |
| del tensors2 |
| assert_change(0) |
| yield |
| tensors2 = permute |
| assert_change(0) |
| yield |
| del permute |
| assert_change(0, reset_max_alloc=True) |
| yield |
| |
| for i in range(int(N / 2)): |
| x = tensors2[i].numel() |
| del tensors2[i] |
| assert_change(-x) # -x is 0 for an empty tensor, so no change is expected in that case |
| yield |
| |
| for i in range(2, int(2 * N / 3) + 2): |
| tensors2.append(alloc(i, i * 3, i * 8)) |
| assert_change(1) |
| yield |
| |
| del tensors2 |
| assert_change(-1, reset_max_cached=True) |
| assert_change(0) |
| self.assertEqual(torch.cuda.memory_allocated(device), m1) |
| yield True |
| |
| del tensors1 |
| assert_change(-1, reset_max_alloc=True) |
| self.assertEqual(torch.cuda.memory_allocated(device), m0) |
| |
| # test empty_cache and reset_max_memory_* |
| assert_change(0, empty_cache=True) |
| assert_change(0, reset_max_cached=True) |
| assert_change(0, reset_max_alloc=True) |
| |
| def test_memory_stats(self): |
| torch.cuda.empty_cache() |
| for _ in self._test_memory_stats_generator(self): |
| pass |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_memory_stats_multigpu(self): |
| # advance a generator and track whether it has been exhausted |
| def advance(gen, end): |
| if not end: |
| try: |
| next(gen) |
| except StopIteration: |
| end = True |
| return end |
| |
| # interleave: advance the two generators in lock step |
| torch.cuda.empty_cache() |
| gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35) |
| gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35) |
| end0 = end1 = False |
| while not (end0 and end1): |
| end0 = advance(gen0, end0) |
| end1 = advance(gen1, end1) |
| |
| # semi-random order |
| torch.cuda.empty_cache() |
| gen0 = self._test_memory_stats_generator(self, device=0, N=35) |
| gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35) |
| end0 = end1 = False |
| |
| while not (end0 and end1): |
| end0 = advance(gen0, end0) |
| if not end0: |
| gen1_max_times = torch.LongTensor(1).random_(0, 3)[0] |
| else: |
| gen1_max_times = inf |
| t = 0 |
| while t < gen1_max_times and not end1: |
| end1 = advance(gen1, end1) |
| t += 1 |
| |
| def test_out_of_memory(self): |
| tensor = torch.zeros(1024, device='cuda') |
| |
| with self.assertRaisesRegex(RuntimeError, "Tried to allocate 80.00 GiB"): |
| torch.empty(1024 * 1024 * 1024 * 80, dtype=torch.int8, device='cuda') |
| |
| # ensure the out-of-memory error does not disturb subsequent kernels |
| tensor.fill_(1) |
| self.assertTrue((tensor == 1).all()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_autogpu(self): |
| x = torch.randn(5, 5).cuda() |
| y = torch.randn(5, 5).cuda() |
| self.assertEqual(x.get_device(), 0) |
| self.assertEqual(x.get_device(), 0) |
| with torch.cuda.device(1): |
| z = torch.randn(5, 5).cuda() |
| self.assertEqual(z.get_device(), 1) |
| q = x.add(y) |
| self.assertEqual(q.get_device(), 0) |
| w = torch.randn(5, 5).cuda() |
| self.assertEqual(w.get_device(), 1) |
| self.assertEqual(y.cuda().get_device(), 1) |
| z = z.cuda() |
| self.assertEqual(z.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_new(self): |
| x = torch.randn(3, 3).cuda() |
| self.assertEqual(x.new([0, 1, 2]).get_device(), 0) |
| self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) |
| |
| with torch.cuda.device(1): |
| self.assertEqual(x.new([0, 1, 2]).get_device(), 0) |
| self.assertEqual(x.new([0, 1, 2], device=1).get_device(), 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_copy_device(self): |
| x = torch.randn(5, 5).cuda() |
| with torch.cuda.device(1): |
| y = x.cuda() |
| self.assertEqual(y.get_device(), 1) |
| self.assertIs(y.cuda(), y) |
| z = y.cuda(0) |
| self.assertEqual(z.get_device(), 0) |
| self.assertIs(z.cuda(0), z) |
| |
| x = torch.randn(5, 5) |
| with torch.cuda.device(1): |
| y = x.cuda() |
| self.assertEqual(y.get_device(), 1) |
| self.assertIs(y.cuda(), y) |
| z = y.cuda(0) |
| self.assertEqual(z.get_device(), 0) |
| self.assertIs(z.cuda(0), z) |
| |
| def test_copy_non_blocking(self): |
| x = torch.randn(5, 5).cuda() |
| y = torch.zeros(5, 5) |
| y.copy_(x, non_blocking=True) |
| self.assertEqual(x, y) |
| |
| x = torch.randn(5, 5) |
| y = torch.zeros(5, 5).cuda() |
| y.copy_(x, non_blocking=True) |
| self.assertEqual(x, y) |
| |
| def test_serialization_array_with_storage(self): |
| x = torch.randn(5, 5).cuda() |
| y = torch.IntTensor(2, 5).fill_(0).cuda() |
| q = [x, y, x, y.storage()] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(q, f) |
| f.seek(0) |
| q_copy = torch.load(f) |
| self.assertEqual(q_copy, q, 0) |
| q_copy[0].fill_(5) |
| self.assertEqual(q_copy[0], q_copy[2], 0) |
| self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor)) |
| self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) |
| self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor)) |
| self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage)) |
| q_copy[1].fill_(10) |
| self.assertEqual(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) |
| |
| def test_type_conversions(self): |
| x = torch.randn(5, 5) |
| self.assertIsInstance(x.float(), torch.FloatTensor) |
| self.assertIsInstance(x.cuda(), torch.cuda.DoubleTensor) |
| self.assertIsInstance(x.cuda().float(), torch.cuda.FloatTensor) |
| self.assertIsInstance(x.cuda().float().cpu(), torch.FloatTensor) |
| self.assertIsInstance(x.cuda().float().cpu().int(), torch.IntTensor) |
| |
| y = x.storage() |
| self.assertIsInstance(y.float(), torch.FloatStorage) |
| self.assertIsInstance(y.cuda(), torch.cuda.DoubleStorage) |
| self.assertIsInstance(y.cuda().float(), torch.cuda.FloatStorage) |
| self.assertIsInstance(y.cuda().float().cpu(), torch.FloatStorage) |
| self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) |
| |
| def test_mul_intertype_scalar(self): |
| def test_mul(dtype): |
| x = torch.tensor(1.5, dtype=dtype, device='cuda') |
| y = torch.tensor(3, dtype=torch.int32, device='cuda') |
| |
| self.assertEqual(x * y, 4.5) |
| self.assertEqual(y * x, 4.5) |
| with self.assertRaisesRegex(RuntimeError, "doesn't match the desired type"): |
| y *= x |
| x *= y |
| self.assertEqual(x, 4.5) |
| |
| test_mul(torch.float16) |
| test_mul(torch.float32) |
| test_mul(torch.float64) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_type_conversions_same_gpu(self): |
| x = torch.randn(5, 5).cuda(1) |
| self.assertEqual(x.int().get_device(), 1) |
| self.assertEqual(x.type(torch.int).get_device(), 1) |
| self.assertEqual(x.to(torch.int).get_device(), 1) |
| |
| def test_neg(self): |
| _TestTorchMixin._test_neg(self, lambda t: t.cuda()) |
| |
| def test_isinf(self): |
| _TestTorchMixin._test_isinf(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") |
| def test_arithmetic_large_tensor(self): |
| x = torch.empty(2**30, device='cuda') |
| |
| x.fill_(1) |
| self.assertEqual(x.sum(), 2**30) |
| |
| x += 1 |
| self.assertEqual(x.sum(), 2**31) |
| |
| x.fill_(1) |
| x -= 0.5 |
| self.assertEqual(x.sum(), 2**29) |
| |
| x.fill_(1) |
| x *= 2 |
| self.assertEqual(x.sum(), 2**31) |
| |
| x.fill_(1) |
| x /= 2 |
| self.assertEqual(x.sum(), 2**29) |
| |
| def _test_broadcast(self, input): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| result = comm.broadcast(input, (0, 1)) |
| for i, t in enumerate(result): |
| self.assertEqual(t.get_device(), i) |
| self.assertEqual(t, input) |
| if input.is_cuda and input.get_device() == i: |
| self.assertEqual(t.data_ptr(), input.data_ptr()) |
| |
| def test_broadcast_cpu(self): |
| self._test_broadcast(torch.randn(5, 5)) |
| |
| def test_broadcast_gpu(self): |
| self._test_broadcast(torch.randn(5, 5).cuda()) |
| |
| def test_min_max_nan(self): |
| tests = [(lambda x: x.min(), 'min'), |
| (lambda x: x.max(), 'max'), |
| (lambda x: x.min(0)[0], 'min_dim'), |
| (lambda x: x.max(0)[0], 'max_dim')] |
| for f, name in tests: |
| a = torch.arange(25.0).view(5, 5) |
| a[2, 2] = nan |
| actual = f(a.cuda()).cpu() |
| expected = f(a).cpu() |
| self.assertEqual(torch.isnan(actual), torch.isnan(expected), 'nans for {}'.format(name)) |
| self.assertEqual(actual[~torch.isnan(actual)], |
| expected[~torch.isnan(expected)], 'nans for {}'.format(name)) |
| |
| @staticmethod |
| def _test_broadcast_coalesced(self, tensors, buffer_size): |
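| # Broadcast each tensor individually, then all of them coalesced, and check the two paths agree. |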
| b_tensors = [comm.broadcast(t, (0, 1)) for t in tensors] |
| for (_, bt), t in zip(b_tensors, tensors): |
| self.assertEqual(bt.get_device(), 1) |
| self.assertEqual(bt, t) |
| self.assertIsInstance(bt, type(t)) |
| |
| bc_tensors = comm.broadcast_coalesced(tensors, (0, 1), buffer_size=buffer_size) |
| bc_tensors_t = list(zip(*bc_tensors)) |
| self.assertEqual(b_tensors, bc_tensors_t) |
| for (_, bt), (_, bct) in zip(b_tensors, bc_tensors_t): |
| self.assertEqual(bt.get_device(), bct.get_device()) |
| self.assertIsInstance(bct, type(bt)) |
| |
| # check that tensors on device[0] are returned as-is |
| for out_tensors in (b_tensors, bc_tensors_t): |
| for inp_t, (out_t, _) in zip(tensors, out_tensors): |
| self.assertIs(inp_t, out_t) |
| |
| # check that the tensors not on device[0] have different version counters |
| # NOTE [ Version Counter in comm.*_coalesced ] |
| versions = [t._version for _, t in bc_tensors_t] |
| for old_version, (_, t) in zip(versions, bc_tensors_t): |
| self.assertEqual(t._version, old_version) |
| t.zero_() |
| self.assertEqual(t._version, old_version + 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| @skipIfRocm |
| def test_broadcast_coalesced(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3), |
| make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_broadcast_coalesced_dense_only(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_broadcast_coalesced(self, tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_reduce_add(self): |
| x = torch.randn(5, 5) |
| y = torch.randn(5, 5) |
| x_cuda = x.cuda(0) |
| y_cuda = y.cuda(1) |
| result = comm.reduce_add((x_cuda, y_cuda)) |
| self.assertEqual(result.get_device(), 0) |
| self.assertEqual(result.cpu(), x + y) |
| |
| @staticmethod |
| def _test_reduce_add_coalesced(self, tensors, buffer_size): |
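| # Reduce each tensor with its cuda:1 copy individually, then coalesced, and check both paths give the same result. |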
| dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))] |
| |
| r_tensors = list(map(comm.reduce_add, zip(*dup_tensors))) |
| for r, t in zip(r_tensors, tensors): |
| self.assertEqual(r.get_device(), t.get_device()) |
| self.assertEqual(r, t * 2) |
| self.assertEqual(r.type(), t.type()) |
| |
| rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size) |
| self.assertEqual(r_tensors, rc_tensors) |
| for r, rc in zip(r_tensors, rc_tensors): |
| self.assertEqual(rc.get_device(), r.get_device()) |
| self.assertEqual(rc.type(), r.type()) |
| |
| # Since we have both cuda:0 and cuda:1 inputs, the outputs must be new. |
| # We can check that they have different version counters. |
| # NOTE [ Version Counter in comm.*_coalesced ] |
| versions = [t._version for t in rc_tensors] |
| for old_version, t in zip(versions, rc_tensors): |
| self.assertEqual(t._version, old_version) |
| t.zero_() |
| self.assertEqual(t._version, old_version + 1) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| @skipIfRocm |
| def test_reduce_add_coalesced(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 1, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 10, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.DoubleTensor, 5, 2, 3), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 7, 3, 3), |
| make_sparse_tensor(torch.cuda.sparse.FloatTensor, 2, 2, 3), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| make_sparse_tensor(torch.cuda.sparse.LongTensor, 3, 2, 7), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_reduce_add_coalesced_dense_only(self): |
| numel = 5 |
| num_bytes = numel * 8 |
| tensors = [ |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel).long().cuda(), |
| torch.randn(numel * 2).int().cuda(), # int is 2x shorter |
| torch.randn(numel).cuda(), |
| ] |
| self._test_reduce_add_coalesced(self, tensors, num_bytes * 5 // 2) |
| |
| def _test_scatter(self, input, chunk_sizes=None, dim=0): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| result = comm.scatter(input, (0, 1), chunk_sizes, dim) |
| self.assertEqual(len(result), 2) |
| if chunk_sizes is None: |
| chunk_sizes = tuple(repeat(input.size(dim) // 2, 2)) |
| chunk_start = 0 |
| for i, r in enumerate(result): |
| chunk_end = chunk_start + chunk_sizes[i] |
| index = [slice(None, None), slice(None, None)] |
| index[dim] = slice(chunk_start, chunk_end) |
| self.assertEqual(r, input[tuple(index)], 0) |
| chunk_start = chunk_end |
| |
| def test_scatter_cpu(self): |
| self._test_scatter(torch.randn(4, 4), dim=0) |
| |
| def test_scatter_cpu_dim(self): |
| self._test_scatter(torch.randn(4, 4), dim=1) |
| |
| def test_scatter_cpu_neg_dim(self): |
| self._test_scatter(torch.randn(4, 4), dim=-2) |
| |
| def test_scatter_cpu_sizes(self): |
| self._test_scatter(torch.randn(6, 4), chunk_sizes=(2, 4)) |
| |
| def test_scatter_gpu(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=0) |
| |
| @skipIfRocm |
| def test_scatter_gpu_dim(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=1) |
| |
| def test_scatter_gpu_neg_dim(self): |
| self._test_scatter(torch.randn(4, 4).cuda(), dim=-2) |
| |
| def test_scatter_gpu_sizes(self): |
| self._test_scatter(torch.randn(6, 4).cuda(), chunk_sizes=(2, 4)) |
| |
| def _test_gather(self, dim): |
| if not TEST_MULTIGPU: |
| raise unittest.SkipTest("only one GPU detected") |
| x = torch.randn(2, 5).cuda(0) |
| y = torch.randn(2, 5).cuda(1) |
| result = comm.gather((x, y), dim) |
| |
| expected_size = list(x.size()) |
| expected_size[dim] += y.size(dim) |
| expected_size = torch.Size(expected_size) |
| self.assertEqual(result.get_device(), 0) |
| self.assertEqual(result.size(), expected_size) |
| |
| index = [slice(None, None), slice(None, None)] |
| index[dim] = slice(0, x.size(dim)) |
| self.assertEqual(result[tuple(index)], x) |
| index[dim] = slice(x.size(dim), x.size(dim) + y.size(dim)) |
| self.assertEqual(result[tuple(index)], y) |
| |
| def test_gather(self): |
| self._test_gather(0) |
| |
| def test_gather_dim(self): |
| self._test_gather(1) |
| |
| def test_from_sequence(self): |
| seq = [list(range(i * 4, i * 4 + 4)) for i in range(5)] |
| reference = torch.arange(0, 20).resize_(5, 4) |
| for t in types: |
| cuda_type = get_gpu_type(t) |
| self.assertEqual(cuda_type(seq), reference) |
| |
| def test_torch_manual_seed_seeds_cuda_devices(self): |
| with freeze_rng_state(): |
| x = torch.zeros(4, 4).float().cuda() |
| torch.manual_seed(2) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| x.uniform_() |
| torch.manual_seed(2) |
| y = x.clone().uniform_() |
| self.assertEqual(x, y) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| |
| def test_manual_seed(self): |
| with freeze_rng_state(): |
| x = torch.zeros(4, 4).float().cuda() |
| torch.cuda.manual_seed(2) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| x.uniform_() |
| a = torch.bernoulli(torch.full_like(x, 0.5)) |
| torch.cuda.manual_seed(2) |
| y = x.clone().uniform_() |
| b = torch.bernoulli(torch.full_like(x, 0.5)) |
| self.assertEqual(x, y) |
| self.assertEqual(a, b) |
| self.assertEqual(torch.cuda.initial_seed(), 2) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_cat_autogpu(self): |
| x = torch.randn(4, 4).cuda(1) |
| y = torch.randn(4, 4).cuda(1) |
| z = torch.cat([x, y], 0) |
| self.assertEqual(z.get_device(), x.get_device()) |
| |
| def test_clamp(self): |
| _TestTorchMixin._test_clamp(self, 'cuda') |
| |
| def test_cat(self): |
| SIZE = 10 |
| for dim in range(-3, 3): |
| pos_dim = dim if dim >= 0 else 3 + dim |
| x = torch.rand(13, SIZE, SIZE).transpose(0, pos_dim).cuda() |
| y = torch.rand(17, SIZE, SIZE).transpose(0, pos_dim).cuda() |
| z = torch.rand(19, SIZE, SIZE).transpose(0, pos_dim).cuda() |
| |
| res1 = torch.cat((x, y, z), dim) |
| self.assertEqual(res1.narrow(pos_dim, 0, 13), x, 0) |
| self.assertEqual(res1.narrow(pos_dim, 13, 17), y, 0) |
| self.assertEqual(res1.narrow(pos_dim, 30, 19), z, 0) |
| |
| x = torch.randn(20, SIZE, SIZE).cuda() |
| self.assertEqual(torch.cat(torch.split(x, 7)), x) |
| self.assertEqual(torch.cat(torch.chunk(x, 7)), x) |
| |
| y = torch.randn(1, SIZE, SIZE).cuda() |
| z = torch.cat([x, y]) |
| self.assertEqual(z.size(), (21, SIZE, SIZE)) |
| |
| def test_cat_empty_legacy(self): |
| _TestTorchMixin._test_cat_empty_legacy(self, use_cuda=True) |
| |
| def test_cat_empty(self): |
| _TestTorchMixin._test_cat_empty(self, use_cuda=True) |
| |
| def test_bernoulli(self): |
| _TestTorchMixin._test_bernoulli(self, torch.float32, torch.float64, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.float32, torch.float16, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.float16, torch.float64, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.float16, torch.float16, 'cuda') |
| # test that it works with integral tensors |
| _TestTorchMixin._test_bernoulli(self, torch.uint8, torch.float64, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.uint8, torch.float16, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.int64, torch.float64, 'cuda') |
| _TestTorchMixin._test_bernoulli(self, torch.int64, torch.float16, 'cuda') |
| |
| def test_cat_bad_input_sizes(self): |
| x = torch.randn(2, 1).cuda() |
| y = torch.randn(2, 1, 1).cuda() |
| z = torch.randn(2, 1, 1).cuda() |
| self.assertRaises(RuntimeError, lambda: torch.cat([x, y, z])) |
| |
| x = torch.randn(2, 1, 2).cuda() |
| y = torch.randn(2, 1, 1).cuda() |
| z = torch.randn(2, 2, 1).cuda() |
| self.assertRaises(RuntimeError, lambda: torch.cat([x, y, z], dim=1)) |
| |
| @unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor") |
| @unittest.skipIf(not PY3, "Tensor was serialized with Python 3") |
| def test_load_nonexistent_device(self): |
| # Setup: create a serialized file object with a 'cuda:9' restore location |
| tensor = torch.randn(2, device='cuda') |
| buf = io.BytesIO() |
| torch.save(tensor, buf) |
| # NB: this might not work in the future if serialization changes |
| buf = io.BytesIO(buf.getvalue().replace(b'cuda:0', b'cuda:9')) |
| |
| msg = r'Attempting to deserialize object on CUDA device 9' |
| with self.assertRaisesRegex(RuntimeError, msg): |
| _ = torch.load(buf) |
| |
| def test_serialization(self): |
| x = torch.randn(4, 4).cuda() |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f) |
| self.assertEqual(x_copy, x) |
| self.assertIs(type(x_copy), type(x)) |
| self.assertEqual(x_copy.get_device(), x.get_device()) |
| |
| def test_serialization_array_with_empty(self): |
| x = [torch.randn(4, 4).cuda(), torch.cuda.FloatTensor()] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f) |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), original.get_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_serialization(self): |
| x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f) |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), original.get_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_serialization_remap(self): |
| x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] |
| |
| def gpu_remap(storage, location): |
| if location == 'cuda:1': |
| return storage.cuda(0) |
| |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f, map_location=gpu_remap) |
| |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_serialization_remap_dict(self): |
| x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)] |
| with tempfile.NamedTemporaryFile() as f: |
| torch.save(x, f) |
| f.seek(0) |
| x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'}) |
| for original, copy in zip(x, x_copy): |
| self.assertEqual(copy, original) |
| self.assertIs(type(copy), type(original)) |
| self.assertEqual(copy.get_device(), 0) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_multigpu_storage_clone(self): |
| x = torch.randn(4, 4, device='cuda:1').storage() |
| y = x.clone() |
| self.assertEqual(x.get_device(), y.get_device()) |
| for t in ['byte', 'char', 'short', 'int', 'long', 'half', 'double']: |
| self.assertEqual(getattr(x, t)().get_device(), x.get_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| def test_cuda_set_device(self): |
| x = torch.randn(5, 5) |
| with torch.cuda.device(1): |
| self.assertEqual(x.cuda().get_device(), 1) |
| torch.cuda.set_device(0) |
| self.assertEqual(x.cuda().get_device(), 0) |
| with torch.cuda.device(1): |
| self.assertEqual(x.cuda().get_device(), 1) |
| self.assertEqual(x.cuda().get_device(), 0) |
| torch.cuda.set_device(1) |
| self.assertEqual(x.cuda().get_device(), 0) |
| |
| def test_is_tensor(self): |
| for t in types: |
| tensor = get_gpu_type(t)() |
| self.assertTrue(torch.is_tensor(tensor)) |
| self.assertTrue(torch.is_tensor(torch.cuda.HalfTensor())) |
| |
| def test_cuda_synchronize(self): |
| torch.cuda.synchronize() |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_current_stream(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream(device=1) |
| s2 = torch.cuda.current_stream(device=0) |
| |
| self.assertEqual(d0, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(s0, s2) |
| |
| with torch.cuda.device(d1): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream(1) |
| s2 = torch.cuda.current_stream(d0) |
| |
| self.assertEqual(d1, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(s0, s1) |
| |
| with self.assertRaisesRegex(ValueError, |
| "Expected a cuda device, but got: cpu"): |
| torch.cuda.current_stream(torch.device('cpu')) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_default_stream(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.default_stream() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.default_stream() |
| |
| s2 = torch.cuda.default_stream(device=0) |
| s3 = torch.cuda.default_stream(d1) |
| |
| self.assertEqual(d0, s0.device) |
| self.assertEqual(d1, s1.device) |
| self.assertEqual(d0, s2.device) |
| self.assertEqual(d1, s3.device) |
| self.assertEqual(s0, s2) |
| self.assertEqual(s1, s3) |
| |
| with torch.cuda.device(d0): |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| |
| with torch.cuda.device(d1): |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| |
| with self.assertRaisesRegex(ValueError, |
| "Expected a cuda device, but got: cpu"): |
| torch.cuda.default_stream(torch.device('cpu')) |
| |
| def test_streams(self): |
| default_stream = torch.cuda.current_stream() |
| user_stream = torch.cuda.Stream() |
| self.assertEqual(torch.cuda.current_stream(), default_stream) |
| self.assertNotEqual(default_stream, user_stream) |
| self.assertEqual(default_stream.cuda_stream, 0) |
| self.assertNotEqual(user_stream.cuda_stream, 0) |
| with torch.cuda.stream(user_stream): |
| self.assertEqual(torch.cuda.current_stream(), user_stream) |
| self.assertTrue(user_stream.query()) |
| # copy a 10 MB tensor from CPU to GPU, which should take some time |
| tensor1 = torch.ByteTensor(10000000).pin_memory() |
| tensor2 = tensor1.cuda(non_blocking=True) |
| self.assertFalse(default_stream.query()) |
| default_stream.synchronize() |
| self.assertTrue(default_stream.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_stream_event_device(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| e0 = torch.cuda.Event() |
| |
| self.assertEqual(None, e0.device) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.Stream() |
| e1 = s1.record_event() |
| |
| self.assertEqual(s0.device, torch.device('cuda:0')) |
| self.assertEqual(e0.device, torch.device('cuda:0')) |
| self.assertEqual(s1.device, torch.device('cuda:1')) |
| self.assertEqual(e1.device, torch.device('cuda:1')) |
| |
| def test_stream_event_repr(self): |
| s = torch.cuda.current_stream() |
| self.assertTrue("torch.cuda.Stream" in s.__repr__()) |
| e = torch.cuda.Event() |
| self.assertTrue("torch.cuda.Event" in e.__repr__()) |
| s.record_event(e) |
| self.assertTrue("torch.cuda.Event" in e.__repr__()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_stream_context(self): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.Stream(device=1) |
| s2 = torch.cuda.Stream(device=0) |
| |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| with torch.cuda.stream(s1): |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| self.assertEqual(1, torch.cuda.current_device()) |
| with torch.cuda.stream(s2): |
| self.assertEqual(torch.cuda.current_stream(), s2) |
| self.assertEqual(0, torch.cuda.current_device()) |
| with torch.cuda.stream(s0): |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| self.assertEqual(torch.cuda.current_stream(), s2) |
| self.assertEqual(0, torch.cuda.current_device()) |
| self.assertEqual(torch.cuda.current_stream(), s1) |
| self.assertEqual(1, torch.cuda.current_device()) |
| |
| self.assertEqual(torch.cuda.current_stream(), s0) |
| self.assertEqual(0, torch.cuda.current_device()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_streams_multi_gpu(self): |
| default_stream = torch.cuda.current_stream() |
| self.assertEqual(default_stream.device, torch.device('cuda:0')) |
| stream = torch.cuda.Stream(device=1) |
| self.assertEqual(stream.device, torch.device('cuda:1')) |
| with torch.cuda.device(1): |
| self.assertEqual( |
| torch.cuda.current_stream().device, torch.device('cuda:1')) |
| self.assertNotEqual(torch.cuda.current_stream(), default_stream) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_streams_multi_gpu_query(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(s0.query()) |
| self.assertFalse(s1.query()) |
| |
| # deliberately using a different device |
| with torch.cuda.device(d0): |
| s1.synchronize() |
| |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_streams_multi_gpu_eq(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.current_stream() |
| |
| with torch.cuda.device(d1): |
| s2 = torch.cuda.current_stream() |
| s3 = torch.cuda.current_stream() |
| |
| self.assertTrue(s0 == s0) |
| self.assertTrue(s0 == s1) |
| self.assertTrue(s2 == s2) |
| self.assertTrue(s2 == s3) |
| self.assertFalse(s0 == s2) |
| self.assertFalse(s1 == s3) |
| |
| self.assertEqual(s0.device, s1.device) |
| self.assertEqual(s0.cuda_stream, s1.cuda_stream) |
| self.assertEqual(s2.device, s3.device) |
| self.assertEqual(s2.cuda_stream, s3.cuda_stream) |
| self.assertNotEqual(s0.device, s3.device) |
| |
| self.assertEqual(hash(s0), hash(s1)) |
| self.assertEqual(hash(s2), hash(s3)) |
| self.assertNotEqual(hash(s0), hash(s3)) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") |
| @skipIfRocm |
| def test_streams_priority(self): |
| low, high = torch.cuda.Stream.priority_range() |
| s0 = torch.cuda.Stream(device=0, priority=low) |
| |
| self.assertEqual(low, s0.priority) |
| self.assertEqual(torch.device('cuda:0'), s0.device) |
| |
| s1 = torch.cuda.Stream(device=1, priority=high) |
| |
| self.assertEqual(high, s1.priority) |
| self.assertEqual(torch.device('cuda:1'), s1.device) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") |
| def test_tensor_device(self): |
| self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 0) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=1).get_device(), 1) |
| with torch.cuda.device(1): |
| self.assertEqual(torch.cuda.FloatTensor(1).get_device(), 1) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=0).get_device(), 0) |
| self.assertEqual(torch.cuda.FloatTensor(1, device=None).get_device(), 1) |
| |
| @skipIfRocm |
| def test_events(self): |
| stream = torch.cuda.current_stream() |
| event = torch.cuda.Event(enable_timing=True) |
| self.assertTrue(event.query()) |
| start_event = torch.cuda.Event(enable_timing=True) |
| stream.record_event(start_event) |
| torch.cuda._sleep(int(50 * get_cycles_per_ms())) |
| stream.record_event(event) |
| self.assertFalse(event.query()) |
| event.synchronize() |
| self.assertTrue(event.query()) |
| self.assertGreater(start_event.elapsed_time(event), 0) |
| |
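| # The three helpers below exercise different synchronization paths
| # (stream.synchronize, event.synchronize, and event.wait) and return the
| # GPU time elapsed between e_tik and e_tok. They are called both directly
| # and from a worker thread in test_stream_event_nogil.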
| @staticmethod |
| def _stream_synchronize(self, spin_time_cycles): |
| s = torch.cuda.current_stream() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| e_tik.record(s) |
| torch.cuda._sleep(spin_time_cycles) |
| e_tok.record(s) |
| s.synchronize() |
| |
| self.assertTrue(s.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise.
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _event_synchronize(self, spin_time_cycles): |
| s = torch.cuda.current_stream() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| e_tik.record(s) |
| torch.cuda._sleep(spin_time_cycles) |
| s.record_event(e_tok) |
| e_tok.synchronize() |
| |
| self.assertTrue(s.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise.
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _event_wait(self, spin_time_cycles): |
| s0 = torch.cuda.current_stream() |
| s1 = torch.cuda.Stream() |
| e_tik = torch.cuda.Event(blocking=True, enable_timing=True) |
| e_tok = torch.cuda.Event(blocking=True, enable_timing=True) |
| |
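| # The work is split across two streams: s0 spins for all but 10 cycles,
| # then s1 waits on an event recorded on s0 and spins the remaining 10
| # cycles, so e_tik..e_tok spans the work on both streams.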
| e_tik.record(s0) |
| torch.cuda._sleep(spin_time_cycles - 10) |
| e_sync = torch.cuda.Event(blocking=True) |
| e_sync.record() |
| e_sync.wait(s1) |
| with torch.cuda.stream(s1): |
| torch.cuda._sleep(10) |
| s1.synchronize() |
| s1.record_event(e_tok) |
| |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| self.assertTrue(e_sync.query()) |
| |
| # not necessary to check e_tik and e_tok, as elapsed_time would throw |
| # an exception otherwise.
| return e_tik.elapsed_time(e_tok) |
| |
| @staticmethod |
| def _test_stream_event_nogil(self, sync_func, p2c, c2p): |
| with torch.cuda.device('cuda:1'): |
| c2p.put(0) |
| p2c.get() |
| c2p.put(sync_func(self, TestCuda.FIFTY_MIL_CYCLES)) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_stream_event_nogil(self): |
| for sync_func in [TestCuda._stream_synchronize, |
| TestCuda._event_synchronize, |
| TestCuda._event_wait]: |
| p2c = queue.Queue() |
| c2p = queue.Queue() |
| e_tik = torch.cuda.Event(enable_timing=True) |
| e_tok = torch.cuda.Event(enable_timing=True) |
| |
| t = threading.Thread( |
| target=TestCuda._test_stream_event_nogil, |
| args=(self, sync_func, p2c, c2p)) |
| t.daemon = True |
| t.start() |
| |
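| # handshake: wait until the child thread is ready, then release it so
| # that the parent and child run sync_func concurrently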
| c2p.get() |
| with torch.cuda.device('cuda:0'): |
| e_tik.record() |
| p2c.put(0) |
| parent_time = sync_func(self, TestCuda.FIFTY_MIL_CYCLES) |
| child_time = c2p.get() |
| e_tok.record() |
| e_tok.synchronize() |
| total_time = e_tik.elapsed_time(e_tok) |
| |
| # Without GIL, synchronizations in parent and child threads can |
| # overlap. The total execution time should be a little bit longer |
| # than spinning fifty million cycles and much shorter than twice
| # that. However, testing absolute execution time is not reliable as
| # it may vary on different hardware in different environments.
| # Therefore, this test uses relative comparisons, checking that the
| # sum of the parent and child threads' execution times is greater
| # than the real execution time by at least 40%.
| self.assertGreater(parent_time + child_time, total_time * 1.4) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_events_wait(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| e0 = torch.cuda.Event() |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| |
| self.assertFalse(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| s1.wait_event(e0) |
| s1.synchronize() |
| |
| self.assertTrue(e0.query()) |
| self.assertTrue(s0.query()) |
| self.assertTrue(s1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_events_multi_gpu_query(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e0 = s0.record_event() |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| e1 = s1.record_event() |
| |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(e0.query()) |
| self.assertFalse(e1.query()) |
| |
| # deliberately using a different device |
| with torch.cuda.device(d0): |
| e1.synchronize() |
| |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| with torch.cuda.device(d0): |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| with torch.cuda.device(d1): |
| self.assertTrue(e0.query()) |
| self.assertTrue(e1.query()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU") |
| @skipIfRocm |
| def test_events_multi_gpu_elapsed_time(self): |
| d0 = torch.device('cuda:0') |
| d1 = torch.device('cuda:1') |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e0 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(10) |
| s0.record_event(e0) |
| |
| with torch.cuda.device(d1): |
| s1 = torch.cuda.current_stream() |
| e1 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| s1.record_event(e1) |
| |
| e0.synchronize() |
| e1.synchronize() |
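| # elapsed_time between events recorded on different devices is undefined
| # and should raise, regardless of which device is current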
| with torch.cuda.device(d0): |
| with self.assertRaises(RuntimeError): |
| self.assertGreater(e0.elapsed_time(e1), 0) |
| |
| with torch.cuda.device(d1): |
| with self.assertRaises(RuntimeError): |
| self.assertGreater(e0.elapsed_time(e1), 0) |
| |
| with torch.cuda.device(d0): |
| s0 = torch.cuda.current_stream() |
| e2 = torch.cuda.Event(enable_timing=True) |
| torch.cuda._sleep(TestCuda.FIFTY_MIL_CYCLES) |
| s0.record_event(e2) |
| s0.synchronize() |
| |
| self.assertGreater(e0.elapsed_time(e2), 0) |
| |
| # deliberately calling from a different device |
| with torch.cuda.device(d1): |
| self.assertGreater(e0.elapsed_time(e2), 0) |
| |
| @skipIfRocm |
| def test_record_stream(self): |
| cycles_per_ms = get_cycles_per_ms() |
| |
| t = torch.FloatTensor([1, 2, 3, 4]).pin_memory() |
| result = torch.cuda.FloatTensor(t.size()) |
| stream = torch.cuda.Stream() |
| ptr = [None] |
| |
| # Performs the CPU->GPU copy in a background stream |
| def perform_copy(): |
| with torch.cuda.stream(stream): |
| tmp = t.cuda(non_blocking=True) |
| ptr[0] = tmp.data_ptr() |
| torch.cuda.current_stream().wait_stream(stream) |
| tmp.record_stream(torch.cuda.current_stream()) |
| torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy |
| result.copy_(tmp) |
| |
| perform_copy() |
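| # The block backing `tmp` was recorded as in use on the current stream,
| # and the delayed copy has not finished yet, so an allocation on the side
| # stream must not hand that block out again.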
| with torch.cuda.stream(stream): |
| tmp2 = torch.cuda.FloatTensor(t.size()) |
| tmp2.zero_() |
| self.assertNotEqual(tmp2.data_ptr(), ptr[0], 'allocation re-used too soon')
| |
| self.assertEqual(result.tolist(), [1, 2, 3, 4]) |
| |
| # Check that the block will be re-used after the main stream finishes |
| torch.cuda.current_stream().synchronize() |
| with torch.cuda.stream(stream): |
| tmp3 = torch.cuda.FloatTensor(t.size()) |
| self.assertEqual(tmp3.data_ptr(), ptr[0], 'allocation not re-used') |
| |
| def test_noncontiguous_pinned_memory(self): |
| # See issue #3266 |
| x = torch.arange(0, 10).view((2, 5)) |
| self.assertEqual(x.t(), x.t().pin_memory()) |
| |
| @skipIfRocm |
| def test_caching_pinned_memory(self): |
| cycles_per_ms = get_cycles_per_ms() |
| |
| # check that allocations are re-used after deletion |
| t = torch.FloatTensor([1]).pin_memory() |
| ptr = t.data_ptr() |
| del t |
| t = torch.FloatTensor([1]).pin_memory() |
| self.assertEqual(t.data_ptr(), ptr, 'allocation not reused') |
| |
| # check that the allocation is not re-used if it's in use by a copy
| gpu_tensor = torch.cuda.FloatTensor([0]) |
| torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy |
| gpu_tensor.copy_(t, non_blocking=True) |
| del t |
| t = torch.FloatTensor([1]).pin_memory() |
| self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon') |
| self.assertEqual(list(gpu_tensor), [1]) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| @skipIfRocm |
| def test_caching_pinned_memory_multi_gpu(self): |
| # checks that the events preventing pinned memory from being re-used |
| # too early are recorded on the correct GPU |
| cycles_per_ms = get_cycles_per_ms() |
| |
| t = torch.FloatTensor([1]).pin_memory() |
| ptr = t.data_ptr() |
| gpu_tensor0 = torch.cuda.FloatTensor([0], device=0) |
| gpu_tensor1 = torch.cuda.FloatTensor([0], device=1) |
| |
| with torch.cuda.device(1): |
| torch.cuda._sleep(int(50 * cycles_per_ms)) # delay the copy |
| gpu_tensor1.copy_(t, non_blocking=True) |
| |
| del t |
| t = torch.FloatTensor([2]).pin_memory() |
| self.assertNotEqual(t.data_ptr(), ptr, 'allocation re-used too soon') |
| |
| with torch.cuda.device(0): |
| gpu_tensor0.copy_(t, non_blocking=True) |
| |
| self.assertEqual(gpu_tensor1[0], 1) |
| self.assertEqual(gpu_tensor0[0], 2) |
| |
| def test_sum_cpu_gpu_mismatch(self): |
| x = torch.randn(20, dtype=torch.float32, device='cuda') |
| y = torch.randn(1, dtype=torch.float32) |
| with self.assertRaisesRegex(RuntimeError, 'expected type' |
| ' torch.FloatTensor but got' |
| ' torch.cuda.FloatTensor'): |
| torch.sum(x, dim=[0], dtype=torch.float32, out=y) |
| # making sure half-to-float promotion is also working properly.
| x = x.half() |
| with self.assertRaisesRegex(RuntimeError, 'expected type' |
| ' torch.FloatTensor but got' |
| ' torch.cuda.HalfTensor'): |
| torch.sum(x, dim=[0], dtype=torch.float32, out=y) |
| |
| @skipIfRocm |
| def test_sum_noncontig(self): |
| x = torch.randn(1, 75, 57, 20, device='cuda').permute(0, 3, 1, 2) |
| y = x.cpu() |
| self.assertEqual(x.sum().cpu(), y.sum()) |
| self.assertEqual(x.sum(dim=(-1, -2)).cpu(), y.sum(dim=(-1, -2))) |
| self.assertEqual(x.sum(dim=(1, 3)).cpu(), y.sum(dim=(1, 3))) |
| |
| def test_sum_fp16(self): |
| x = torch.zeros(10, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(), 0) |
| |
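| # 65504 is the largest finite fp16 value, so this sum is still exactly
| # representable without promoting the accumulation dtype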
| x = torch.ones(65504, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(), 65504) |
| self.assertEqual(x.sum(dtype=torch.float32), 65504) |
| |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum(dtype=torch.float32), 65536) |
| |
| a = torch.zeros(1203611).bernoulli_(0.0005) |
| x = a.to(device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum().item(), a.sum().item()) |
| |
| a = torch.zeros(100, 121, 80).bernoulli_(0.0005) |
| x = a.to(device='cuda', dtype=torch.float16) |
| self.assertEqual(x.sum((0, 2)).float().cpu(), a.sum((0, 2))) |
| |
| def test_mean_fp16(self): |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.mean(), 1) |
| |
| x = torch.ones(65536, device='cuda', dtype=torch.float16) |
| self.assertEqual(x.mean(dtype=torch.float32), 1) |
| |
| def test_prod_large(self): |
| # tests global reduction (should_global_reduce = true) in the case of a non-zero identity element
| x = torch.ones(240000, device='cuda', dtype=torch.float32) |
| self.assertEqual(x.prod(), 1) |
| |
| @staticmethod |
| def _select_broadcastable_dims(dims_full=None): |
| return _TestTorchMixin._select_broadcastable_dims(dims_full) |
| |
| @skipIfRocm |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_inverse(self): |
| _TestTorchMixin._test_inverse(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_pinverse(self): |
| _TestTorchMixin._test_pinverse(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_matrix_rank(self): |
| _TestTorchMixin._test_matrix_rank(self, lambda x: x.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_matrix_power(self): |
| _TestTorchMixin._test_matrix_power(self, conv_fn=lambda t: t.cuda()) |
| |
| def test_chain_matmul(self): |
| _TestTorchMixin._test_chain_matmul(self, cast=lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_det_logdet_slogdet(self): |
| _TestTorchMixin._test_det_logdet_slogdet(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_gesv(self): |
| _TestTorchMixin._test_gesv(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_gesv_batched(self): |
| _TestTorchMixin._test_gesv_batched(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_gesv_batched_dims(self): |
| _TestTorchMixin._test_gesv_batched_dims(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_cholesky_solve(self): |
| _TestTorchMixin._test_cholesky_solve(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_cholesky_solve_batched(self): |
| _TestTorchMixin._test_cholesky_solve_batched(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_cholesky_solve_batched_dims(self): |
| _TestTorchMixin._test_cholesky_solve_batched_dims(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_cholesky(self): |
| _TestTorchMixin._test_cholesky(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_cholesky_batched(self): |
| _TestTorchMixin._test_cholesky_batched(self, lambda t: t.cuda()) |
| |
| def test_view(self): |
| _TestTorchMixin._test_view(self, lambda t: t.cuda()) |
| |
| def test_flip(self): |
| _TestTorchMixin._test_flip(self, use_cuda=True) |
| |
| def test_rot90(self): |
| _TestTorchMixin._test_rot90(self, use_cuda=True) |
| |
| def test_signal_window_functions(self): |
| _TestTorchMixin._test_signal_window_functions(self, device=torch.device('cuda')) |
| |
| @skipIfRocm |
| def test_fft_ifft_rfft_irfft(self): |
| _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) |
| |
| @contextmanager |
| def plan_cache_max_size(n): |
| original = torch.backends.cuda.cufft_plan_cache.max_size |
| torch.backends.cuda.cufft_plan_cache.max_size = n |
| yield |
| torch.backends.cuda.cufft_plan_cache.max_size = original |
| |
| with plan_cache_max_size(max(1, torch.backends.cuda.cufft_plan_cache.size - 10)): |
| _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) |
| |
| with plan_cache_max_size(0): |
| _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) |
| |
| torch.backends.cuda.cufft_plan_cache.clear() |
| |
| # check that it still works after clearing the cache
| with plan_cache_max_size(10): |
| _TestTorchMixin._test_fft_ifft_rfft_irfft(self, device=torch.device('cuda')) |
| |
| with self.assertRaisesRegex(RuntimeError, r"must be non-negative"): |
| torch.backends.cuda.cufft_plan_cache.max_size = -1 |
| |
| with self.assertRaisesRegex(RuntimeError, r"read-only property"): |
| torch.backends.cuda.cufft_plan_cache.size = -1 |
| |
| def test_stft(self): |
| _TestTorchMixin._test_stft(self, device=torch.device('cuda')) |
| |
| def test_multinomial(self): |
| _TestTorchMixin._test_multinomial(self, torch.cuda.FloatTensor) |
| |
| # Test two corner cases from older PyTorch (Issue #4858) |
| freqs = torch.cuda.FloatTensor([ |
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, |
| 0.03178183361887932, 0.027680952101945877, 0.033176131546497345, |
| 0.046052902936935425, 0.07742464542388916, 0.11543981730937958, |
| 0.14148041605949402, 0.15784293413162231, 0.13180233538150787, |
| 0.08271478116512299, 0.049702685326337814, 0.027557924389839172, |
| 0.018125897273421288, 0.011851548217236996, 0.010252203792333603, |
| 0.007422595750540495, 0.005372154992073774, 0.0045109698548913, |
| 0.0036087757907807827, 0.0035267581697553396, 0.0018864056328311563, |
| 0.0024605290964245796, 0.0022964938543736935, 0.0018453967059031129, |
| 0.0010662291897460818, 0.0009842115687206388, 0.00045109697384759784, |
| 0.0007791675161570311, 0.00020504408166743815, 0.00020504408166743815, |
| 0.00020504408166743815, 0.00012302644609007984, 0.0, |
| 0.00012302644609007984, 4.100881778867915e-05, 0.0, 0.0, 0.0, 0.0, |
| 0.0, 0.0]) |
| |
| torch.cuda.manual_seed(11042) |
| sample = torch.multinomial(freqs, 1000, True) |
| self.assertNotEqual(freqs[sample].min(), 0) |
| |
| p = torch.zeros(3421, 2, device="cuda", dtype=torch.float) |
| p[:, 1] = 1 |
| torch.cuda.manual_seed(5214) |
| r = torch.multinomial(p, 1) |
| self.assertNotEqual(r.min().item(), 0) |
| |
| # test corner case from Issue #13867 |
| torch.cuda.manual_seed(33) |
| probs = torch.randn(1000000, device='cuda').clamp(min=0) * 3e-5 |
| samples = probs.multinomial(1000000, replacement=True) |
| self.assertGreater(probs[samples].min().item(), 0) |
| |
| @staticmethod |
| def mute(): |
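| # redirect stderr to /dev/null in the spawned worker so expected error
| # output (e.g. device-side assert messages) does not clutter the test log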
| os.dup2(os.open(os.devnull, os.O_WRONLY), sys.stderr.fileno()) |
| |
| def _spawn_method(self, method, arg): |
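| # run `method` in a spawned subprocess: a device-side assert leaves the
| # CUDA context unusable, so it must not be triggered in this process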
| ctx = mp.get_context("spawn") |
| with ctx.Pool(1, initializer=self.mute) as pool: |
| errors = pool.map(method, [arg]) |
| for e in errors: |
| if 'device-side assert triggered' not in str(e): |
| self.fail(e) |
| |
| @staticmethod |
| def _test_multinomial_invalid_probs_cuda(probs): |
| try: |
| with torch.random.fork_rng(devices=[0]): |
| torch.multinomial(probs.to('cuda'), 2) |
| torch.cuda.synchronize() |
| return False # Should not be reached |
| except RuntimeError as e: |
| return e |
| |
| @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ |
| don't support multiprocessing with spawn start method") |
| @unittest.skipIf(IS_WINDOWS, 'FIXME: CUDA OOM error on Windows') |
| @unittest.skipIf(not PY3, |
| "spawn start method is not supported in Python 2, \ |
| but we need it for creating another process with CUDA") |
| def test_multinomial_invalid_probs_cuda(self): |
| test_method = TestCuda._test_multinomial_invalid_probs_cuda |
| self._spawn_method(test_method, torch.Tensor([1, -1, 1])) |
| self._spawn_method(test_method, torch.Tensor([1, inf, 1])) |
| self._spawn_method(test_method, torch.Tensor([1, -inf, 1])) |
| self._spawn_method(test_method, torch.Tensor([1, 1, nan])) |
| self._spawn_method(test_method, torch.Tensor([0, 1, 0])) |
| |
| def test_broadcast(self): |
| _TestTorchMixin._test_broadcast(self, lambda t: t.cuda()) |
| |
| def test_contiguous(self): |
| _TestTorchMixin._test_contiguous(self, lambda t: t.cuda()) |
| |
| def test_broadcast_fused_matmul(self): |
| _TestTorchMixin._test_broadcast_fused_matmul(self, lambda t: t.cuda()) |
| |
| def test_broadcast_batched_matmul(self): |
| _TestTorchMixin._test_broadcast_batched_matmul(self, lambda t: t.cuda()) |
| |
| def test_index(self): |
| _TestTorchMixin._test_index(self, lambda t: t.cuda()) |
| |
| def test_advancedindex(self): |
| _TestTorchMixin._test_advancedindex(self, lambda t: t.cuda()) |
| |
| def test_advancedindex_mixed_cpu_cuda(self): |
| def test(x, ia, ib): |
| # test getitem |
| self.assertEqual(x[:, ia, None, ib, 0].cpu(), |
| x.cpu()[:, ia.cpu(), None, ib.cpu(), 0]) |
| self.assertEqual(x[ia], x.cpu()[ia.cpu()]) |
| # test setitem |
| x_clone1 = x.clone() |
| x_clone2 = x.clone() |
| first_shape = x[:, ia, None, ib, 0].shape |
| second_shape = x[ia].shape |
| x_clone1[:, ia, None, ib, 0] = torch.randn(first_shape).to(x_clone1) |
| x_clone2[ia] = torch.randn(second_shape).to(x_clone2) |
| |
| cpu = torch.device('cpu') |
| for device in ['cuda:0', 'cuda:1'] if torch.cuda.device_count() > 1 else ['cuda']: |
| # Index cpu tensor with cuda tensor |
| x = torch.randn(3, 4, 4, 4, 3) |
| ia = torch.tensor([0, 2, 1]).to(device) |
| ib = torch.tensor([0, 2, 1]).to(device) |
| test(x, ia, ib) |
| |
| # Index cuda tensor with cpu tensor |
| x = x.to(device) |
| ia = ia.to(cpu) |
| ib = ib.to(cpu) |
| test(x, ia, ib) |
| |
| # Index cpu tensor with mixed cpu, cuda tensors |
| x = x.to(cpu) |
| ia = ia.to(cpu) |
| ib = ib.to(device) |
| test(x, ia, ib) |
| |
| # Index cuda tensor with mixed cpu, cuda tensors |
| x = x.to(device) |
| ia = ia.to(cpu) |
| ib = ib.to(device) |
| test(x, ia, ib) |
| |
| if torch.cuda.device_count() > 1: |
| other_device = 'cuda:0' if device != 'cuda:0' else 'cuda:1' |
| # Index cuda tensor with mixed cpu, cuda tensors on different devices |
| x = x.to(device) |
| ia = ia.to(cpu) |
| ib = ib.to(other_device) |
| test(x, ia, ib) |
| |
| def test_advancedindex_big(self): |
| _TestTorchMixin._test_advancedindex_big(self, lambda t: t.cuda()) |
| |
| @skipIfRocm |
| def test_btrifact(self): |
| _TestTorchMixin._test_btrifact(self, lambda t: t.cuda()) |
| |
| @skipIfRocm |
| def test_btrisolve(self): |
| _TestTorchMixin._test_btrisolve(self, lambda t: t.cuda()) |
| |
| @skipIfRocm |
| def test_btriunpack(self): |
| _TestTorchMixin._test_btriunpack(self, lambda t: t.cuda()) |
| |
| def test_dim_reduction(self): |
| _TestTorchMixin._test_dim_reduction(self, lambda t: t.cuda()) |
| |
| def test_tensor_gather(self): |
| _TestTorchMixin._test_gather(self, lambda t: t.cuda(), False) |
| |
| def test_tensor_scatter(self): |
| _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', test_bounds=False) |
| |
| def test_tensor_scatterAdd(self): |
| _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_add_', test_bounds=False) |
| |
| def test_tensor_scatterFill(self): |
| _TestTorchMixin._test_scatter_base(self, lambda t: t.cuda(), 'scatter_', True, test_bounds=False) |
| |
| def test_min_max_inits(self): |
| # Testing if THC_reduceAll received the correct index initialization. |
| # This affects the result of THC_reduceAll operations at extreme values |
| x = torch.cuda.ByteTensor([0]) |
| y = torch.cuda.ByteTensor([255]) |
| expected = torch.cuda.LongTensor([0])[0] |
| |
| _, v = x.max(dim=0) |
| self.assertEqual(v, expected) |
| |
| _, v = y.min(dim=0) |
| self.assertEqual(v, expected) |
| |
| def test_max_with_inf(self): |
| _TestTorchMixin._test_max_with_inf(self, (torch.half, torch.float, torch.double), 'cuda') |
| |
| def test_min_with_inf(self): |
| _TestTorchMixin._test_min_with_inf(self, (torch.half, torch.float, torch.double), 'cuda') |
| |
| def test_rpow(self): |
| _TestTorchMixin._test_rpow(self, lambda x: x.cuda()) |
| |
| def test_int_pow(self): |
| _TestTorchMixin._test_int_pow(self, lambda x: x.cuda()) |
| |
| def test_remainder_overflow(self): |
| _TestTorchMixin._test_remainder_overflow(self, dtype=torch.int64, device='cuda') |
| |
| def test_var(self): |
| cpu_tensor = torch.randn(2, 3, 3) |
| gpu_tensor = cpu_tensor.cuda() |
| self.assertEqual(gpu_tensor.var(), cpu_tensor.var()) |
| self.assertEqual(gpu_tensor.var(1), cpu_tensor.var(1)) |
| self.assertEqual(gpu_tensor.var(2), cpu_tensor.var(2)) |
| self.assertEqual(gpu_tensor.std(), cpu_tensor.std()) |
| self.assertEqual(gpu_tensor.std(1), cpu_tensor.std(1)) |
| self.assertEqual(gpu_tensor.std(2), cpu_tensor.std(2))
| |
| cpu_tensor = torch.randn(100) |
| gpu_tensor = cpu_tensor.cuda() |
| self.assertEqual(gpu_tensor.var(), cpu_tensor.var()) |
| |
| def test_var_unbiased(self): |
| tensor = torch.randn(100).cuda() |
| self.assertEqual(tensor.var(0), tensor.var(0, unbiased=True)) |
| self.assertEqual(tensor.var(), tensor.var(unbiased=True)) |
| self.assertEqual(tensor.var(unbiased=False), tensor.var(0, unbiased=False)) |
| |
| tensor = torch.FloatTensor([1.0, 2.0]).cuda() |
| self.assertEqual(tensor.var(unbiased=True), 0.5) |
| self.assertEqual(tensor.var(unbiased=False), 0.25) |
| |
| tensor = torch.randn(100).cuda() |
| self.assertEqual(tensor.std(0), tensor.std(0, unbiased=True)) |
| self.assertEqual(tensor.std(), tensor.std(unbiased=True)) |
| self.assertEqual(tensor.std(unbiased=False), tensor.std(0, unbiased=False)) |
| |
| def test_var_large_input(self): |
| # Large, not-nice input |
| tensor_cpu = torch.randn(2 * 32 * 1024 + 1, 2, 67) |
| tensor_cuda = tensor_cpu.cuda() |
| |
| self.assertEqual(tensor_cpu.var(2), tensor_cuda.var(2).cpu()) |
| |
| def test_var_stability(self): |
| tensor = torch.FloatTensor([2281.5, 2281.25]).cuda() |
| |
| # Stability for inner dim |
| self.assertEqual(tensor.var(0), 0.03125) |
| |
| # General stability |
| self.assertEqual(tensor.var(), 0.03125) |
| |
| # Stability for outer dimensions |
| tensor = tensor.unsqueeze(1) |
| self.assertEqual(tensor.var(0), 0.03125) |
| |
| @skipIfRocm |
| def test_digamma(self): |
| def test(use_double=False): |
| cpu_tensor = torch.randn(10, 10, 10) |
| gpu_tensor = cpu_tensor.cuda() |
| zeros = torch.zeros(10, 10, 10) |
| if (use_double): |
| cpu_tensor = cpu_tensor.double() |
| gpu_tensor = gpu_tensor.double() |
| zeros = zeros.double() |
| cpu_out = cpu_tensor.digamma() |
| gpu_out = gpu_tensor.digamma() |
| norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out |
| self.assertEqual(norm_errors, zeros) |
| |
| test(True) |
| test(False) |
| |
| # Test float32 behavior near and at poles. |
| cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111, |
| -100.99999994, -1931.99999994, 0.000000111, |
| -0.000000111, 0, -1, -2, -931]) |
| expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan]) |
| gpu_tensor = cpu_tensor.cuda() |
| cpu_out = cpu_tensor.digamma() |
| gpu_out = gpu_tensor.digamma() |
| norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out |
| self.assertEqual(norm_errors, expected_errors) |
| |
| @skipIfRocm |
| def test_polygamma(self): |
| def test(use_double=False): |
| cpu_tensor = torch.randn(10, 10, 10) |
| gpu_tensor = cpu_tensor.cuda() |
| zeros = torch.zeros(10, 10, 10) |
| if (use_double): |
| cpu_tensor = cpu_tensor.double() |
| gpu_tensor = gpu_tensor.double() |
| zeros = zeros.double() |
| for n in [0, 1]: |
| cpu_out = cpu_tensor.polygamma(n) |
| gpu_out = gpu_tensor.polygamma(n) |
| norm_errors = (gpu_out - cpu_out.cuda()) / gpu_out |
| self.assertEqual(norm_errors, zeros) |
| |
| test(True) |
| test(False) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_symeig(self): |
| _TestTorchMixin._test_symeig(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_svd_no_singularvectors(self): |
| _TestTorchMixin._test_svd_no_singularvectors(self, lambda t: t.cuda()) |
| |
| def test_arange(self): |
| for t in ['IntTensor', 'LongTensor', 'FloatTensor', 'DoubleTensor']: |
| a = torch.cuda.__dict__[t]() |
| torch.arange(0, 10, out=a) |
| b = torch.__dict__[t]() |
| torch.arange(0, 10, out=b) |
| self.assertEqual(a, b.cuda()) |
| |
| def test_linspace(self): |
| a = torch.linspace(0, 10, 10, device='cuda') |
| b = torch.linspace(0, 10, 10) |
| self.assertEqual(a, b.cuda()) |
| |
| def test_logspace(self): |
| a = torch.logspace(1, 10, 10, device='cuda') |
| b = torch.logspace(1, 10, 10) |
| self.assertEqual(a, b.cuda()) |
| |
| def test_diagonal(self): |
| _TestTorchMixin._test_diagonal(self, dtype=torch.float32, device='cuda') |
| |
| def test_diagflat(self): |
| _TestTorchMixin._test_diagflat(self, dtype=torch.float32, device='cuda') |
| |
| @unittest.skipIf(not TEST_NUMPY, "NumPy not found") |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| @skipIfRocm |
| def test_norm(self): |
| _TestTorchMixin._test_norm(self, device='cuda') |
| |
| def test_dist(self): |
| _TestTorchMixin._test_dist(self, device='cuda') |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_geqrf(self): |
| _TestTorchMixin._test_geqrf(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected") |
| def test_trtrs(self): |
| _TestTorchMixin._test_trtrs(self, lambda t: t.cuda()) |
| |
| @unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected") |
| def test_get_set_rng_state_all(self): |
| states = torch.cuda.get_rng_state_all() |
| before0 = torch.cuda.FloatTensor(100, device=0).normal_() |
| before1 = torch.cuda.FloatTensor(100, device=1).normal_() |
| torch.cuda.set_rng_state_all(states) |
| after0 = torch.cuda.FloatTensor(100, device=0).normal_() |
| after1 = torch.cuda.FloatTensor(100, device=1).normal_() |
| self.assertEqual(before0, after0, 0) |
| self.assertEqual(before1, after1, 0) |
| |
| @skipIfRocm |
| def test_nvtx(self): |
| # Just making sure we can see the symbols |
| torch.cuda.nvtx.range_push("foo") |
| torch.cuda.nvtx.mark("bar") |
| torch.cuda.nvtx.range_pop() |
| |
| def test_randperm_cuda(self): |
| cuda = torch.device('cuda:0') |
| |
| # For small inputs, randperm is offloaded to CPU instead |
| with torch.random.fork_rng(devices=[0]): |
| res1 = torch.randperm(100, device=cuda) |
| res2 = torch.cuda.LongTensor() |
| torch.randperm(100, out=res2, device=cuda) |
| self.assertEqual(res1, res2, 0) |
| |
| with torch.random.fork_rng(devices=[0]): |
| res1 = torch.randperm(100000, device=cuda) |
| res2 = torch.cuda.LongTensor() |
| torch.randperm(100000, out=res2, device=cuda) |
| self.assertEqual(res1, res2, 0) |
| |
| with torch.random.fork_rng(devices=[0]): |
| res1 = torch.randperm(100, dtype=torch.half, device=cuda) |
| res2 = torch.cuda.HalfTensor() |
| torch.randperm(100, out=res2, device=cuda) |
| self.assertEqual(res1, res2, 0) |
| |
| with torch.random.fork_rng(devices=[0]): |
| res1 = torch.randperm(50000, dtype=torch.half, device=cuda) |
| res2 = torch.cuda.HalfTensor() |
| torch.randperm(50000, out=res2, device=cuda) |
| self.assertEqual(res1, res2, 0) |
| |
| # randperm of 0 elements is an empty tensor |
| res1 = torch.randperm(0, device=cuda) |
| res2 = torch.cuda.LongTensor(5) |
| torch.randperm(0, out=res2, device=cuda) |
| self.assertEqual(res1.numel(), 0) |
| self.assertEqual(res2.numel(), 0) |
| |
| def test_random_neg_values(self): |
| _TestTorchMixin._test_random_neg_values(self, use_cuda=True) |
| |
| def test_bincount_cuda(self): |
| _TestTorchMixin._test_bincount(self, device='cuda') |
| # ensure CUDA code coverage |
| input_size = (5000,) |
| w = torch.randn(input_size, device='cuda') |
| w_cpu = w.cpu() |
| # test shared memory impl |
| t = torch.randint(50, input_size, dtype=torch.int8, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| # test multi block memory impl |
| # see `THRESH_NUMBER_BINS_FOR_MULTI_BLOCK_MEM` in SummaryOps.cu |
| t = torch.randint(500, input_size, dtype=torch.int64, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| # test global memory impl |
| # see `THRESH_NUMBER_BINS_FOR_GLOBAL_MEM` in SummaryOps.cu |
| t = torch.randint(2000, input_size, dtype=torch.int64, device='cuda') |
| self.assertEqual(t.cpu().bincount(), t.bincount()) |
| self.assertEqual(t.cpu().bincount(w_cpu), t.bincount(w)) |
| |
| @skipIfRocm |
| def test_histc_cuda(self): |
| _TestTorchMixin._test_histc(self, device='cuda') |
| |
| def test_tiny_half_norm_(self): |
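| # the squares of these tiny values underflow in fp16, so the norm should
| # still come out positive rather than collapsing to zero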
| a = torch.arange(25).cuda().float() |
| a /= 100000000 |
| b = a.half() |
| self.assertGreater(b.norm().item(), 0) |
| |
| def test_norm_type_conversion(self): |
| a = torch.ones(65536).cuda().half() |
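| # the 0-"norm" counts nonzero entries; 65536 exceeds the largest finite
| # fp16 value, so the result is only exact when returned as float32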
| self.assertEqual(a.norm(p=0, dtype=torch.float32), 65536) |
| |
| @skipIfRocm |
| # Test that wrap_with_cuda_memory_check successfully detects a leak
| def test_cuda_memory_leak_detection(self): |
| l = [] |
| |
| @self.wrap_with_cuda_memory_check |
| def no_leak(): |
| pass |
| |
| @self.wrap_with_cuda_memory_check |
| def leak_gpu0(): |
| l.append(torch.tensor(10, device=torch.device("cuda:0"))) |
| |
| no_leak() |
| |
| with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 0"): |
| leak_gpu0() |
| |
| if TEST_MULTIGPU: |
| @self.wrap_with_cuda_memory_check |
| def leak_gpu1(): |
| l.append(torch.tensor(10, device=torch.device("cuda:1"))) |
| |
| with self.assertRaisesRegex(AssertionError, r"leaked \d+ bytes CUDA memory on device 1"): |
| leak_gpu1() |
| |
| def test_cuda_memory_leak_detection_propagates_errors(self): |
| with self.assertRaisesRegex(RuntimeError, r"The size of tensor a \(3\) must match"): |
| with self.assertLeaksNoCudaTensors(): |
| x = torch.randn(3, 1, device='cuda') |
| y = torch.randn(2, 1, device='cuda') |
| z = x + y |
| |
| def test_trilu_indices(self): |
| for test_args in tri_tests_args: |
| _compare_trilu_indices(self, *test_args, device='cuda') |
| |
| # test default options |
| x = torch.ones( |
| 3, 3, dtype=torch.long, device='cuda', layout=torch.strided) |
| self.assertEqual( |
| x.tril(0).nonzero().transpose(0, 1), |
| torch.tril_indices(3, 3, device='cuda')) |
| self.assertEqual( |
| x.triu(0).nonzero().transpose(0, 1), |
| torch.triu_indices(3, 3, device='cuda')) |
| |
| def test_large_trilu_indices(self): |
| for test_args in tri_large_tests_args: |
| _compare_large_trilu_indices(self, *test_args, device='cuda') |
| |
| def test_triu_tril(self): |
| _TestTorchMixin._test_triu_tril(self, lambda t: t.cuda()) |
| |
| |
| def load_ignore_file(): |
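| # populate the global `ignores` set from data/test_cuda_ignores.txt;
| # entries list <tensor type>.<method> combinations that generate_tests
| # should skip (lines starting with '#' are comments)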
| from os.path import join, dirname |
| global ignores |
| path = join(dirname(__file__), 'data', 'test_cuda_ignores.txt') |
| with open(path, 'r') as f: |
| ignores = {l for l in f.read().splitlines() if not l.startswith('#')} |
| |
| |
| def generate_tests(): |
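| # build one CPU-vs-GPU comparison test per (declaration, tensor type,
| # inplace) combination from the `tests` table and attach it to TestCuda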
| for decl in tests: |
| for t in types: |
| tensor = t() |
| |
| # Default values |
| desc = '' |
| type_subset = types |
| no_inplace = False |
| decorator = None |
| if len(decl) == 3: |
| name, constr, arg_constr = decl |
| elif len(decl) == 4: |
| name, constr, arg_constr, desc = decl |
| elif len(decl) == 5: |
| name, constr, arg_constr, desc, type_subset = decl |
| elif len(decl) == 6: |
| name, constr, arg_constr, desc, type_subset, no_inplace = decl |
| elif len(decl) == 7: |
| name, constr, arg_constr, desc, type_subset, no_inplace, decorator = decl |
| |
| if t not in type_subset: |
| continue |
| if TEST_WITH_ROCM and decorator is not None: |
| if (isinstance(decorator, str)): |
| tensor_type_name = str(t.__name__) |
| decorator_list = decorator.split(":") |
| skip_type_list = decorator_list[1].split(",") |
| if (("ByteTensor" in skip_type_list) and tensor_type_name == "ByteTensor") \ |
| or (("CharTensor" in skip_type_list) and tensor_type_name == "CharTensor") \ |
| or (("DoubleTensor" in skip_type_list) and tensor_type_name == "DoubleTensor") \ |
| or (("FloatTensor" in skip_type_list) and tensor_type_name == "FloatTensor") \ |
| or (("HalfTensor" in skip_type_list) and tensor_type_name == "HalfTensor") \ |
| or (("IntTensor" in skip_type_list) and tensor_type_name == "IntTensor") \ |
| or (("LongTensor" in skip_type_list) and tensor_type_name == "LongTensor") \ |
| or (("ShortTensor" in skip_type_list) and tensor_type_name == "ShortTensor"): |
| decorator = skipIfRocm |
| else: |
| decorator = None |
| elif ((not TEST_WITH_ROCM) and (decorator is not None)): |
| if (isinstance(decorator, str)): |
| decorator = None |
| |
| precision = custom_precision.get(name, TestCuda.precision) |
| if is_half(t): |
| precision = custom_half_precision.get(name, precision) |
| |
| for inplace in (True, False): |
| if inplace and no_inplace: |
| continue |
| if inplace: |
| name_inner = name + '_' |
| else: |
| name_inner = name |
| |
| if t != torch.HalfTensor and not hasattr(tensor, name_inner): |
| # torch.HalfTensor doesn't support most operations, |
| # but we use torch.FloatTensor as the CPU baseline
| continue |
| full_name = '{}.{}'.format(tensor.type(), name_inner) |
| if full_name in ignores: |
| continue |
| |
| test_name = 'test_' + t.__name__ + '_' + name_inner |
| if desc: |
| test_name += '_' + desc |
| |
| assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name |
| |
| test_fn = compare_cpu_gpu(constr, arg_constr, name_inner, t, precision) |
| |
| if decorator is not None: |
| test_fn = decorator(test_fn) |
| |
| setattr(TestCuda, test_name, test_fn) |
| |
| |
| if __name__ == '__main__': |
| if TEST_CUDA: |
| load_ignore_file() |
| generate_tests() |
| |
| run_tests() |