| // Implements the math functions for GPU. |
| |
| #define EIGEN_USE_GPU |
| |
| #include "caffe2/utils/math.h" |
| |
| #include <limits> |
| #include <numeric> |
| #include <vector> |
| |
| #include <cub/block/block_reduce.cuh> |
| #include <cub/cub.cuh> |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/utils/conversions.h" |
| |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| #include "unsupported/Eigen/CXX11/Tensor" |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
| #if THRUST_VERSION >= 100800 |
| #define THRUST_SUPPORTS_PER_THREAD |
| #endif // THRUST_VERSION >= 100800 |
| |
| namespace caffe2 { |
| |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| template <typename T, int D> |
| using EigenTensorMap = Eigen::TensorMap<Eigen::Tensor<T, D>>; |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
| namespace math { |
| |
| #define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Funcname, function) \ |
| __global__ \ |
| void _Kernel_##T##_##Funcname(const int N, const T* x, T* y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| y[i] = function(x[i]); \ |
| } \ |
| } \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, const T* x, T* y, \ |
| CUDAContext* context) { \ |
| _Kernel_##T##_##Funcname<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, \ |
| 0, context->cuda_stream()>>>( \ |
| N, x, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf); |
| |
| __device__ float cuda_sqrf(const float x) { return x * x; } |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf); |
| |
| #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION |
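
// Illustrative usage of the generated elementwise delegates (a sketch only;
// the device pointers d_x, d_y and the context ctx below are hypothetical):
//
//   math::Exp<float, CUDAContext>(N, d_x, d_y, &ctx);  // y[i] = expf(x[i])
//   math::Sqr<float, CUDAContext>(N, d_x, d_y, &ctx);  // y[i] = x[i] * x[i]
//
// Each call launches a single CUDA_1D_KERNEL_LOOP kernel on ctx.cuda_stream().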
| |
| #define DELEGATE_SINCOS_CUDA_FUNCTION(T) \ |
| __global__ void _Kernel_##T##_##SinCos( \ |
| const int N, const T* x, T* ys, T* yc) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| sincos(x[i], ys + i, yc + i); \ |
| } \ |
| } \ |
| template <> \ |
| void SinCos<T, CUDAContext>( \ |
| const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ |
| _Kernel_##T##_##SinCos<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, x, ys, yc); \ |
| } |
| |
| DELEGATE_SINCOS_CUDA_FUNCTION(float) |
| DELEGATE_SINCOS_CUDA_FUNCTION(double) |
| |
| #undef DELEGATE_SINCOS_CUDA_FUNCTION |
| |
| #define DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(T, Funcname, expr) \ |
| __global__ void _Kernel_##T##_##Funcname( \ |
| const int N, const T* a, const T* b, T* y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| float r = convert::To<T, float>(a[i]) expr convert::To<T, float>(b[i]); \ |
| y[i] = convert::To<float, T>(r); \ |
| } \ |
| } \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ |
| _Kernel_##T##_##Funcname<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, a, b, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Add, +); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(int32_t, Add, +); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Sub, -); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Mul, *); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float, Div, /); |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Add, +); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Sub, -); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Mul, *); |
| DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION(float16, Div, /); |
| |
| #undef DELEGATE_SIMPLE_CUDA_BINARY_INFIX_FUNCTION |
| |
| #define DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(T, Funcname, func) \ |
| __global__ void _Kernel_##T##_##Funcname( \ |
| const int N, const T* a, const T* b, T* y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| float r = \ |
| func(convert::To<T, float>(a[i]), convert::To<T, float>(b[i])); \ |
| y[i] = convert::To<float, T>(r); \ |
| } \ |
| } \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ |
| _Kernel_##T##_##Funcname<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, a, b, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION(float, ElemwiseMax, fmaxf); |
| |
#undef DELEGATE_SIMPLE_CUDA_BINARY_PREFIX_FUNCTION
| |
| #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, \ |
| const T* src, \ |
| T* dst, \ |
| Tensor<CUDAContext>* scratch_ptr, \ |
| CUDAContext* context) { \ |
| size_t memRequired = 0; \ |
| cub::DeviceReduce::func( \ |
| nullptr, memRequired, src, dst, N, context->cuda_stream()); \ |
| auto buffer_size = \ |
| static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); \ |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size}); \ |
| cub::DeviceReduce::func( \ |
| static_cast<void*>(scratch_ptr->mutable_data<T>()), \ |
| memRequired, \ |
| src, \ |
| dst, \ |
| N, \ |
| context->cuda_stream()); \ |
| } |
| |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min) |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int32_t, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) |
| |
| |
| #undef DELEGATE_REDUCTION_FUNCTION |
| |
| // Caffe2 gemm provides a simpler interface to the gemm functions, with the |
| // limitation that the data has to be contiguous in memory. |
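// For example, a row-major product C(M x N) = alpha * A(M x K) * B(K x N) +
// beta * C would be invoked roughly as follows (a sketch; d_A, d_B, d_C and
// ctx are hypothetical device pointers and context):
//
//   math::Gemm<float, CUDAContext>(
//       CblasNoTrans, CblasNoTrans, M, N, K, 1.f, d_A, d_B, 0.f, d_C, &ctx);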
| template <> |
| void Gemm<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
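  // Since cuBLAS is column-major while our data is row-major, we compute
  // C^T = B^T * A^T: a row-major M x N matrix is bit-identical to a
  // column-major N x M matrix, so passing B before A with swapped dimensions
  // yields the row-major product without an explicit transpose.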
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| N)); |
| } |
| |
| template <> |
| void Gemm<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_CHECK(cublasSgemmEx( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N)); |
| |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // convert alpha, beta from float -> __half |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| |
| // call cublasHgemm |
| CUBLAS_CHECK(cublasHgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha_fp16, |
| (const __half*)B, |
| ldb, |
| (const __half*)A, |
| lda, |
| &beta_fp16, |
| (__half*)C, |
| N)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| template <> |
| void GemmBatched<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| const int a_stride = M * K; |
| const int b_stride = K * N; |
| const int c_stride = M * N; |
| #if __CUDACC_VER_MAJOR__ < 8 |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| #else |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (TransA == CblasNoTrans) ? K : M; |
| const int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemmStridedBatched( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| b_stride, |
| A, |
| lda, |
| a_stride, |
| &beta, |
| C, |
| N, |
| c_stride, |
| batch_size)); |
| #endif |
| } |
| |
| namespace { |
| |
| __global__ void FloatToHalfKernel(const int N, const float* X, half* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = __float2half(X[i]); |
| } |
| } |
| |
| __global__ void HalfToFloatKernel(const int N, const half* X, float* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = __half2float(X[i]); |
| } |
| } |
| |
| }; |
| |
| template <> |
| void GemmBatched<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| const int a_stride = M * K; |
| const int b_stride = K * N; |
| const int c_stride = M * N; |
| #if __CUDACC_VER_MAJOR__ < 8 |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float16, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| #else |
  // Three dispatch paths:
  // 1) scratch != nullptr: cast inputs to fp32, run SgemmStridedBatched, and
  //    cast the result back to fp16.
  // 2) math_type == FLOAT, scratch == nullptr: loop over the batch with
  //    SgemmEx.
  // 3) math_type == FLOAT16, scratch == nullptr: a single batched Hgemm.
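  // Passing a scratch tensor trades extra memory (fp32 copies of A, B, and C)
  // for a single strided-batched fp32 GEMM; the scratch-free paths below avoid
  // that allocation at the cost of either a per-matrix loop (2) or fp16
  // arithmetic (3).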
| |
| if (scratch != nullptr) { |
| const int A_size = a_stride * batch_size; |
| const int B_size = b_stride * batch_size; |
| // cast, cublasSgemmStridedBatched, cast |
| size_t in_elems = A_size + B_size; |
| size_t out_elems = c_stride * batch_size; |
| |
| scratch->Resize(in_elems + out_elems); |
| float* scratch_ptr = scratch->mutable_data<float>(); |
| |
| float* A_fp32 = scratch_ptr; |
| float* B_fp32 = scratch_ptr + A_size; |
| float* C_fp32 = scratch_ptr + A_size + B_size; |
| |
| // cast A, B into fp32 |
| HalfToFloatKernel<<<CAFFE_GET_BLOCKS(A_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(A_size, (half*)A, A_fp32); |
| HalfToFloatKernel<<<CAFFE_GET_BLOCKS(B_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(B_size, (half*)B, B_fp32); |
| |
| // run fp32 batched Gemm |
| GemmBatched<float, CUDAContext>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A_fp32, |
| B_fp32, |
| beta, |
| C_fp32, |
| context); |
| |
| // cast result back to fp16 |
| FloatToHalfKernel<<< |
| CAFFE_GET_BLOCKS(batch_size * M * N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(batch_size * M * N, C_fp32, (half*)C); |
| } else { |
| if (math_type == TensorProto_DataType_FLOAT) { |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float16, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| const int lda = (TransA == CblasNoTrans) ? K : M; |
| const int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| |
| // convert alpha, beta from float -> __half |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| CUBLAS_ENFORCE(cublasHgemmStridedBatched( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha_fp16, |
| (const __half*)B, |
| ldb, |
| b_stride, |
| (const __half*)A, |
| lda, |
| a_stride, |
| &beta_fp16, |
| (__half*)C, |
| N, |
| c_stride, |
| batch_size)); |
| } |
| } |
| #endif |
| } |
| |
| #if CUDA_VERSION >= 9000 |
| |
// No behavioral change, but the specialization is required for engine
// dispatch; defer to the default CUDA engine.
| template <> |
| void Gemm<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
  return Gemm<float, CUDAContext>(
      TransA,
      TransB,
      M,
      N,
      K,
      alpha,
      A,
      B,
      beta,
      C,
      context,
      math_type);
| } |
| |
| template <> |
| void Gemm<float16, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| |
| // enable TensorCore for this call on this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE(cublasSetMathMode( |
| context->cublas_handle(), |
| CUBLAS_TENSOR_OP_MATH)); |
| } |
| |
| CUBLAS_CHECK(cublasGemmEx( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N, |
| CUDA_R_32F, |
| CUBLAS_GEMM_DFALT_TENSOR_OP)); |
| |
| // Now disable TensorCore math for subsequent calls to this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE(cublasSetMathMode( |
| context->cublas_handle(), |
| CUBLAS_DEFAULT_MATH)); |
| } |
| } |
| |
| template <> |
| void GemmBatched<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| return GemmBatched<float, CUDAContext, DefaultEngine>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| scratch, |
| math_type); |
| } |
| |
| template <> |
| void GemmBatched<float16, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| return GemmBatched<float16, CUDAContext, DefaultEngine>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| scratch, |
| math_type); |
| } |
| |
| #endif // CUDA_VERSION >= 9000 |
| |
| template <> |
| void GemmEx<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CUDAContext* context) { |
| // Note that cublas follows fortran order, so the order is different from |
| // the cblas convention. |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| ldc)); |
| } |
| |
| template <> |
| void Gemv<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
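  // cuBLAS expects column-major storage; a row-major M x N matrix A is a
  // column-major N x M matrix, so the transpose flag is flipped and the
  // dimensions are passed as (N, M) with leading dimension N below.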
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| CUBLAS_ENFORCE(cublasSgemv( |
| context->cublas_handle(), |
| cuTransA, |
| N, |
| M, |
| &alpha, |
| A, |
| N, |
| x, |
| 1, |
| &beta, |
| y, |
| 1)); |
| } |
| |
| // Batched Add variants |
| namespace { |
| |
| template <typename T> |
| __global__ void AddStripedBatchKernel( |
| const int N, |
| const T* first, |
| T* Y, |
| const int stripe, |
| const int batch) { |
| for (int j = 0; j < batch; j++) { |
| const T* x = first + j * stripe; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| float tmpY = convert::To<T, float>(Y[i]); |
| tmpY += convert::To<T,float>(x[i]); |
| Y[i] = convert::To<float,T>(tmpY); |
| } |
| } |
| } |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ |
| template <> \ |
| void AddStripedBatch<T, CUDAContext>( \ |
| const int N, \ |
| const T* first, \ |
| T* Y, \ |
| const int stripe, \ |
| const int batch, \ |
| CUDAContext* context) { \ |
| AddStripedBatchKernel<T><<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, first, Y, stripe, batch); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float); |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); |
| #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH |
| |
| template <> |
| void Gemv<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const int M, |
| const int N, |
| const float alpha, |
| const float16* A, |
| const float16* x, |
| const float beta, |
| float16* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| |
  // Work out the dimensions needed to call cublasSgemmEx / cublasHgemm.
| int m = (cuTransA == CUBLAS_OP_N) ? N : M; |
| int k = (cuTransA == CUBLAS_OP_N) ? M : N; |
| int LDA = (cuTransA == CUBLAS_OP_N) ? m : k; |
| int LDC = m; |
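  // The fp16 GEMV is expressed as a GEMM with a single column:
  // y (m x 1) = op(A) (m x k) * x (k x 1), hence m, k, LDA and LDC are derived
  // from the already-flipped cuTransA above.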
| |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_CHECK(cublasSgemmEx( |
| context->cublas_handle(), |
| cuTransA, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| &alpha, |
| A, |
| CUDA_R_16F, |
| LDA, |
| x, |
| CUDA_R_16F, |
| k, |
| &beta, |
| y, |
| CUDA_R_16F, |
| LDC)); |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| |
| CUBLAS_CHECK(cublasHgemm( |
| context->cublas_handle(), |
| cuTransA, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| &alpha_fp16, |
| (const __half*)A, |
| LDA, |
| (const __half*)x, |
| k, |
| &beta_fp16, |
| (__half*)y, |
| LDC)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void SetKernel(const int N, const T alpha, T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = alpha; |
| } |
| } |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ |
| template <> \ |
| void Set<T, CUDAContext>( \ |
| const size_t N, const T alpha, T* Y, CUDAContext* context) { \ |
| SetKernel<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, alpha, Y); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_SET(float); |
| CAFFE2_SPECIALIZED_CUDA_SET(double); |
| CAFFE2_SPECIALIZED_CUDA_SET(bool); |
| CAFFE2_SPECIALIZED_CUDA_SET(int8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(int16_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(float16); |
| CAFFE2_SPECIALIZED_CUDA_SET(int); |
| CAFFE2_SPECIALIZED_CUDA_SET(int64_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(char); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); |
| #undef CAFFE2_SPECIALIZED_CUDA_SET |
| |
| namespace { |
| template <typename T> |
| __global__ void |
| UniformShift(const size_t N, const float min, const float max, T* x) { |
| float scale = max - min; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min); |
| } |
| } |
| |
| __global__ void |
| UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { |
| int* x_int = reinterpret_cast<int*>(x); |
| int range = (max - min + 1); |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x_int[i] = min + static_cast<int>(x[i] % range); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void RandUniform<float, CUDAContext>( |
| const size_t n, |
| const float min, |
| const float max, |
| float* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerateUniform(context->curand_generator(), r, n)); |
| UniformShift<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<double, CUDAContext>( |
| const size_t n, |
| const double min, |
| const double max, |
| double* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE( |
| curandGenerateUniformDouble(context->curand_generator(), r, n)); |
| UniformShift<double> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<int, CUDAContext>( |
| const size_t n, |
| const int min, |
| const int max, |
| int* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerate( |
| context->curand_generator(), reinterpret_cast<unsigned int*>(r), n)); |
| UniformIntFit<<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| n, min, max, reinterpret_cast<unsigned int*>(r)); |
| } |
| |
| template <typename T> |
| size_t HandleOddLengthRandGaussian( |
| const size_t n, |
| const T mean, |
| const T std, |
| T* r, |
| CUDAContext* context) { |
| if (n % 2 == 1) { |
| std::default_random_engine generator; |
| std::normal_distribution<T> distribution(mean, std); |
| const T random_value = distribution(generator); |
| math::Set<T, CUDAContext>(1, random_value, r + (n - 1), context); |
| return n - 1; |
| } |
| return n; |
| } |
| |
| template <> |
| void RandGaussian<float, CUDAContext>( |
| const size_t n, |
| const float mean, |
| const float std, |
| float* r, |
| CUDAContext* context) { |
| // If n is odd, we add a random Gaussian value at the end manually |
| // and generate n-1 random values using curandGenerateNormal. |
| // curandGenerateNormal requires n to be even. |
| const size_t even_n = |
| HandleOddLengthRandGaussian<float>(n, mean, std, r, context); |
| CURAND_ENFORCE( |
| curandGenerateNormal(context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| void RandGaussian<double, CUDAContext>( |
| const size_t n, |
| const double mean, |
| const double std, |
| double* r, |
| CUDAContext* context) { |
| const size_t even_n = |
| HandleOddLengthRandGaussian<double>(n, mean, std, r, context); |
| CURAND_ENFORCE(curandGenerateNormalDouble( |
| context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| void Dot<float, CUDAContext>( |
| const int n, |
| const float* a, |
| const float* b, |
| float* y, |
| CUDAContext* context) { |
| float result; |
| CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result)); |
| context->Copy<float, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
| template <> |
| void Dot<float16, CUDAContext>( |
| const int n, |
| const float16* a, |
| const float16* b, |
| float16* y, |
| CUDAContext* context) { |
| float16 result; |
| // execute with 32-bit math |
| CUBLAS_CHECK(cublasDotEx( |
| context->cublas_handle(), |
| n, |
| a, |
| CUDA_R_16F, |
| 1, |
| b, |
| CUDA_R_16F, |
| 1, |
| &result, |
| CUDA_R_16F, |
| CUDA_R_32F)); |
| context->Copy<float16, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
// A previous version of Caffe2 used Thrust, but Thrust reductions perform an
// implicit scratch-space allocation and deallocation, which can interfere
// with NCCL and cause a deadlock. Hence we use a custom reduction here.
| #define SUM_KERNEL_NTHREADS 128 |
| template <typename T> |
| __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { |
| const int idx = threadIdx.x; |
| __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; |
| |
| reduction_buffer[idx] = 0; |
| |
| // A multilevel reduction. |
| // N -> 128 |
| if (!square) { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| reduction_buffer[idx] += convert::To<T, float>(X[i]); |
| } |
| } else { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| float Xi = convert::To<T, float>(X[i]); |
| reduction_buffer[idx] += Xi * Xi; |
| } |
| } |
| __syncthreads(); |
| // 128 -> 32 |
| if (idx < 32) { |
| reduction_buffer[idx] += |
| reduction_buffer[idx + 32] + |
| reduction_buffer[idx + 64] + |
| reduction_buffer[idx + 96]; |
| } |
| __syncthreads(); |
| // 32 -> 1 |
| if (idx == 0) { |
| float tmp = 0; |
| for (int i = 0; i < 32; ++i) { |
| tmp += reduction_buffer[i]; |
| } |
| *Y = convert::To<float, T>(tmp); |
| } |
| } |
| |
// According to the benchmark script
// caffe2/caffe2/experiments/python/device_reduce_sum_bench.py,
// device reduce is slower for N <= 10000.
| #define DEVICE_REDUCE_SIZE_THRESHOLD 10000 |
| |
| namespace { |
| |
| template <typename T> |
| __global__ void SumConvertKernel(float* sum, T* dest) { |
| *dest = convert::To<float, T>(*sum); |
| } |
| |
| template <typename T, typename IterT> |
| void SumGenericIter( |
| const int N, |
| IterT it, |
| T*& dest, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
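  // cub::DeviceReduce uses a two-pass protocol: the first call with a null
  // workspace only reports the required temporary storage size; the second
  // call performs the reduction using the resized scratch tensor as workspace.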
| size_t memRequired = 0; |
| cub::DeviceReduce::Sum( |
| nullptr, memRequired, it, dest, N, context->cuda_stream()); |
| auto buffer_size = |
| static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); |
| if (!dest) { |
| // allocate one more T at the end of scratch for dest |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size + 1}); |
| dest = scratch_ptr->template mutable_data<T>() + buffer_size; |
| } else { |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size}); |
| } |
| cub::DeviceReduce::Sum( |
| static_cast<void*>(scratch_ptr->template mutable_data<T>()), |
| memRequired, |
| it, |
| dest, |
| N, |
| context->cuda_stream()); |
| } |
| } // namespace |
| |
| template <> |
| void Sum<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<float>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| } |
| } |
| |
| template <> |
| void Sum<int32_t, CUDAContext>( |
| const int N, |
| const int32_t* x, |
| int32_t* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<int32_t>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| } |
| } |
| |
| namespace { |
| template <typename T> |
| struct FloatTransform { |
| inline __host__ __device__ float operator()(const T v) const { |
| return convert::To<T, float>(v); |
| } |
| }; |
| } // namespace |
| |
| #define CAFFE2_MATH_SUM_FUNC(T) \ |
| template <> \ |
| void Sum<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \ |
| x, transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, false); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUM_FUNC(float16) |
| #undef CAFFE2_MATH_SUM_FUNC |
| |
| namespace { |
| template <typename T> |
| struct SqrTransform { |
| inline __host__ __device__ T operator()(const T v) const { |
| return v * v; |
| } |
| }; |
| } // namespace |
| |
| template <> |
| void SumSqr<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SqrTransform<float> transform; |
| cub::TransformInputIterator<float, SqrTransform<float>, const float*> it( |
| x, transform); |
| SumGenericIter<float>(N, it, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, true); |
| } |
| } |
| |
| #define CAFFE2_MATH_SUMSQR_FUNC(T) \ |
| template <> \ |
| void SumSqr<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> float_transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> \ |
| float_it(x, float_transform); \ |
| SqrTransform<float> sqr_transform; \ |
| cub::TransformInputIterator< \ |
| float, \ |
| SqrTransform<float>, \ |
| decltype(float_it)> \ |
| it(float_it, sqr_transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, true); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUMSQR_FUNC(float16) |
| #undef CAFFE2_MATH_SUMSQR_FUNC |
| #undef DEVICE_REDUCE_SIZE_THRESHOLD |
| |
| namespace { |
| template <typename T> |
| __global__ void SelectKernel( |
| const int N, const int D, const T* x, const int* idx, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Select<float, CUDAContext>( |
| const int N, const int D, const float* x, const int* idx, float* y, |
| CUDAContext* context) { |
| SelectKernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(N, D, x, idx, y); |
| } |
| |
| template <> |
| void Select<float16, CUDAContext>( |
| const int N, |
| const int D, |
| const float16* x, |
| const int* idx, |
| float16* y, |
| CUDAContext* context) { |
| SelectKernel<float16><<< |
| CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, D, x, idx, y); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| // y[i] = convert::To<float,T>(convert::To<T, float>(x[i]) * alpha); |
| y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha); |
| } |
| } |
| |
| template <typename T> |
| __global__ void |
| ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = x[i] * (*alpha); |
| } |
| } |
| |
| template <typename T> |
| __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = powf(x[i], exponent); |
| } |
| } |
| |
| // fp16 specialization |
| template <> |
| __global__ void ScaleKernelDeviceAlpha( |
| const int n, |
| const float* alpha, |
| const float16* x, |
| float16* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = convert::To<float, float16>( |
| convert::To<float16, float>(x[i]) * (*alpha)); |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| void Powx<float, CUDAContext>( |
| const int N, |
| const float* a, |
| const float b, |
| float* y, |
| CUDAContext* context) { |
| PowKernel<<< |
| CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, a, b, y); |
| } |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| ScaleKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float16, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float16* x, |
| float16* y, |
| CUDAContext* context) { |
| ScaleKernel<float16><<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, const float* alpha, const float *x, float* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<float><<< |
| CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float16, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float16* x, |
| float16* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<float16><<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int N, |
| const float alpha, |
| const float* X, |
| float* Y, |
| CUDAContext* context) { |
| CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); |
| } |
| |
| template <> |
| void Axpy<double, CUDAContext>( |
| const int N, |
| const float alpha, |
| const double* X, |
| double* Y, |
| CUDAContext* context) { |
| double alpha_d{alpha}; |
| CUBLAS_ENFORCE( |
| cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1)); |
| } |
| |
| template <> |
| void Axpy<float16, CUDAContext>( |
| const int N, |
| const float alpha, |
| const float16* X, |
| float16* Y, |
| CUDAContext* context) { |
| CUBLAS_CHECK(cublasAxpyEx( |
| context->cublas_handle(), |
| N, |
| &alpha, |
| CUDA_R_16F, |
| X, |
| CUDA_R_16F, |
| 1, |
| Y, |
| CUDA_R_16F, |
| 1, |
| CUDA_R_32F)); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] = convert::Get<T>( |
| convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index])); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int n, const float* alpha, const float* X, |
| float* Y, CUDAContext* context) { |
| AxpyKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| template <> |
| void Axpy<float16, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float16* X, |
| float16* Y, |
| CUDAContext* context) { |
| AxpyKernel<float16><<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void AxpbyKernel(const int n, const T a, const T* x, |
| const T b, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] = x[index] * a + y[index] * b; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpby<float, CUDAContext>( |
| const int n, const float a, const float* x, const float b, float* y, |
| CUDAContext* context) { |
| AxpbyKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, a, x, b, y); |
| } |
| |
| namespace { |
| |
| template <typename T> |
| __global__ void im2col_gpu_kernel_nchw(const int n, const T* data_im, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_col) { |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| int w_out = index % width_col; |
| int h_index = index / width_col; |
| int h_out = h_index % height_col; |
| int channel_in = h_index / height_col; |
| int channel_out = channel_in * kernel_h * kernel_w; |
| int h_in = h_out * stride_h - pad_t; |
| int w_in = w_out * stride_w - pad_l; |
| T* data_col_ptr = data_col; |
| data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; |
| const T* data_im_ptr = data_im; |
| data_im_ptr += (channel_in * height + h_in) * width + w_in; |
| for (int i = 0; i < kernel_h; ++i) { |
| for (int j = 0; j < kernel_w; ++j) { |
| int h = h_in + i * dilation_h; |
| int w = w_in + j * dilation_w; |
| *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? |
| data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; |
| data_col_ptr += height_col * width_col; |
| } |
| } |
| } |
| } |
| |
| template <typename T> |
| __global__ void im2col_gpu_kernel_nhwc(const int n, const T* data_im, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int width_col, const int channels, |
| T* data_col) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| int channel_in = index % channels; |
| int w_out = index / channels % width_col; |
| int h_out = index / channels / width_col; |
| int h_in = h_out * stride_h - pad_t; |
| int w_in = w_out * stride_w - pad_l; |
| T* local_data_col = data_col + |
| ((h_out * width_col) + w_out) * channels * kernel_h * kernel_w |
| + channel_in; |
| for (int i = 0; i < dkernel_h; i += dilation_h) { |
| int h = h_in + i; |
| for (int j = 0; j < dkernel_w; j += dilation_w) { |
| int w = w_in + j; |
| *local_data_col = (h >= 0 && w >= 0 && h < height && w < width) ? |
| data_im[(h * width + w) * channels + channel_in] : 0; |
| local_data_col += channels; |
| } |
| } |
| } |
| } |
| |
| template <typename T> |
| __global__ void col2im_gpu_kernel_nchw(const int n, const T* data_col, |
| const int height, const int width, |
| const int patch_h, const int patch_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_im) { |
| |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| int w = index % width + pad_l; |
| int h = (index / width) % height + pad_t; |
| int c = index / (width * height); |
| |
| // compute the start and end of the output |
| int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| int w_col_end = min(w / stride_w + 1, width_col); |
| int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| int h_col_end = min(h / stride_h + 1, height_col); |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = (h - h_col * stride_h); |
| int w_k = (w - w_col * stride_w); |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| int data_col_index = |
| (((c * patch_h + h_k) * patch_w + w_k) * height_col + h_col) * |
| width_col + |
| w_col; |
| val += data_col[data_col_index]; |
| } |
| } |
| } |
| data_im[index] = val; |
| } |
| } |
| |
| template <typename T> |
| __global__ void col2im_gpu_kernel_nhwc(const int n, const T* data_col, |
| const int width, const int channels, |
| const int patch_h, const int patch_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_im) { |
| |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| int c = index % channels; |
| int w = index / channels % width + pad_l; |
| int h = index / channels / width + pad_t; |
| // compute the start and end of the output |
| int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| int w_col_end = min(w / stride_w + 1, width_col); |
| int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| int h_col_end = min(h / stride_h + 1, height_col); |
| int channels_col = patch_h * patch_w * channels; |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = h - h_col * stride_h; |
| int w_k = w - w_col * stride_w; |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| int c_col = (h_k * patch_w + w_k) * channels + c; |
| val += data_col[(h_col * width_col + w_col) * channels_col + c_col]; |
| } |
| } |
| } |
| data_im[index] = val; |
| } |
| } |
| |
| // Ported from caffe1 |
| template <typename T, int num_axes> |
| __global__ void im2col_nd_gpu_kernel( |
| const int n, |
| const T* data_im, |
| const int* im_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* pad, |
| const int* stride, |
| const int* dilation, |
| T* data_col) { |
| int d_offset[num_axes]; // NOLINT(runtime/arrays) |
| int d_iter[num_axes]; // NOLINT(runtime/arrays) |
| |
| __shared__ int shared_dilation[num_axes]; |
| __shared__ int shared_kernel_shape[num_axes]; |
| __shared__ int shared_pad[num_axes]; |
| __shared__ int shared_stride[num_axes]; |
| __shared__ int shared_col_shape[num_axes + 1]; |
| __shared__ int shared_im_shape[num_axes + 1]; |
| |
| if (threadIdx.x < num_axes) { |
| shared_dilation[threadIdx.x] = dilation[threadIdx.x]; |
| shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x]; |
| shared_pad[threadIdx.x] = pad[threadIdx.x]; |
| shared_stride[threadIdx.x] = stride[threadIdx.x]; |
| } |
| if (threadIdx.x < num_axes + 1) { |
| shared_col_shape[threadIdx.x] = col_shape[threadIdx.x]; |
| shared_im_shape[threadIdx.x] = im_shape[threadIdx.x]; |
| } |
| __syncthreads(); |
| |
| int i; |
| int kernel_size = 1; |
| for (i = 0; i < num_axes; ++i) { |
| kernel_size *= shared_kernel_shape[i]; |
| } |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| if (index >= col_shape[0]) { |
| break; |
| } |
    // Decompose the column index into per-axis kernel offsets (d_offset),
    // which are used below to compute the spatial indices.
| int offset = index; |
| for (i = num_axes - 1; i >= 0; --i) { |
| if (i < num_axes - 1) { |
| offset /= shared_kernel_shape[i + 1]; |
| } |
| d_offset[i] = offset % shared_kernel_shape[i]; |
| } |
| for (i = 0; i < num_axes; ++i) { |
| d_iter[i] = 0; |
| } |
| bool incremented; |
| do { |
| int index_col = index; |
| int index_im = index / kernel_size; |
| bool in_range = true; |
| for (i = 0; i < num_axes; ++i) { |
| const int d = d_iter[i]; |
| const int d_im = d * shared_stride[i] - shared_pad[i] + |
| d_offset[i] * shared_dilation[i]; |
| in_range &= (d_im >= 0 && d_im < shared_im_shape[i + 1]); |
| |
| index_col *= shared_col_shape[i + 1]; |
| index_col += d; |
| index_im *= shared_im_shape[i + 1]; |
| index_im += d_im; |
| } |
| if (in_range) { |
| // data_col[index_col] = 0; |
| data_col[index_col] = data_im[index_im]; |
| // T temp = data_im[index_im]; |
| } else { |
| data_col[index_col] = 0; |
| } |
| |
| incremented = false; |
| for (i = num_axes - 1; i >= 0; --i) { |
| // const int d_max = shared_kernel_shape[i]; |
| const int d_max = shared_col_shape[i + 1]; |
| if (d_iter[i] == d_max - 1) { |
| d_iter[i] = 0; |
| } else { // d_iter[i] < d_max - 1 |
| ++d_iter[i]; |
| incremented = true; |
| break; |
| } |
| } // for (int i = num_axes - 1; i >= 0; --i) |
| } while (incremented); // do |
| } // CUDA_KERNEL_LOOP(index, n) |
| } |
| |
| template <typename T, int num_axes> |
| __global__ void col2im_nd_gpu_kernel( |
| const int n, |
| const T* data_col, |
| const int* im_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* pad, |
| const int* stride, |
| const int* dilation, |
| T* data_im) { |
| int d_im[num_axes]; // NOLINT(runtime/arrays) |
| int d_col_iter[num_axes]; // NOLINT(runtime/arrays) |
| int d_col_start[num_axes]; // NOLINT(runtime/arrays) |
| int d_col_end[num_axes]; // NOLINT(runtime/arrays) |
| |
| __shared__ int shared_dilation[num_axes]; |
| __shared__ int shared_kernel_shape[num_axes]; |
| __shared__ int shared_pad[num_axes]; |
| __shared__ int shared_stride[num_axes]; |
| __shared__ int shared_col_shape[num_axes + 1]; |
| __shared__ int shared_im_shape[num_axes + 1]; |
| |
| if (threadIdx.x < num_axes) { |
| shared_dilation[threadIdx.x] = dilation[threadIdx.x]; |
| shared_kernel_shape[threadIdx.x] = kernel_shape[threadIdx.x]; |
| shared_pad[threadIdx.x] = pad[threadIdx.x]; |
| shared_stride[threadIdx.x] = stride[threadIdx.x]; |
| } |
| |
| if (threadIdx.x < num_axes + 1) { |
| shared_col_shape[threadIdx.x] = col_shape[threadIdx.x]; |
| shared_im_shape[threadIdx.x] = im_shape[threadIdx.x]; |
| } |
| __syncthreads(); |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
    // Decompose the flat index into the channel (c_im) and the per-axis image
    // coordinates (d_im), which are used below to compute the spatial indices.
| int c_im = index; |
| // Calculate d_im (image dimensions). |
| for (int i = num_axes - 1; i >= 0; --i) { |
| d_im[i] = c_im % shared_im_shape[i + 1] + shared_pad[i]; |
| c_im /= shared_im_shape[i + 1]; |
| } |
| // Calculate col start/end indices. |
| bool done = false; |
| for (int i = 0; i < num_axes; ++i) { |
| const int kernel_extent = |
| shared_dilation[i] * (shared_kernel_shape[i] - 1) + 1; |
| d_col_start[i] = d_col_iter[i] = (d_im[i] < kernel_extent) |
| ? 0 |
| : (d_im[i] - kernel_extent) / shared_stride[i] + 1; |
| d_col_end[i] = |
| min(d_im[i] / shared_stride[i] + 1, shared_col_shape[i + 1]); |
| if (d_col_start[i] >= d_col_end[i]) { |
| // Skip computation if the dimension is 0 at any spatial axis -- |
| // final val will be 0. |
| data_im[index] = 0; |
| done = true; |
| break; // for (int i = 0; i < num_axes; ++i) |
| } |
| } |
| if (done) { |
| continue; // CUDA_KERNEL_LOOP(index, n) |
| } |
| // Loop over the col to compute the output val. |
| T val = 0; |
| bool incremented = true; |
| bool skip = false; |
| do { |
| // Compute the final offset. |
| int final_offset = 0; |
| int kernel_shape_prod = 1; |
| int kernel_index; |
| for (int i = num_axes - 1; i >= 0; --i) { |
| kernel_index = d_im[i] - d_col_iter[i] * shared_stride[i]; |
| if (kernel_index % shared_dilation[i]) { |
| skip = true; |
| break; |
| } else { |
| kernel_index /= shared_dilation[i]; |
| final_offset += kernel_index * kernel_shape_prod; |
| kernel_shape_prod *= shared_kernel_shape[i]; |
| } |
| } |
| if (!skip) { |
| final_offset += kernel_shape_prod * c_im; |
| for (int i = 0; i < num_axes; ++i) { |
| final_offset *= shared_col_shape[i + 1]; |
| final_offset += d_col_iter[i]; |
| } |
| val += data_col[final_offset]; |
| } |
| skip = false; |
| incremented = false; |
| for (int i = num_axes - 1; i >= 0; --i) { |
| const int d_max = d_col_end[i]; |
| if (d_col_iter[i] == d_max - 1) { |
| d_col_iter[i] = d_col_start[i]; |
| } else { // d_col_iter[i] < d_max - 1 |
| ++d_col_iter[i]; |
| incremented = true; |
| break; // for (int i = num_axes - 1; i >= 0; --i) |
| } |
| } // for (int i = num_axes - 1; i >= 0; --i) |
| } while (incremented); |
| data_im[index] = val; |
| } // CUDA_KERNEL_LOOP(index, n) |
| } |
| |
| } // namespace |
| |
| template <> |
| void Im2col<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_im, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_col, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
  // We launch channels * height_col * width_col work items; each one copies
  // the kernel_h x kernel_w patch for a single output location of a single
  // channel into the column buffer.
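  // Output size follows the usual convolution arithmetic; e.g. height = 5,
  // pad_t = pad_b = 1, kernel_h = 3, dilation_h = 1, stride_h = 2 gives
  // dkernel_h = 3 and height_col = (5 + 1 + 1 - 3) / 2 + 1 = 3.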
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = channels * height_col * width_col; |
| // NOLINT_NEXT_LINE(whitespace/operators) |
| im2col_gpu_kernel_nchw<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_im, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, pad_t, pad_l, stride_h, stride_w, |
| height_col, width_col, data_col); |
| } |
| |
| template <> |
| void Im2col<float, CUDAContext, StorageOrder::NHWC>( |
| const float* data_im, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_col, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
  // We launch height_col * width_col * channels work items; each one copies
  // the kernel_h x kernel_w patch for a single output location of a single
  // channel into the column buffer.
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = height_col * width_col * channels; |
| // NOLINT_NEXT_LINE(whitespace/operators) |
| im2col_gpu_kernel_nhwc<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_im, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, pad_t, pad_l, stride_h, stride_w, |
| width_col, channels, data_col); |
| } |
| |
| |
| template <> |
| void Col2im<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_col, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_im, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = channels * height * width; |
  // To avoid atomic operations, we launch one work item per image (bottom)
  // element and let each one accumulate all overlapping column (top)
  // contributions.
| col2im_gpu_kernel_nchw<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_col, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, |
| pad_t, pad_l, stride_h, stride_w, |
| height_col, width_col, data_im); |
| } |
| |
| template <> |
| void Col2im<float, CUDAContext, StorageOrder::NHWC>( |
| const float* data_col, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_im, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = height * width * channels; |
  // To avoid atomic operations, we launch one work item per image (bottom)
  // element and let each one accumulate all overlapping column (top)
  // contributions.
| col2im_gpu_kernel_nhwc<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_col, width, channels, kernel_h, kernel_w, |
| dilation_h, dilation_w, |
| pad_t, pad_l, stride_h, stride_w, height_col, width_col, data_im); |
| } |
| |
| template <> |
| void Col2imNd<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_col, |
| const int* img_shape, |
| const int* col_shape, |
| const int img_size, |
| const int col_size, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const int N, |
| float* data_img, |
| CUDAContext* context) { |
| CAFFE_ENFORCE_LT( |
| N, CAFFE_CUDA_NUM_THREADS, "num_axes should be smaller than block size."); |
| |
| #define COL2IM_ND_KERNEL(n) \ |
| col2im_nd_gpu_kernel<float, n> /* NOLINT_NEXT_LINE(whitespace/operators) */ \ |
| <<<CAFFE_GET_BLOCKS(img_size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| img_size, \ |
| data_col, \ |
| img_shape, \ |
| col_shape, \ |
| kernel_shape, \ |
| pad, \ |
| stride, \ |
| dilation, \ |
| data_img) |
| |
| switch (N) { |
| case 1: |
| COL2IM_ND_KERNEL(1); |
| break; |
| case 2: |
| COL2IM_ND_KERNEL(2); |
| break; |
| case 3: |
| COL2IM_ND_KERNEL(3); |
| break; |
| case 4: |
| COL2IM_ND_KERNEL(4); |
| break; |
| case 5: |
| COL2IM_ND_KERNEL(5); |
| break; |
| default: |
| CAFFE_THROW( |
| "Col2imNd does not support computation with ", N, " spatial axes"); |
| } |
| } |
| |
| template <> |
| void Im2colNd<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_img, |
| const int* img_shape, |
| const int* col_shape, |
| const int img_size, |
| const int col_size, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const int N, |
| float* data_col, |
| CUDAContext* context, |
    bool /*accumulate_output*/) {
| CAFFE_ENFORCE_LT( |
| N, CAFFE_CUDA_NUM_THREADS, "num_axes should be smaller than block size."); |
| |
| #define IM2COL_ND_KERNEL(n) \ |
| im2col_nd_gpu_kernel<float, n> /* NOLINT_NEXT_LINE(whitespace/operators) */ \ |
| <<<CAFFE_GET_BLOCKS(col_size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| col_size, \ |
| data_img, \ |
| img_shape, \ |
| col_shape, \ |
| kernel_shape, \ |
| pad, \ |
| stride, \ |
| dilation, \ |
| data_col) |
| |
| switch (N) { |
| case 1: |
| IM2COL_ND_KERNEL(1); |
| break; |
| case 2: |
| IM2COL_ND_KERNEL(2); |
| break; |
| case 3: |
| IM2COL_ND_KERNEL(3); |
| break; |
| case 4: |
| IM2COL_ND_KERNEL(4); |
| break; |
| case 5: |
| IM2COL_ND_KERNEL(5); |
| break; |
| default: |
| CAFFE_THROW( |
| "Im2colNd does not support computation with ", N, " spatial axes"); |
| } |
| } |
| |
| template <> |
| void CopyMatrix<CUDAContext>( |
| const size_t itemsize, |
| const int M, |
| const int N, |
| const void* A, |
| const int lda, |
| void* B, |
| const int ldb, |
| CUDAContext* context, |
| TypeMeta::TypedCopy copy) { |
| CAFFE_ENFORCE( |
| !copy, |
| "CopyMatrix with a typed copy function is not supported in CUDA context"); |
| cudaMemcpy2DAsync( |
| B, |
| ldb * itemsize, |
| A, |
| lda * itemsize, |
| N * itemsize, |
| M, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| |
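| // Copies N floats device-to-device on the context's stream; no-op when |
| // src == dst or N == 0. |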
| template <> |
| void CopyVector<float, CUDAContext>( |
| const int N, |
| const float* src, |
| float* dst, |
| CUDAContext* context) { |
| if (src != dst && N > 0) { |
| cudaMemcpyAsync( |
| dst, |
| src, |
| sizeof(float) * N, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| } |
| |
| namespace { |
| |
| template <typename T> |
| using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>; |
| |
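| // Reduces each row of a row-major rows x cols matrix with cub::BlockReduce: |
| // every block walks over rows grid-stride style, and only thread 0 of the |
| // block writes the reduced value to Y. |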
| template <typename T, class Reducer> |
| __global__ void RowwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < rows; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < cols; j += blockDim.x) { |
| val = reducer(X[i * cols + j], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val; |
| } |
| __syncthreads(); |
| } |
| } |
| |
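| // Column-wise counterpart of RowwiseReduceKernel: the elements of a column |
| // are cols apart in memory, so the per-thread loads are strided. |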
| template <typename T, class Reducer> |
| __global__ void ColwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < cols; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < rows; j += blockDim.x) { |
| val = reducer(X[j * cols + i], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| } // namespace |
| |
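| // RowwiseMax / ColwiseMax: per-row and per-column maxima of an N x D matrix, |
| // implemented with the reduce kernels above using cub::Max as the reducer. |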
| #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ |
| template <> \ |
| void RowwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| RowwiseReduceKernel<<< \ |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX |
| |
| #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ |
| template <> \ |
| void ColwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| ColwiseReduceKernel<<< \ |
| std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX |
| |
| namespace { |
| __global__ void |
| maximum_kernel(const int N, const float alpha, const float* x, float* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = fmaxf(x[i], alpha); |
| } |
| } |
| } // namespace |
| |
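| // Elementwise maximum against a scalar: y[i] = max(x[i], alpha). |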
| template <> |
| void Maximum( |
| const int N, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| maximum_kernel<<< |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, alpha, x, y); |
| } |
| |
| namespace { |
| |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
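| // Reduces a tensor with Eigen on the GPU. Caffe2 shapes are row-major while |
| // the Eigen tensor maps here are column-major, so both the dimensions and |
| // the reduced axes are reversed, and reduced axes keep size 1 in Y_dims. |
| // For example, reducing axis 1 of a 2 x 3 x 4 tensor becomes reducing Eigen |
| // dimension 1 of the reversed 4 x 3 x 2 map. |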
| template <typename T, class Reducer, int kNumDims, int kNumAxes> |
| void EigenReduceTensorCUDAImpl( |
| const int* dims, |
| const int* axes, |
| const Reducer& reducer, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| Eigen::DSizes<Eigen::DenseIndex, kNumDims> X_dims; |
| Eigen::DSizes<Eigen::DenseIndex, kNumDims> Y_dims; |
| Eigen::array<Eigen::DenseIndex, kNumAxes> reduce_dims; |
| for (int i = 0; i < kNumDims; ++i) { |
| X_dims[i] = static_cast<Eigen::DenseIndex>(dims[kNumDims - 1 - i]); |
| Y_dims[i] = static_cast<Eigen::DenseIndex>(dims[kNumDims - 1 - i]); |
| } |
| for (int i = 0; i < kNumAxes; ++i) { |
| Y_dims[kNumDims - 1 - axes[i]] = static_cast<Eigen::DenseIndex>(1); |
| reduce_dims[kNumAxes - 1 - i] = |
| static_cast<Eigen::DenseIndex>(kNumDims - 1 - axes[i]); |
| } |
| const cudaStream_t cuda_stream = context->cuda_stream(); |
| const Eigen::CudaStreamDevice stream_device( |
| &cuda_stream, context->cuda_gpu_id()); |
| const Eigen::GpuDevice gpu_device(&stream_device); |
| // reduce() drops the reduced dimensions, so reshape back to the full-rank |
| // Y_dims before assigning to the rank-kNumDims output map. |
| EigenTensorMap<T, kNumDims>(Y, Y_dims).device(gpu_device) = |
| EigenTensorMap<T, kNumDims>(const_cast<T*>(X), X_dims) |
| .reduce(reduce_dims, reducer) |
| .reshape(Y_dims); |
| } |
| |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
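| // Dispatches EigenReduceTensorCUDAImpl over the compile-time rank (up to 4 |
| // dimensions) and number of reduced axes; returns false for unsupported |
| // shapes so the caller can fall back to the cub-based path. |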
| template <typename T, class Reducer> |
| bool EigenReduceTensorCUDA( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const Reducer& reducer, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| switch (num_dims) { |
| case 1: { |
| switch (num_axes) { |
| case 1: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 1, 1>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| default: { return false; } |
| } |
| } |
| case 2: { |
| switch (num_axes) { |
| case 1: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 2, 1>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 2: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 2, 2>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| default: { return false; } |
| } |
| } |
| case 3: { |
| switch (num_axes) { |
| case 1: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 3, 1>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 2: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 3, 2>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 3: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 3, 3>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| default: { return false; } |
| } |
| } |
| case 4: { |
| switch (num_axes) { |
| case 1: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 4, 1>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 2: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 4, 2>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 3: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 4, 3>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| case 4: { |
| EigenReduceTensorCUDAImpl<T, Reducer, 4, 4>( |
| dims, axes, reducer, X, Y, context); |
| return true; |
| } |
| default: { return false; } |
| } |
| } |
| default: { return false; } |
| } |
| } |
| |
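| // Generic cub-based reduction: build a permutation that moves the reduced |
| // axes to the innermost positions, transpose X into the scratch tensor when |
| // the reduced axes are not already the trailing axes, and then reduce |
| // outer_size rows of inner_size elements each with RowwiseReduceKernel. |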
| template <typename T, class Reducer> |
| void ReduceTensorCUDA( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const Reducer& reducer, |
| const T& init, |
| const T* X, |
| T* Y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| std::vector<int> transpose_axes(num_dims); |
| const int d = num_dims - num_axes; |
| for (int i = 0; i < num_axes; ++i) { |
| transpose_axes[i + d] = axes[i]; |
| } |
| std::sort(transpose_axes.begin() + d, transpose_axes.end()); |
| int outer_size = 1; |
| int inner_size = 1; |
| int p = 0; |
| int q = d; |
| for (int i = 0; i < num_dims; ++i) { |
| if (q < num_dims && i == transpose_axes[q]) { |
| inner_size *= dims[i]; |
| ++q; |
| } else { |
| outer_size *= dims[i]; |
| transpose_axes[p++] = i; |
| } |
| } |
| const T* X_data = X; |
| if (transpose_axes[d] != d) { |
| scratch_ptr->Resize(std::vector<int>{outer_size, inner_size}); |
| Transpose<T, CUDAContext>( |
| num_dims, |
| dims, |
| transpose_axes.data(), |
| X, |
| scratch_ptr->mutable_data<T>(), |
| context); |
| X_data = scratch_ptr->data<T>(); |
| } |
| RowwiseReduceKernel<<< |
| std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, inner_size, reducer, init, X_data, Y); |
| } |
| |
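| // The Reduce{Min,Max,Sum,Mean} implementations below first try the Eigen |
| // reduction (when Eigen >= 3.3 is available and the shape is supported) and |
| // otherwise fall back to ReduceTensorCUDA with the matching cub reducer and |
| // identity element. |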
| template <typename T> |
| void ReduceMinCUDAImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| if (EigenReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| Eigen::internal::MinReducer<T>(), |
| X, |
| Y, |
| context)) { |
| return; |
| } |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| ReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| cub::Min(), |
| std::numeric_limits<T>::max(), |
| X, |
| Y, |
| context, |
| scratch_ptr); |
| } |
| |
| template <typename T> |
| void ReduceMaxCUDAImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| if (EigenReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| Eigen::internal::MaxReducer<T>(), |
| X, |
| Y, |
| context)) { |
| return; |
| } |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| ReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| cub::Max(), |
| std::numeric_limits<T>::lowest(), |
| X, |
| Y, |
| context, |
| scratch_ptr); |
| } |
| |
| template <typename T> |
| void ReduceSumCUDAImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| if (EigenReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| Eigen::internal::SumReducer<T>(), |
| X, |
| Y, |
| context)) { |
| return; |
| } |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| ReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| cub::Sum(), |
| T(0), |
| X, |
| Y, |
| context, |
| scratch_ptr); |
| } |
| |
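| // Mean is computed as a sum followed by scaling every output by 1 / scale, |
| // where scale is the product of the reduced dimensions. |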
| template <typename T> |
| void ReduceMeanCUDAImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| if (EigenReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| Eigen::internal::MeanReducer<T>(), |
| X, |
| Y, |
| context)) { |
| return; |
| } |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| ReduceTensorCUDA( |
| num_dims, |
| dims, |
| num_axes, |
| axes, |
| cub::Sum(), |
| T(0), |
| X, |
| Y, |
| context, |
| scratch_ptr); |
| const int X_size = |
| std::accumulate(dims, dims + num_dims, 1, std::multiplies<int>()); |
| int scale = 1; |
| for (int i = 0; i < num_axes; ++i) { |
| scale *= dims[axes[i]]; |
| } |
| const int Y_size = X_size / scale; |
| Scale<T, CUDAContext>( |
| Y_size, 1.0f / static_cast<float>(scale), Y, Y, context); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ |
| template <> \ |
| void ReduceMin<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| ReduceMinCUDAImpl<T>( \ |
| num_dims, dims, num_axes, axes, X, Y, context, scratch_ptr); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ |
| template <> \ |
| void ReduceMax<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| ReduceMaxCUDAImpl<T>( \ |
| num_dims, dims, num_axes, axes, X, Y, context, scratch_ptr); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ |
| template <> \ |
| void ReduceSum<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| ReduceSumCUDAImpl<T>( \ |
| num_dims, dims, num_axes, axes, X, Y, context, scratch_ptr); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ |
| template <> \ |
| void ReduceMean<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| ReduceMeanCUDAImpl<T>( \ |
| num_dims, dims, num_axes, axes, X, Y, context, scratch_ptr); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN |
| |
| namespace { |
| |
| constexpr int kCUDABroadcastMaxDims = 8; |
| |
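| // Broadcast kernel: recover the multi-index of each output element from its |
| // linear index and map it back into X; broadcast (size-1) dimensions carry a |
| // stride of 0, so they always read the same X element. |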
| template <typename T, int D> |
| __global__ void BroadcastCUDAKernel( |
| const int Y_size, |
| const SimpleArray<int, D> X_strides, |
| const SimpleArray<int, D> Y_dims, |
| const T* X, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(Y_index, Y_size) { |
| int X_index = 0; |
| int Y_index_val = Y_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| X_index += X_strides.data[i] == 0 |
| ? 0 |
| : (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; |
| Y_index_val /= Y_dims.data[i]; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| Y[Y_index] = __ldg(X + X_index); |
| #else |
| Y[Y_index] = X[X_index]; |
| #endif |
| } |
| } |
| |
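| // Right-aligns X's dimensions against Y's: missing leading dimensions and |
| // size-1 dimensions get stride 0, then the broadcast kernel is launched. |
| // For example, X with dims {3, 1} broadcast into Y with dims {2, 3, 4} uses |
| // X strides {0, 1, 0}. |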
| template <typename T, int D> |
| void BroadcastCUDAImpl( |
| const int X_ndim, |
| const int* X_dims, |
| const int* Y_dims, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides_array; |
| SimpleArray<int, D> Y_dims_array; |
| const int d = D - X_ndim; |
| std::fill(X_strides_array.data, X_strides_array.data + d, 0); |
| int cur_stride = 1; |
| for (int i = D - 1; i >= d; --i) { |
| CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]); |
| X_strides_array.data[i] = X_dims[i - d] == 1 ? 0 : cur_stride; |
| cur_stride *= X_dims[i - d]; |
| } |
| std::copy_n(Y_dims, D, Y_dims_array.data); |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>()); |
| BroadcastCUDAKernel<T, D> |
| <<<CAFFE_GET_BLOCKS(Y_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(Y_size, X_strides_array, Y_dims_array, X, Y); |
| } |
| |
| template <typename T> |
| void BroadcastCUDA( |
| const int X_ndim, |
| const int* X_dims, |
| const int Y_ndim, |
| const int* Y_dims, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| CAFFE_ENFORCE_LE(X_ndim, Y_ndim); |
| switch (Y_ndim) { |
| case 1: { |
| BroadcastCUDAImpl<T, 1>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 2: { |
| BroadcastCUDAImpl<T, 2>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 3: { |
| BroadcastCUDAImpl<T, 3>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 4: { |
| BroadcastCUDAImpl<T, 4>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 5: { |
| BroadcastCUDAImpl<T, 5>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 6: { |
| BroadcastCUDAImpl<T, 6>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 7: { |
| BroadcastCUDAImpl<T, 7>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| case 8: { |
| BroadcastCUDAImpl<T, 8>(X_ndim, X_dims, Y_dims, X, Y, context); |
| break; |
| } |
| default: { break; } |
| } |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ |
| template <> \ |
| void Broadcast<T, CUDAContext>( \ |
| const int X_ndim, \ |
| const int* X_dims, \ |
| const int Y_ndim, \ |
| const int* Y_dims, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| CAFFE_ENFORCE_LE( \ |
| Y_ndim, kCUDABroadcastMaxDims, "Y_ndim exceeds compile-time max."); \ |
| BroadcastCUDA<T>(X_ndim, X_dims, Y_ndim, Y_dims, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_BROADCAST |
| |
| namespace { |
| |
| constexpr int kCUDATransposeMaxDims = 8; |
| |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
| template <typename T, int D> |
| void EigenTransposeCUDAImpl( |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| Eigen::DSizes<Eigen::DenseIndex, D> X_dims; |
| Eigen::DSizes<Eigen::DenseIndex, D> Y_dims; |
| Eigen::array<Eigen::DenseIndex, D> axes_array; |
| for (int i = 0; i < D; ++i) { |
| X_dims[i] = static_cast<Eigen::DenseIndex>(dims[D - 1 - i]); |
| Y_dims[i] = static_cast<Eigen::DenseIndex>(dims[D - 1 - axes[i]]); |
| axes_array[D - 1 - i] = static_cast<Eigen::DenseIndex>(D - 1 - axes[i]); |
| } |
| const cudaStream_t cuda_stream = context->cuda_stream(); |
| const Eigen::CudaStreamDevice stream_device( |
| &cuda_stream, context->cuda_gpu_id()); |
| const Eigen::GpuDevice gpu_device(&stream_device); |
| EigenTensorMap<T, D>(Y, Y_dims).device(gpu_device) = |
| EigenTensorMap<T, D>(const_cast<T*>(X), X_dims).shuffle(axes_array); |
| } |
| |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| |
| template <typename T> |
| bool EigenTransposeCUDA( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| #if EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| switch (ndim) { |
| case 1: { |
| EigenTransposeCUDAImpl<T, 1>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 2: { |
| EigenTransposeCUDAImpl<T, 2>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 3: { |
| EigenTransposeCUDAImpl<T, 3>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 4: { |
| EigenTransposeCUDAImpl<T, 4>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 5: { |
| EigenTransposeCUDAImpl<T, 5>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 6: { |
| EigenTransposeCUDAImpl<T, 6>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 7: { |
| EigenTransposeCUDAImpl<T, 7>(dims, axes, X, Y, context); |
| return true; |
| } |
| case 8: { |
| EigenTransposeCUDAImpl<T, 8>(dims, axes, X, Y, context); |
| return true; |
| } |
| default: { return false; } |
| } |
| #endif // EIGEN_VERSION_AT_LEAST(3, 3, 0) |
| return false; |
| } |
| |
| template <typename T, int D> |
| __global__ void TransposeCUDAKernel( |
| const int size, |
| const SimpleArray<int, D> X_strides, |
| const SimpleArray<int, D> Y_dims, |
| const T* X, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(Y_index, size) { |
| int X_index = 0; |
| int Y_index_val = Y_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; |
| Y_index_val /= Y_dims.data[i]; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| Y[Y_index] = __ldg(X + X_index); |
| #else |
| Y[Y_index] = X[X_index]; |
| #endif |
| } |
| } |
| |
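| // For each output axis i of the transpose, computes the row-major stride in |
| // X of the corresponding input axis axes[i]. |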
| template <int D> |
| void ComputeXStride( |
| const int* X_dims, |
| const int* axes, |
| int* X_strides) { |
| int buff[D]; |
| int cur_stride = 1; |
| for (int i = D - 1; i >= 0; --i) { |
| buff[i] = cur_stride; |
| cur_stride *= X_dims[i]; |
| } |
| for (int i = 0; i < D; ++i) { |
| X_strides[i] = buff[axes[i]]; |
| } |
| } |
| |
| template <typename T, int D> |
| void TransposeCUDAImpl( |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides; |
| SimpleArray<int, D> Y_dims; |
| ComputeXStride<D>(dims, axes, X_strides.data); |
| int size = 1; |
| for (int i = 0; i < D; ++i) { |
| Y_dims.data[i] = dims[axes[i]]; |
| size *= dims[i]; |
| } |
| TransposeCUDAKernel<T, D> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y); |
| } |
| |
| template <typename T> |
| void TransposeCUDA( |
| const int ndim, |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| switch (ndim) { |
| case 1: { |
| TransposeCUDAImpl<T, 1>(dims, axes, X, Y, context); |
| break; |
| } |
| case 2: { |
| TransposeCUDAImpl<T, 2>(dims, axes, X, Y, context); |
| break; |
| } |
| case 3: { |
| TransposeCUDAImpl<T, 3>(dims, axes, X, Y, context); |
| break; |
| } |
| case 4: { |
| TransposeCUDAImpl<T, 4>(dims, axes, X, Y, context); |
| break; |
| } |
| case 5: { |
| TransposeCUDAImpl<T, 5>(dims, axes, X, Y, context); |
| break; |
| } |
| case 6: { |
| TransposeCUDAImpl<T, 6>(dims, axes, X, Y, context); |
| break; |
| } |
| case 7: { |
| TransposeCUDAImpl<T, 7>(dims, axes, X, Y, context); |
| break; |
| } |
| case 8: { |
| TransposeCUDAImpl<T, 8>(dims, axes, X, Y, context); |
| break; |
| } |
| default: { break; } |
| } |
| } |
| |
| } // namespace |
| |
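| // Transpose prefers the Eigen shuffle when available and falls back to the |
| // naive index-remapping kernel; it is specialized for float, double, int and |
| // long. A minimal usage sketch (d_x, d_y and context are placeholder names |
| // for buffers/objects assumed to exist): with dims = {2, 3} and |
| // axes = {1, 0}, Transpose<float, CUDAContext>(2, dims, axes, d_x, d_y, |
| // &context) writes the 3 x 2 transpose of d_x into d_y. |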
| #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ |
| template <> \ |
| void Transpose<T, CUDAContext>( \ |
| const int ndim, \ |
| const int* dims, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| CAFFE_ENFORCE_LE( \ |
| ndim, kCUDATransposeMaxDims, "ndim exceeds compile-time max."); \ |
| if (EigenTransposeCUDA(ndim, dims, axes, X, Y, context)) { \ |
| return; \ |
| } \ |
| TransposeCUDA<T>(ndim, dims, axes, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(long) |
| #undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE |
| |
| } // namespace math |
| } // namespace caffe2 |