// Implements the math functions for GPU (CUDA).

#include <random>
| |
| #include <thrust/device_ptr.h> |
| #include <thrust/reduce.h> |
| #include <thrust/system/cuda/detail/par.h> |
| #include <thrust/version.h> |
| |
| #include "caffe2/utils/math.h" |
| #include "caffe2/core/context_gpu.h" |
| |
| #if THRUST_VERSION >= 100800 |
| #define THRUST_SUPPORTS_PER_THREAD |
| #endif // THRUST_VERSION >= 100800 |
| |
| namespace caffe2 { |
| namespace math { |
| |
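// DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Funcname, function) generates a CUDA
// kernel applying `function` elementwise, plus the matching specialization of
// Funcname<T, CUDAContext> that launches it on the context's stream. For
// example (illustrative names), math::Exp<float, CUDAContext>(N, d_x, d_y,
// &context) applies expf to N elements, with d_x and d_y device pointers.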
| #define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Funcname, function) \ |
| __global__ \ |
| void _Kernel_##T##_##Funcname(const int N, const T* x, T* y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| y[i] = function(x[i]); \ |
| } \ |
| } \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, const T* x, T* y, \ |
| CUDAContext* context) { \ |
| _Kernel_##T##_##Funcname<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, \ |
| 0, context->cuda_stream()>>>( \ |
| N, x, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Exp, exp); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Log, log); |
| |
| __device__ float cuda_sqrf(const float x) { return x * x; } |
| __device__ double cuda_sqr(const double x) { return x * x; } |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, cuda_sqrf); |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sqr, cuda_sqr); |
| |
| #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION |
| |
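// DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) generates an
// elementwise kernel computing y[i] = a[i] expr b[i] together with the
// matching Funcname<T, CUDAContext> specialization that launches it.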
| #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Funcname, expr) \ |
| __global__ void _Kernel_##T##_##Funcname( \ |
| const int N, const T* a, const T* b, T* y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| y[i] = a[i] expr b[i]; \ |
| } \ |
| } \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, const T* a, const T* b, T* y, CUDAContext* context) { \ |
| _Kernel_##T##_##Funcname<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, a, b, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, +); |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, -); |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, *); |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, /); |
| |
| // Caffe2 gemm provides a simpler interface to the gemm functions, with the |
| // limitation that the data has to be contiguous in memory. |
| template <> |
| void Gemm<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, |
| const int M, const int N, const int K, const float alpha, const float* A, |
| const float* B, const float beta, float* C, CUDAContext* context) { |
  // Note that cuBLAS follows Fortran (column-major) order, so the argument
  // order differs from the CBLAS convention.
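  // Computing the row-major product C = op(A) * op(B) is equivalent to
  // computing the column-major product C^T = op(B)^T * op(A)^T, and a
  // row-major matrix reinterpreted as column-major is exactly its transpose.
  // Hence B is passed before A, and the M x N result C is written as the
  // N x M column-major matrix C^T with leading dimension N.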
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_CHECK(cublasSgemm(context->cublas_handle(), cuTransB, cuTransA, |
| N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); |
| } |
| |
| template <> |
| void GemmEx<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, |
| const int M, const int N, const int K, const float alpha, const float* A, |
| const int lda, const float* B, const int ldb, const float beta, float* C, |
| const int ldc, CUDAContext* context) { |
  // Note that cuBLAS follows Fortran (column-major) order, so the argument
  // order differs from the CBLAS convention; see the note in Gemm above.
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_CHECK(cublasSgemm(context->cublas_handle(), cuTransB, cuTransA, |
| N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); |
| } |
| |
| template <> |
| void Gemv<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, |
| const float* A, const float* x, const float beta, float* y, |
| CUDAContext* context) { |
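  // The row-major M x N matrix A is seen by cuBLAS as the column-major
  // N x M matrix A^T, so the transpose flag is flipped: y = A * x is computed
  // as (A^T)^T * x, and y = A^T * x uses A^T with no transpose.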
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| CUBLAS_CHECK(cublasSgemv(context->cublas_handle(), cuTransA, N, M, &alpha, |
| A, N, x, 1, &beta, y, 1)); |
| } |
| |
| |
| namespace { |
| template <typename T> |
| __global__ void SetKernel(const int N, const T alpha, T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = alpha; |
| } |
| } |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ |
| template <> \ |
| void Set<T, CUDAContext>(const TIndex N, const T alpha, T *Y, \ |
| CUDAContext* context) { \ |
| SetKernel<<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, \ |
| 0, context->cuda_stream()>>>(N, alpha, Y); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_SET(float); |
| CAFFE2_SPECIALIZED_CUDA_SET(double); |
| CAFFE2_SPECIALIZED_CUDA_SET(bool); |
| CAFFE2_SPECIALIZED_CUDA_SET(int8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(int16_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(int); |
| CAFFE2_SPECIALIZED_CUDA_SET(int64_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(char); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); |
| #undef CAFFE2_SPECIALIZED_CUDA_SET |
| |
| namespace { |
| template <typename T> |
| __global__ void UniformShift(const int N, const T min, const T max, |
| T* x) { |
| T scale = max - min; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x[i] = x[i] * scale + min; |
| } |
| } |
| |
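// Maps the raw 32-bit values produced by curandGenerate into [min, max] via a
// modulo. Note that this introduces a slight modulo bias whenever
// (max - min + 1) does not evenly divide 2^32.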
| __global__ void UniformIntFit(const int N, const int min, const int max, |
| unsigned int* x) { |
| int* x_int = reinterpret_cast<int*>(x); |
| int range = (max - min + 1); |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x_int[i] = min + static_cast<int>(x[i] % range); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void RandUniform<float, CUDAContext>( |
| const int n, const float min, const float max, float* r, |
| CUDAContext* context) { |
| CURAND_CHECK(curandGenerateUniform(context->curand_generator(), r, n)); |
| UniformShift<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<double, CUDAContext>( |
| const int n, const double min, const double max, double* r, |
| CUDAContext* context) { |
| CURAND_CHECK(curandGenerateUniformDouble(context->curand_generator(), r, n)); |
| UniformShift<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<int, CUDAContext>( |
| const int n, const int min, const int max, int* r, |
| CUDAContext* context) { |
| CURAND_CHECK(curandGenerate(context->curand_generator(), |
| reinterpret_cast<unsigned int*>(r), n)); |
| UniformIntFit<<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>( |
| n, min, max, reinterpret_cast<unsigned int*>(r)); |
| } |
| |
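// curandGenerateNormal / curandGenerateNormalDouble require an even number of
// outputs. When n is odd, fill the last element with a host-generated Gaussian
// sample (the engine below is default-seeded) and return the even count n - 1
// to be generated by cuRAND.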
| template <typename T> |
| int HandleOddLengthRandGaussian( |
| const int n, |
| const T mean, |
| const T std, |
| T* r, |
| CUDAContext* context) { |
| if (n % 2 == 1) { |
| std::default_random_engine generator; |
| std::normal_distribution<T> distribution(mean, std); |
| const T random_value = distribution(generator); |
    // r is a T*, so pointer arithmetic already accounts for sizeof(T); write
    // the extra value into the last element, r[n - 1].
    math::Set<T, CUDAContext>(1, random_value, r + (n - 1), context);
| return n - 1; |
| } |
| return n; |
| } |
| |
| template <> |
| void RandGaussian<float, CUDAContext>( |
| const int n, const float mean, const float std, float* r, |
| CUDAContext* context) { |
| // If n is odd, we add a random Gaussian value at the end manually |
| // and generate n-1 random values using curandGenerateNormal. |
| // curandGenerateNormal requires n to be even. |
| const int even_n = |
| HandleOddLengthRandGaussian<float>(n, mean, std, r, context); |
| CURAND_CHECK( |
| curandGenerateNormal(context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| void RandGaussian<double, CUDAContext>( |
| const int n, const double mean, const double std, double* r, |
| CUDAContext* context) { |
| const int even_n = |
| HandleOddLengthRandGaussian<double>(n, mean, std, r, context); |
| CURAND_CHECK(curandGenerateNormalDouble( |
| context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| |
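// Dot computes the scalar result on the host via cuBLAS (the handle is assumed
// to be in host pointer mode) and then copies it to the device output y.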
| template<> |
| void Dot<float, CUDAContext>( |
| const int n, const float* a, const float* b, float* y, |
| CUDAContext* context) { |
| float result; |
| CUBLAS_CHECK(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result)); |
| context->Copy<float, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
| template<> |
| void Dot<double, CUDAContext>( |
| const int n, const double* a, const double* b, double* y, |
| CUDAContext* context) { |
| double result; |
  CUBLAS_CHECK(cublasDdot(context->cublas_handle(), n, a, 1, b, 1, &result));
| context->Copy<double, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
| // A previous version of caffe2 used Thrust but it turns out that thrust |
| // reduction has an implicit scratch space allocation and deallocation, which |
| // may interfere with NCCL and create a deadlock. Hence we are using a custom |
| // reduction here. |
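// The kernel below is launched with a single block of SUM_KERNEL_NTHREADS
// threads, trading peak throughput for requiring no temporary device memory.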
| #define SUM_KERNEL_NTHREADS 128 |
| template <typename T> |
| __global__ void SumKernel(const int N, const T* X, T* Y) { |
| const int idx = threadIdx.x; |
| __shared__ T reduction_buffer[SUM_KERNEL_NTHREADS]; |
| |
| reduction_buffer[idx] = 0; |
| |
| // A multilevel reduction. |
| // N -> 128 |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| reduction_buffer[idx] += X[i]; |
| } |
| __syncthreads(); |
| // 128 -> 32 |
| if (idx < 32) { |
| reduction_buffer[idx] += |
| reduction_buffer[idx + 32] + |
| reduction_buffer[idx + 64] + |
| reduction_buffer[idx + 96]; |
| } |
| __syncthreads(); |
| // 32 -> 1 |
| if (idx == 0) { |
    T tmp = 0;
| for (int i = 0; i < 32; ++i) { |
| tmp += reduction_buffer[i]; |
| } |
| *Y = tmp; |
| } |
| } |
| |
| #define CAFFE2_MATH_SUM_FUNC(T) \ |
| template<> \ |
| void Sum<T, CUDAContext>(const int N, const T* x, T* y, CUDAContext* context) {\ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(N, x, y); \ |
| } |
| |
| CAFFE2_MATH_SUM_FUNC(float) |
| CAFFE2_MATH_SUM_FUNC(double) |
| #undef CAFFE2_MATH_SUM_FUNC |
| |
| namespace { |
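// SelectKernel gathers one element per row of the N x D matrix x:
// y[i] = x[i * D + idx[i]], where each idx[i] must lie in [0, D).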
| template <typename T> |
| __global__ void SelectKernel( |
| const int N, const int D, const T* x, const int* idx, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Select<float, CUDAContext>( |
| const int N, const int D, const float* x, const int* idx, float* y, |
| CUDAContext* context) { |
| SelectKernel<float><<<CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(N, D, x, idx, y); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void ScaleKernel( |
| const int n, const T alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = x[i] * alpha; |
| } |
| } |
| |
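// Same as ScaleKernel, except that alpha is read from device memory so it can
// be produced by another kernel without a device-to-host copy.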
| template <typename T> |
| __global__ void ScaleKernelDeviceAlpha( |
| const int n, const T* alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = x[i] * (*alpha); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, const float alpha, const float *x, float* y, |
| CUDAContext* context) { |
| ScaleKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<double, CUDAContext>( |
| const int n, const double alpha, const double *x, double* y, |
| CUDAContext* context) { |
| ScaleKernel<double><<< |
| CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, const float* alpha, const float *x, float* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<float><<< |
| CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( |
| n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<double, CUDAContext>( |
| const int n, const double* alpha, const double *x, double* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int N, |
| const float alpha, |
| const float* X, |
| float* Y, |
| CUDAContext* context) { |
| CUBLAS_CHECK(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); |
| } |
| |
| template <> |
| void Axpy<double, CUDAContext>( |
| const int N, |
| const double alpha, |
| const double* X, |
| double* Y, |
| CUDAContext* context) { |
| CUBLAS_CHECK(cublasDaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); |
| } |
| |
| namespace detail { |
| template <> |
| void ScaleDynamic<float, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| return math::Scale<float, CUDAContext>(n, alpha, x, y, context); |
| } |
| |
| template <> |
| void ScaleDynamic<double, CUDAContext>( |
| const int n, |
| const double alpha, |
| const double* x, |
| double* y, |
| CUDAContext* context) { |
| return math::Scale<double, CUDAContext>(n, alpha, x, y, context); |
| } |
| |
| template <> |
| void AxpyDynamic<float, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| return math::Axpy<float, CUDAContext>(n, alpha, x, y, context); |
| } |
| |
| template <> |
| void AxpyDynamic<double, CUDAContext>( |
| const int n, |
| const double alpha, |
| const double* x, |
| double* y, |
| CUDAContext* context) { |
| return math::Axpy<double, CUDAContext>(n, alpha, x, y, context); |
| } |
}  // namespace detail
| |
| namespace { |
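// Axpy variant whose scale factor lives in device memory; unlike the
// cuBLAS-backed overloads above, alpha is dereferenced inside the kernel.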
| template <typename T> |
| __global__ void AxpyKernel(const int n, const T* a, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] += x[index] * (*a); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int n, const float* alpha, const float* X, |
| float* Y, CUDAContext* context) { |
| AxpyKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| template <> |
| void Axpy<double, CUDAContext>( |
| const int n, const double* alpha, const double* X, |
| double* Y, CUDAContext* context) { |
| AxpyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| |
| namespace { |
| template <typename T> |
| __global__ void AxpbyKernel(const int n, const T a, const T* x, |
| const T b, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] = x[index] * a + y[index] * b; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpby<float, CUDAContext>( |
| const int n, const float a, const float* x, const float b, float* y, |
| CUDAContext* context) { |
| AxpbyKernel<float><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, a, x, b, y); |
| } |
| |
| template <> |
| void Axpby<double, CUDAContext>( |
| const int n, const double a, const double* x, const double b, double* y, |
| CUDAContext* context) { |
| AxpbyKernel<double><<<CAFFE_GET_BLOCKS(n), CAFFE_CUDA_NUM_THREADS, |
| 0, context->cuda_stream()>>>(n, a, x, b, y); |
| } |
| |
| namespace { |
| |
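// im2col, NCHW layout: each thread handles one (channel, h_out, w_out)
// position and copies its kernel_h x kernel_w (dilated) input patch into the
// column buffer, writing zeros for locations that fall into the padding.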
| template <typename T> |
| __global__ void im2col_gpu_kernel_nchw(const int n, const T* data_im, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_col) { |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| int w_out = index % width_col; |
| int h_index = index / width_col; |
| int h_out = h_index % height_col; |
| int channel_in = h_index / height_col; |
| int channel_out = channel_in * kernel_h * kernel_w; |
| int h_in = h_out * stride_h - pad_t; |
| int w_in = w_out * stride_w - pad_l; |
| T* data_col_ptr = data_col; |
| data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; |
| const T* data_im_ptr = data_im; |
| data_im_ptr += (channel_in * height + h_in) * width + w_in; |
| for (int i = 0; i < kernel_h; ++i) { |
| for (int j = 0; j < kernel_w; ++j) { |
| int h = h_in + i * dilation_h; |
| int w = w_in + j * dilation_w; |
| *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? |
| data_im_ptr[i * dilation_h * width + j * dilation_w] : 0; |
| data_col_ptr += height_col * width_col; |
| } |
| } |
| } |
| } |
| |
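// im2col, NHWC layout: each thread handles one (h_out, w_out, channel)
// position; the dilated patch is traversed in steps of dilation_h/dilation_w
// and out-of-bounds locations are written as zeros.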
| template <typename T> |
| __global__ void im2col_gpu_kernel_nhwc(const int n, const T* data_im, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int width_col, const int channels, |
| T* data_col) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| int channel_in = index % channels; |
| int w_out = index / channels % width_col; |
| int h_out = index / channels / width_col; |
| int h_in = h_out * stride_h - pad_t; |
| int w_in = w_out * stride_w - pad_l; |
| T* local_data_col = data_col + |
| ((h_out * width_col) + w_out) * channels * kernel_h * kernel_w |
| + channel_in; |
| for (int i = 0; i < dkernel_h; i += dilation_h) { |
| int h = h_in + i; |
| for (int j = 0; j < dkernel_w; j += dilation_w) { |
| int w = w_in + j; |
| *local_data_col = (h >= 0 && w >= 0 && h < height && w < width) ? |
| data_im[(h * width + w) * channels + channel_in] : 0; |
| local_data_col += channels; |
| } |
| } |
| } |
| } |
| |
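// col2im, NCHW layout: each thread owns one image element and accumulates
// every column entry that maps back to it, which avoids atomic additions.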
| template <typename T> |
| __global__ void col2im_gpu_kernel_nchw(const int n, const T* data_col, |
| const int height, const int width, |
| const int patch_h, const int patch_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_im) { |
| |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| int w = index % width + pad_l; |
| int h = (index / width) % height + pad_t; |
| int c = index / (width * height); |
| |
| // compute the start and end of the output |
| int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| int w_col_end = min(w / stride_w + 1, width_col); |
| int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| int h_col_end = min(h / stride_h + 1, height_col); |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = (h - h_col * stride_h); |
| int w_k = (w - w_col * stride_w); |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| int data_col_index = |
| (((c * patch_h + h_k) * patch_w + w_k) * height_col + h_col) * width_col + w_col; |
| val += data_col[data_col_index]; |
| } |
| } |
| } |
| data_im[index] = val; |
| } |
| } |
| |
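// col2im, NHWC layout: same one-thread-per-image-element scheme as the NCHW
// kernel above, with the column offset computed for the channels-last layout.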
| template <typename T> |
| __global__ void col2im_gpu_kernel_nhwc(const int n, const T* data_col, |
| const int width, const int channels, |
| const int patch_h, const int patch_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, |
| const int stride_h, const int stride_w, |
| const int height_col, const int width_col, |
| T* data_im) { |
| |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| int c = index % channels; |
| int w = index / channels % width + pad_l; |
| int h = index / channels / width + pad_t; |
| // compute the start and end of the output |
| int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| int w_col_end = min(w / stride_w + 1, width_col); |
| int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| int h_col_end = min(h / stride_h + 1, height_col); |
| int channels_col = patch_h * patch_w * channels; |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = h - h_col * stride_h; |
| int w_k = w - w_col * stride_w; |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| int c_col = (h_k * patch_w + w_k) * channels + c; |
| val += data_col[(h_col * width_col + w_col) * channels_col + c_col]; |
| } |
| } |
| } |
| data_im[index] = val; |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| void Im2col<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_im, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_col, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
  // We are going to launch channels * height_col * width_col CUDA threads,
  // each thread responsible for copying one kernel_h x kernel_w patch of a
  // single channel into the column buffer.
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = channels * height_col * width_col; |
| // NOLINT_NEXT_LINE(whitespace/operators) |
| im2col_gpu_kernel_nchw<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_im, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, pad_t, pad_l, stride_h, stride_w, |
| height_col, width_col, data_col); |
| } |
| |
| template <> |
| void Im2col<float, CUDAContext, StorageOrder::NHWC>( |
| const float* data_im, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_col, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
  // We are going to launch height_col * width_col * channels CUDA threads,
  // each thread responsible for copying one kernel_h x kernel_w patch of a
  // single channel into the column buffer.
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = height_col * width_col * channels; |
| // NOLINT_NEXT_LINE(whitespace/operators) |
| im2col_gpu_kernel_nhwc<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_im, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, pad_t, pad_l, stride_h, stride_w, |
| width_col, channels, data_col); |
| } |
| |
| |
| template <> |
| void Col2im<float, CUDAContext, StorageOrder::NCHW>( |
| const float* data_col, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_im, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = channels * height * width; |
  // To avoid atomic operations, we launch one thread per bottom (image)
  // element and let each thread accumulate all top (column) entries that map
  // to it.
| col2im_gpu_kernel_nchw<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_col, height, width, kernel_h, kernel_w, |
| dilation_h, dilation_w, |
| pad_t, pad_l, stride_h, stride_w, |
| height_col, width_col, data_im); |
| } |
| |
| template <> |
| void Col2im<float, CUDAContext, StorageOrder::NHWC>( |
| const float* data_col, const int channels, |
| const int height, const int width, const int kernel_h, const int kernel_w, |
| const int dilation_h, const int dilation_w, |
| const int pad_t, const int pad_l, const int pad_b, const int pad_r, |
| const int stride_h, |
| const int stride_w, float* data_im, CUDAContext* context) { |
| |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| |
| int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| int num_kernels = height * width * channels; |
  // To avoid atomic operations, we launch one thread per bottom (image)
  // element and let each thread accumulate all top (column) entries that map
  // to it.
| col2im_gpu_kernel_nhwc<float><<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, 0, |
| context->cuda_stream()>>>( |
| num_kernels, data_col, width, channels, kernel_h, kernel_w, |
| dilation_h, dilation_w, |
| pad_t, pad_l, stride_h, stride_w, height_col, width_col, data_im); |
| } |
| |
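// CopyMatrix performs a strided, asynchronous device-to-device copy of an
// M x N matrix of itemsize-byte elements; lda and ldb are row strides given in
// elements.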
| template <> |
| void CopyMatrix<CUDAContext>( |
| const size_t itemsize, const int M, const int N, const void* A, |
| const int lda, void* B, const int ldb, CUDAContext* context) { |
  CUDA_CHECK(cudaMemcpy2DAsync(B, ldb * itemsize, A, lda * itemsize,
                               N * itemsize, M, cudaMemcpyDeviceToDevice,
                               context->cuda_stream()));
| } |
| |
| } // namespace math |
| } // namespace caffe2 |