| #define CUB_STDERR |
| #include <cub/block/block_load.cuh> |
| #include <cub/block/block_reduce.cuh> |
| #include <cub/device/device_reduce.cuh> |
| #include "caffe2/core/common_gpu.h" |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/operators/pow_op.h" |
| #include "caffe2/utils/conversions.h" |
| |
| namespace caffe2 { |
| |
// pow, log, and other math functions are provided by the CUDA math library
// (declared in the math.h header).
| #define CUDA_POW(x, y) (pow(x, y)) |
| |
// Renamed to PowOpKernel because a PowKernel already exists in
// caffe2/utils/math_gpu.cc. Kernels with the same name lead to conflicts
// during hipification for the ROCm platform.
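// Elementwise pow: out[i] = pow(a[i], exponent), where the exponent is the
// scalar `e` (when `b` is null), the single element b[0] (when b_is_scalar is
// nonzero), or the per-element value b[i].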
| template <int b_is_scalar, typename T1, typename T2, typename R> |
| __global__ void PowOpKernel(const T1* a, const T2* b, T2 e, R* out, int n) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
    out[i] = CUDA_POW(a[i], ((b == nullptr) ? e : b[b_is_scalar ? 0 : i]));
| } |
| } |
| template <typename T1, typename T2, typename R> |
| __global__ void |
| PowBroadcastKernel(const T1* a, const T2* b, R* out, int pre, int n) { |
| CUDA_1D_KERNEL_LOOP(i, pre * n) { |
| out[i] = CUDA_POW(a[i], b[i % n]); |
| } |
| } |
| template <typename T1, typename T2, typename R> |
| __global__ void PowBroadcast2Kernel( |
| const T1* a, |
| const T2* b, |
| R* out, |
| int pre, |
| int n, |
| int post) { |
| CUDA_1D_KERNEL_LOOP(i, pre * n * post) { |
| out[i] = CUDA_POW(a[i], b[(i / post) % n]); |
| } |
| } |
| |
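// Functor plugged into the generic PowOp template (see pow_op.h). Each Run*
// method launches the matching kernel above on the operator's CUDA stream.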
| struct CudaPowFunctor { |
| template <bool b_is_scalar, typename T1, typename T2, typename R> |
| inline void |
| Run(size_t n, const T1* a, const T2* b, T2 e, R* out, CUDAContext* context) { |
| PowOpKernel<b_is_scalar, T1, T2, R> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(a, b, e, out, n); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| template <typename T1, typename T2, typename R> |
| void RunWithBroadcast( |
| const T1* a, |
| const T2* b, |
| R* out, |
| size_t pre, |
| size_t n, |
| CUDAContext* context) { |
| PowBroadcastKernel<T1, T2, R> |
| <<<CAFFE_GET_BLOCKS(pre * n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(a, b, out, pre, n); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| template <typename T1, typename T2, typename R> |
| void RunWithBroadcast2( |
| const T1* a, |
| const T2* b, |
| R* out, |
| size_t pre, |
| size_t n, |
| size_t post, |
| CUDAContext* context) { |
| PowBroadcast2Kernel<T1, T2, R> |
| <<<CAFFE_GET_BLOCKS(pre * n * post), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(a, b, out, pre, n, post); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| } |
| }; |
| REGISTER_CUDA_OPERATOR( |
| Pow, |
| PowOp< |
| TensorTypes<float> /*NumericTypes*/, |
| CUDAContext, |
| CudaPowFunctor, |
| SameTypeAsInput>) |
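// A minimal host-side usage sketch, not part of this file's build. It assumes
// the proto helpers from caffe2/utils/proto_utils.h, the PROTO_CUDA device
// enum, and the "exponent" argument accepted by PowOp:
//
//   caffe2::Workspace ws;
//   // ... feed a float CUDA tensor into blob "X" ...
//   auto def = caffe2::CreateOperatorDef(
//       "Pow", "", {"X"}, {"Y"},
//       {caffe2::MakeArgument<float>("exponent", 2.0f)});
//   def.mutable_device_option()->set_device_type(caffe2::PROTO_CUDA);
//   auto op = caffe2::CreateOperator(def, &ws);
//   op->Run(); // Y[i] = pow(X[i], 2.0f), computed by PowOpKernel above
//
// Supplying a second input blob instead of the "exponent" argument (together
// with the broadcast-related arguments) exercises the broadcast kernels.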
| |
| } // namespace caffe2 |