| #include "caffe2/operators/channel_stats_op.h" |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/utils/math/reduce.cuh" |
| |
| namespace caffe2 { |
| |
| namespace { |
| |
| template <typename T, int kBlockDimX, int kBlockDimY> |
| __global__ void ChannelStatsNCHWCUDAKernel( |
| const int N, |
| const int C, |
| const int HxW, |
| const T* X, |
| T* sum, |
| T* sumsq) { |
| __shared__ |
| typename BlockReduce2D<T, kBlockDimX, kBlockDimY>::TempStorage m_storage; |
| __shared__ |
| typename BlockReduce2D<T, kBlockDimX, kBlockDimY>::TempStorage v_storage; |
| const int c = blockIdx.x; |
| T m_val = 0; |
| T v_val = 0; |
| for (int n = threadIdx.x; n < N; n += blockDim.x) { |
| for (int hw = threadIdx.y; hw < HxW; hw += blockDim.y) { |
| const int index = (n * C + c) * HxW + hw; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| m_val += __ldg(X + index); |
| v_val += __ldg(X + index) * __ldg(X + index); |
| #else |
| m_val += X[index]; |
| v_val += X[index] * X[index]; |
| #endif |
| } |
| } |
| m_val = BlockReduce2D<T, kBlockDimX, kBlockDimY>(m_storage).Sum(m_val); |
| v_val = BlockReduce2D<T, kBlockDimX, kBlockDimY>(v_storage).Sum(v_val); |
| if (threadIdx.x == 0 && threadIdx.y == 0) { |
| sum[c] = m_val; |
| sumsq[c] = v_val; |
| } |
| } |
| |
| template <typename T> |
| __global__ void ChannelStatsNHWCCUDAKernel( |
| const int N, |
| const int C, |
| const int HxW, |
| const T* X, |
| T* sum, |
| T* sumsq) { |
| __shared__ typename BlockReduce<T>::TempStorage m_storage; |
| __shared__ typename BlockReduce<T>::TempStorage v_storage; |
| const int inner_size = N * HxW; |
| const int c = blockIdx.x; |
| T m_val = 0; |
| T v_val = 0; |
| for (int i = threadIdx.x; i < inner_size; i += blockDim.x) { |
| const int index = i * C + c; |
| #if __CUDA_ARCH__ >= 350 || defined(USE_ROCM) |
| m_val += __ldg(X + index); |
| v_val += __ldg(X + index) * __ldg(X + index); |
| #else |
| m_val += X[index]; |
| v_val += X[index] * X[index]; |
| #endif |
| } |
| m_val = BlockReduce<T>(m_storage).Sum(m_val); |
| v_val = BlockReduce<T>(v_storage).Sum(v_val); |
| if (threadIdx.x == 0) { |
| sum[c] = m_val; |
| sumsq[c] = v_val; |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| template <> |
| bool ChannelStatsOp<CUDAContext>::ComputeChannelStatsNCHW<float>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* sum, |
| float* sumsq) { |
| DISPATCH_REDUCE_KERNEL_BY_2D_BLOCK_WITH_TYPE_1( |
| HxW, |
| ChannelStatsNCHWCUDAKernel, |
| float, |
| C, |
| context_.cuda_stream(), |
| N, |
| C, |
| HxW, |
| X, |
| sum, |
| sumsq); |
| return true; |
| } |
| |
| template <> |
| template <> |
| bool ChannelStatsOp<CUDAContext>::ComputeChannelStatsNHWC<float>( |
| const int N, |
| const int C, |
| const int HxW, |
| const float* X, |
| float* sum, |
| float* sumsq) { |
| ChannelStatsNHWCCUDAKernel<float> |
| <<<C, CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>( |
| N, C, HxW, X, sum, sumsq); |
| C10_CUDA_KERNEL_LAUNCH_CHECK(); |
| |
| return true; |
| } |
| |
| REGISTER_CUDA_OPERATOR(ChannelStats, ChannelStatsOp<CUDAContext>); |
| |
| } // namespace caffe2 |