| // Implements the math functions for GPU. |
| |
| #include "caffe2/utils/math.h" |
| |
| #include <algorithm> |
| #include <cstdint> |
| #include <functional> |
| #include <limits> |
| #include <numeric> |
| #include <random> |
| #include <vector> |
| |
| #include <cub/block/block_reduce.cuh> |
| #include <cub/cub.cuh> |
| |
| #include <thrust/functional.h> |
| |
| #include "caffe2/core/context_gpu.h" |
| #include "caffe2/utils/conversions.h" |
| |
| #if THRUST_VERSION >= 100800 |
| #define THRUST_SUPPORTS_PER_THREAD |
| #endif // THRUST_VERSION >= 100800 |
| |
| namespace caffe2 { |
| namespace math { |
| |
| namespace { |
| |
| inline __host__ __device__ bool Not(const bool x) { |
| return !x; |
| } |
| |
| template <typename T> |
| inline __host__ __device__ T Negate(const T& x) { |
| return -x; |
| } |
| |
| template <typename T> |
| inline __host__ __device__ T Square(const T& x) { |
| return x * x; |
| } |
| |
| template <typename T> |
| inline __host__ __device__ T Sign(const T& x) { |
| return x > 0 ? T(1) : (x < 0 ? T(-1) : T(0)); |
| } |
| |
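| // Generates a simple binary functor for generic T, plus a float16 specialization |
| // that performs the arithmetic in float and converts the result back to float16. |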
| #define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \ |
| template <typename T> \ |
| struct Func##Functor { \ |
| inline __host__ __device__ T \ |
| operator()(const T& lhs, const T& rhs) const { \ |
| return lhs expr rhs; \ |
| } \ |
| }; \ |
| template <> \ |
| struct Func##Functor<float16> { \ |
| inline __host__ __device__ float16 \ |
| operator()(const float16& lhs, const float16& rhs) const { \ |
| return convert::To<float, float16>(convert::To<float16, float>( \ |
| lhs) expr convert::To<float16, float>(rhs)); \ |
| } \ |
| }; |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *) |
| DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /) |
| #undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR |
| |
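| // Computes the sine and cosine of each element in a single pass via the fused |
| // sincos device function. |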
| template <typename T> |
| __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| #if __CUDA_ARCH__ >= 350 |
| sincos(__ldg(X + i), S + i, C + i); |
| #else |
| sincos(X[i], S + i, C + i); |
| #endif |
| } |
| } |
| |
| template <typename TIn, typename TOut, class BinaryOperator> |
| __global__ void SimpleBinaryOpCUDAKernel( |
| const int N, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| C[i] = op(A[i], B[i]); |
| } |
| } |
| |
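| // Elementwise C = op(A, B) where one operand is a single row of length `cols` |
| // broadcast across `rows` rows; `broadcast_1st` selects which operand is the row. |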
| template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st> |
| __global__ void RowwiseBinaryOpCUDAKernel( |
| const int rows, |
| const int cols, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| const int size = rows * cols; |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| const int j = C_index % cols; |
| const int A_index = broadcast_1st ? j : C_index; |
| const int B_index = broadcast_1st ? C_index : j; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| |
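| // Same as above, but the broadcast operand is a single column of length `rows`. |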
| template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st> |
| __global__ void ColwiseBinaryOpCUDAKernel( |
| const int rows, |
| const int cols, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| const int size = rows * cols; |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| const int i = C_index / cols; |
| const int A_index = broadcast_1st ? i : C_index; |
| const int B_index = broadcast_1st ? C_index : i; |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| |
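| // General N-D broadcasting: the caller assigns stride 0 to any dimension of |
| // size 1, so broadcast dimensions contribute nothing to the input offsets while |
| // the output C is traversed in row-major order. |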
| template <typename TIn, typename TOut, class BinaryOperator, int D> |
| __global__ void BroadcastBinaryOpCUDAKernel( |
| const int size, |
| const SimpleArray<int, D> A_strides, |
| const SimpleArray<int, D> B_strides, |
| const SimpleArray<int, D> C_dims, |
| const BinaryOperator op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C) { |
| CUDA_1D_KERNEL_LOOP(C_index, size) { |
| int A_index = 0; |
| int B_index = 0; |
| int C_index_val = C_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| const int d = C_index_val % C_dims.data[i]; |
| A_index += A_strides.data[i] == 0 ? 0 : d * A_strides.data[i]; |
| B_index += B_strides.data[i] == 0 ? 0 : d * B_strides.data[i]; |
| C_index_val /= C_dims.data[i]; |
| } |
| C[C_index] = op(A[A_index], B[B_index]); |
| } |
| } |
| |
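| // Handles the common case where broadcasting collapses to a 2-D rowwise or |
| // colwise pattern; `pivot` splits the output dimensions into `rows` x `cols`. |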
| template <typename TIn, typename TOut, class BinaryOperator> |
| void BinaryOpWith2DBroadcasting( |
| const int ndim, |
| const int* dims, |
| const int pivot, |
| const bool rowwise_broadcast, |
| const bool broadcast_1st, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| const int rows = |
| std::accumulate(dims, dims + pivot, 1, std::multiplies<int>()); |
| const int cols = |
| std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies<int>()); |
| const int size = rows * cols; |
| if (rowwise_broadcast) { |
| if (broadcast_1st) { |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, true> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(rows, cols, op, A, B, C); |
| } else { |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, false> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(rows, cols, op, A, B, C); |
| } |
| } else { |
| if (broadcast_1st) { |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, true> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(rows, cols, op, A, B, C); |
| } else { |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, false> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(rows, cols, op, A, B, C); |
| } |
| } |
| } |
| |
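| // Builds the stride arrays for a fixed rank D (broadcast dimensions get stride |
| // 0) and launches the general broadcasting kernel. |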
| template <typename TIn, typename TOut, class BinaryOperator, int D> |
| void BroadcastBinaryOpImpl( |
| const int* A_dims, |
| const int* B_dims, |
| const int* C_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| SimpleArray<int, D> A_strides_array; |
| SimpleArray<int, D> B_strides_array; |
| SimpleArray<int, D> C_dims_array; |
| int A_stride = 1; |
| int B_stride = 1; |
| for (int i = D - 1; i >= 0; --i) { |
| A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride; |
| B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride; |
| A_stride *= A_dims[i]; |
| B_stride *= B_dims[i]; |
| } |
| std::copy(C_dims, C_dims + D, C_dims_array.data); |
| const int size = |
| std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>()); |
| BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C); |
| } |
| |
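| // Dispatches a broadcasting binary op: identical shapes run the plain |
| // elementwise kernel, shapes that collapse to a rowwise or colwise pattern use |
| // the 2-D kernels, and everything else falls back to the general N-D kernel. |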
| template <typename TIn, typename TOut, class BinaryOperator> |
| void BroadcastBinaryOp( |
| const int A_ndim, |
| const int* A_dims, |
| const int B_ndim, |
| const int* B_dims, |
| const BinaryOperator& op, |
| const TIn* A, |
| const TIn* B, |
| TOut* C, |
| CUDAContext* context) { |
| const int ndim = std::max(A_ndim, B_ndim); |
| std::vector<int> A_dims_array(ndim); |
| std::vector<int> B_dims_array(ndim); |
| std::vector<int> C_dims_array(ndim); |
| utils::ComputeBroadcastBinaryOpDims( |
| A_ndim, |
| A_dims, |
| B_ndim, |
| B_dims, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| C_dims_array.data()); |
| if (A_dims_array == B_dims_array) { |
| const int size = std::accumulate( |
| C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies<int>()); |
| SimpleBinaryOpCUDAKernel<TIn, TOut, BinaryOperator> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, op, A, B, C); |
| return; |
| } |
| int pivot; |
| bool broadcast_1st; |
| if (utils::IsRowwiseBroadcastBinaryOp( |
| ndim, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| &pivot, |
| &broadcast_1st)) { |
| BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>( |
| ndim, |
| C_dims_array.data(), |
| pivot, |
| true, |
| broadcast_1st, |
| op, |
| A, |
| B, |
| C, |
| context); |
| return; |
| } |
| if (utils::IsColwiseBroadcastBinaryOp( |
| ndim, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| &pivot, |
| &broadcast_1st)) { |
| BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>( |
| ndim, |
| C_dims_array.data(), |
| pivot, |
| false, |
| broadcast_1st, |
| op, |
| A, |
| B, |
| C, |
| context); |
| return; |
| } |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3( |
| ndim, |
| BroadcastBinaryOpImpl, |
| TIn, |
| TOut, |
| BinaryOperator, |
| A_dims_array.data(), |
| B_dims_array.data(), |
| C_dims_array.data(), |
| op, |
| A, |
| B, |
| C, |
| context); |
| } |
| |
| } // namespace |
| |
| #define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, op) \ |
| __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \ |
| CUDA_1D_KERNEL_LOOP(i, N) { \ |
| Y[i] = op(X[i]); \ |
| } \ |
| } \ |
| template <> \ |
| void Func<T, CUDAContext>( \ |
| const int N, const T* x, T* y, CUDAContext* context) { \ |
| Func##CUDAKernel<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, x, y); \ |
| } |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, InvSqrt, rsqrtf) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, Square<float>) |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, Not) |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, Negate<float>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, Negate<double>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg, Negate<std::int32_t>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg, Negate<std::int64_t>) |
| |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, Sign<float>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, Sign<double>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign, Sign<std::int32_t>) |
| DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign, Sign<std::int64_t>) |
| |
| #undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION |
| |
| #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \ |
| template <> \ |
| void SinCos<T, CUDAContext>( \ |
| const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \ |
| SinCosCUDAKernel<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, x, ys, yc); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_SINCOS(float) |
| CAFFE2_SPECIALIZED_CUDA_SINCOS(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_SINCOS |
| |
| #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| void Func<TIn, CUDAContext>( \ |
| const int N, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| SimpleBinaryOpCUDAKernel<TIn, TOut, Op<TIn>> \ |
| <<<CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, Op<TIn>(), A, B, C); \ |
| } |
| |
| #define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less) |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater) |
| DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) |
| |
| #undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION |
| |
| #define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) |
| |
| DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor) |
| DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor) |
| DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor) |
| DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor) |
| |
| #undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) |
| |
| #define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) |
| DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) |
| DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) |
| |
| #undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION |
| |
| DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION( |
| float, |
| float, |
| ElemwiseMax, |
| thrust::maximum); |
| |
| #undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION |
| |
| #define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| void Rowwise##Func<TIn, CUDAContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| const int size = rows * cols; \ |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, true> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| void Rowwise##Func<TIn, CUDAContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| const int size = rows * cols; \ |
| RowwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, false> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| void Colwise##Func<TIn, CUDAContext, true>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| const int size = rows * cols; \ |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, true> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \ |
| } \ |
| template <> \ |
| void Colwise##Func<TIn, CUDAContext, false>( \ |
| const int rows, \ |
| const int cols, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| const int size = rows * cols; \ |
| ColwiseBinaryOpCUDAKernel<TIn, TOut, Op<TIn>, false> \ |
| <<<CAFFE_GET_BLOCKS(size), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \ |
| } |
| |
| #define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) |
| DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION |
| |
| #define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) |
| DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) |
| |
| #define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) |
| DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) |
| |
| #undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| #define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \ |
| template <> \ |
| void Func<TIn, CUDAContext>( \ |
| const int A_ndim, \ |
| const int* A_dims, \ |
| const int B_ndim, \ |
| const int* B_dims, \ |
| const TIn* A, \ |
| const TIn* B, \ |
| TOut* C, \ |
| CUDAContext* context) { \ |
| BroadcastBinaryOp<TIn, TOut, Op<TIn>>( \ |
| A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \ |
| } |
| |
| #define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater) |
| DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal) |
| |
| #undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION |
| |
| #define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int64_t, std::int64_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor) |
| DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor) |
| |
| #undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION |
| |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and) |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or) |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor) |
| |
| #define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \ |
| std::int32_t, std::int32_t, Func, Op) \ |
| DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) |
| |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and) |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or) |
| DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor) |
| |
| #undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION |
| |
| #undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION |
| |
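| // cub::DeviceReduce is called twice: first with a null workspace pointer, which |
| // only writes the required temporary-storage size into memRequired, and then |
| // again with the resized scratch tensor supplying that workspace. |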
| #define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \ |
| template <> \ |
| void Funcname<T, CUDAContext>( \ |
| const int N, \ |
| const T* src, \ |
| T* dst, \ |
| Tensor<CUDAContext>* scratch_ptr, \ |
| CUDAContext* context) { \ |
| size_t memRequired = 0; \ |
| cub::DeviceReduce::func( \ |
| nullptr, memRequired, src, dst, N, context->cuda_stream()); \ |
| auto buffer_size = \ |
| static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); \ |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size}); \ |
| cub::DeviceReduce::func( \ |
| static_cast<void*>(scratch_ptr->mutable_data<T>()), \ |
| memRequired, \ |
| src, \ |
| dst, \ |
| N, \ |
| context->cuda_stream()); \ |
| } |
| |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min) |
| DELEGATE_REDUCTION_FUNCTION(float, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int32_t, ReduceMax, Max) |
| DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max) |
| |
| #undef DELEGATE_REDUCTION_FUNCTION |
| |
| // Caffe2 gemm provides a simpler interface to the gemm functions, with the |
| // limitation that the data has to be contiguous in memory. |
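| // A minimal usage sketch (assuming device pointers A, B, C of the proper sizes |
| // and a CUDAContext* named `context`): a row-major product |
| // C(M x N) = A(M x K) * B(K x N) can be written as |
| // |
| //   math::Gemm<float, CUDAContext>( |
| //       CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, A, B, 0.0f, C, context); |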
| template <> |
| void Gemm<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
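| // Concretely, the row-major product C = alpha * op(A) * op(B) + beta * C is |
| // evaluated as the column-major product C^T = alpha * op(B)^T * op(A)^T + |
| // beta * C^T, which is why B is passed first and M and N are swapped below. |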
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| N)); |
| } |
| |
| template <> |
| void Gemm<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_CHECK(cublasSgemmEx( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N)); |
| |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // convert alpha, beta from float -> __half |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| |
| // call cublasHgemm |
| CUBLAS_CHECK(cublasHgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha_fp16, |
| (const __half*)B, |
| ldb, |
| (const __half*)A, |
| lda, |
| &beta_fp16, |
| (__half*)C, |
| N)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| template <> |
| void BiasCHW<float, CUDAContext>( |
| const float* bias, |
| const float* bias_multiplier, |
| const int bias_channels, |
| const int image_size, |
| float* image, |
| CUDAContext* context) { |
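| // Adds bias[c] to every pixel of channel c by expressing the broadcast as a |
| // rank-1 update: image(C x HW) += bias(C x 1) * bias_multiplier(1 x HW). |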
| Gemm<float, CUDAContext>( |
| CblasNoTrans, |
| CblasNoTrans, |
| bias_channels, |
| image_size, |
| 1, |
| 1, |
| bias, |
| bias_multiplier, |
| 1, |
| image, |
| context); |
| } |
| |
| template <> |
| void GemmBatched<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| const int a_stride = M * K; |
| const int b_stride = K * N; |
| const int c_stride = M * N; |
| #if __CUDACC_VER_MAJOR__ < 8 |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| #else |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
| const int lda = (TransA == CblasNoTrans) ? K : M; |
| const int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemmStridedBatched( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| b_stride, |
| A, |
| lda, |
| a_stride, |
| &beta, |
| C, |
| N, |
| c_stride, |
| batch_size)); |
| #endif |
| } |
| |
| namespace { |
| |
| __global__ void FloatToHalfKernel(const int N, const float* X, half* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = __float2half(X[i]); |
| } |
| } |
| |
| __global__ void HalfToFloatKernel(const int N, const half* X, float* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = __half2float(X[i]); |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| void GemmBatched<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| const int a_stride = M * K; |
| const int b_stride = K * N; |
| const int c_stride = M * N; |
| #if __CUDACC_VER_MAJOR__ < 8 |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float16, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| #else |
| // Three options: |
| // 1) scratch != nullptr: cast A and B to fp32, run SgemmStridedBatched, and |
| //    cast the result back to fp16. |
| // 2) math_type == FLOAT, scratch == nullptr: loop over the batch with SgemmEx. |
| // 3) math_type == FLOAT16, scratch == nullptr: a single batched Hgemm call. |
| |
| if (scratch != nullptr) { |
| const int A_size = a_stride * batch_size; |
| const int B_size = b_stride * batch_size; |
| // cast, cublasSgemmStridedBatched, cast |
| size_t in_elems = A_size + B_size; |
| size_t out_elems = c_stride * batch_size; |
| |
| scratch->Resize(in_elems + out_elems); |
| float* scratch_ptr = scratch->mutable_data<float>(); |
| |
| float* A_fp32 = scratch_ptr; |
| float* B_fp32 = scratch_ptr + A_size; |
| float* C_fp32 = scratch_ptr + A_size + B_size; |
| |
| // cast A, B into fp32 |
| HalfToFloatKernel<<< |
| CAFFE_GET_BLOCKS(A_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(A_size, (half*)A, A_fp32); |
| HalfToFloatKernel<<< |
| CAFFE_GET_BLOCKS(B_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(B_size, (half*)B, B_fp32); |
| |
| // run fp32 batched Gemm |
| GemmBatched<float, CUDAContext>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A_fp32, |
| B_fp32, |
| beta, |
| C_fp32, |
| context); |
| |
| // cast result back to fp16 |
| FloatToHalfKernel<<< |
| CAFFE_GET_BLOCKS(batch_size * M * N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(batch_size * M * N, C_fp32, (half*)C); |
| } else { |
| if (math_type == TensorProto_DataType_FLOAT) { |
| // loop over matrices in the batch |
| for (int i = 0; i < batch_size; ++i) { |
| math::Gemm<float16, CUDAContext>( |
| TransA, |
| TransB, |
| M, |
| N, |
| K, |
| alpha, |
| A + a_stride * i, |
| B + b_stride * i, |
| beta, |
| C + c_stride * i, |
| context); |
| } |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
| const int lda = (TransA == CblasNoTrans) ? K : M; |
| const int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| |
| // convert alpha, beta from float -> __half |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| CUBLAS_ENFORCE(cublasHgemmStridedBatched( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha_fp16, |
| (const __half*)B, |
| ldb, |
| b_stride, |
| (const __half*)A, |
| lda, |
| a_stride, |
| &beta_fp16, |
| (__half*)C, |
| N, |
| c_stride, |
| batch_size)); |
| } |
| } |
| #endif |
| } |
| |
| #if CUDA_VERSION >= 9000 |
| |
| // No behavior change here, but the specialization is required; defer to the |
| // default CUDA engine. |
| template <> |
| void Gemm<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| return Gemm<float, CUDAContext>( |
| TransA, TransB, M, N, K, alpha, A, B, beta, C, context, math_type); |
| } |
| |
| template <> |
| void Gemm<float16, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
| int lda = (TransA == CblasNoTrans) ? K : M; |
| int ldb = (TransB == CblasNoTrans) ? N : K; |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| |
| // enable TensorCore for this call on this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE( |
| cublasSetMathMode(context->cublas_handle(), CUBLAS_TENSOR_OP_MATH)); |
| } |
| |
| CUBLAS_CHECK(cublasGemmEx( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| CUDA_R_16F, |
| ldb, |
| A, |
| CUDA_R_16F, |
| lda, |
| &beta, |
| C, |
| CUDA_R_16F, |
| N, |
| CUDA_R_32F, |
| CUBLAS_GEMM_DFALT_TENSOR_OP)); |
| |
| // Now disable TensorCore math for subsequent calls to this handle |
| if (TensorCoreAvailable()) { |
| CUBLAS_ENFORCE( |
| cublasSetMathMode(context->cublas_handle(), CUBLAS_DEFAULT_MATH)); |
| } |
| } |
| |
| template <> |
| void GemmBatched<float, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const float* B, |
| const float beta, |
| float* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| return GemmBatched<float, CUDAContext, DefaultEngine>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| scratch, |
| math_type); |
| } |
| |
| template <> |
| void GemmBatched<float16, CUDAContext, TensorCoreEngine>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int batch_size, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float16* A, |
| const float16* B, |
| const float beta, |
| float16* C, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch, |
| TensorProto::DataType math_type) { |
| return GemmBatched<float16, CUDAContext, DefaultEngine>( |
| TransA, |
| TransB, |
| batch_size, |
| M, |
| N, |
| K, |
| alpha, |
| A, |
| B, |
| beta, |
| C, |
| context, |
| scratch, |
| math_type); |
| } |
| |
| #endif // CUDA_VERSION >= 9000 |
| |
| template <> |
| void GemmEx<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const CBLAS_TRANSPOSE TransB, |
| const int M, |
| const int N, |
| const int K, |
| const float alpha, |
| const float* A, |
| const int lda, |
| const float* B, |
| const int ldb, |
| const float beta, |
| float* C, |
| const int ldc, |
| CUDAContext* context) { |
| // Note that cuBLAS uses Fortran (column-major) order, so the argument order |
| // differs from the row-major CBLAS convention. |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| cublasOperation_t cuTransB = |
| (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; |
| CUBLAS_ENFORCE(cublasSgemm( |
| context->cublas_handle(), |
| cuTransB, |
| cuTransA, |
| N, |
| M, |
| K, |
| &alpha, |
| B, |
| ldb, |
| A, |
| lda, |
| &beta, |
| C, |
| ldc)); |
| } |
| |
| template <> |
| void Gemv<float, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const int M, |
| const int N, |
| const float alpha, |
| const float* A, |
| const float* x, |
| const float beta, |
| float* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| CUBLAS_ENFORCE(cublasSgemv( |
| context->cublas_handle(), |
| cuTransA, |
| N, |
| M, |
| &alpha, |
| A, |
| N, |
| x, |
| 1, |
| &beta, |
| y, |
| 1)); |
| } |
| |
| // Batched Add variants |
| namespace { |
| |
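| // Accumulates `batch` stripes of length N, laid out `stripe` elements apart |
| // starting at `first`, into Y; the accumulation is done in float so that |
| // float16 inputs do not lose precision along the way. |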
| template <typename T> |
| __global__ void AddStripedBatchKernel( |
| const int N, |
| const T* first, |
| T* Y, |
| const int stripe, |
| const int batch) { |
| for (int j = 0; j < batch; j++) { |
| const T* x = first + j * stripe; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| float tmpY = convert::To<T, float>(Y[i]); |
| tmpY += convert::To<T, float>(x[i]); |
| Y[i] = convert::To<float, T>(tmpY); |
| } |
| } |
| } |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \ |
| template <> \ |
| void AddStripedBatch<T, CUDAContext>( \ |
| const int N, \ |
| const T* first, \ |
| T* Y, \ |
| const int stripe, \ |
| const int batch, \ |
| CUDAContext* context) { \ |
| AddStripedBatchKernel<T> \ |
| <<<CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, first, Y, stripe, batch); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float); |
| CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16); |
| #undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH |
| |
| template <> |
| void Gemv<float16, CUDAContext>( |
| const CBLAS_TRANSPOSE TransA, |
| const int M, |
| const int N, |
| const float alpha, |
| const float16* A, |
| const float16* x, |
| const float beta, |
| float16* y, |
| CUDAContext* context, |
| TensorProto::DataType math_type) { |
| cublasOperation_t cuTransA = |
| (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; |
| |
| // Set up the dimensions to call cublasSgemmEx / cublasHgemm, expressing the |
| // GEMV as a GEMM with n = 1. |
| int m = (cuTransA == CUBLAS_OP_N) ? N : M; |
| int k = (cuTransA == CUBLAS_OP_N) ? M : N; |
| int LDA = (cuTransA == CUBLAS_OP_N) ? m : k; |
| int LDC = m; |
| |
| if (math_type == TensorProto_DataType_FLOAT) { |
| CUBLAS_CHECK(cublasSgemmEx( |
| context->cublas_handle(), |
| cuTransA, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| &alpha, |
| A, |
| CUDA_R_16F, |
| LDA, |
| x, |
| CUDA_R_16F, |
| k, |
| &beta, |
| y, |
| CUDA_R_16F, |
| LDC)); |
| } else if (math_type == TensorProto_DataType_FLOAT16) { |
| auto alpha_fp16 = convert::floatToHalf(alpha); |
| auto beta_fp16 = convert::floatToHalf(beta); |
| |
| CUBLAS_CHECK(cublasHgemm( |
| context->cublas_handle(), |
| cuTransA, |
| CUBLAS_OP_N, |
| m, |
| 1, |
| k, |
| &alpha_fp16, |
| (const __half*)A, |
| LDA, |
| (const __half*)x, |
| k, |
| &beta_fp16, |
| (__half*)y, |
| LDC)); |
| } else { |
| // fail |
| CAFFE_THROW("Unsupported math type"); |
| } |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void SetKernel(const int N, const T alpha, T* Y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| Y[i] = alpha; |
| } |
| } |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_SET(T) \ |
| template <> \ |
| void Set<T, CUDAContext>( \ |
| const size_t N, const T alpha, T* Y, CUDAContext* context) { \ |
| SetKernel<<< \ |
| CAFFE_GET_BLOCKS(N), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>(N, alpha, Y); \ |
| } |
| |
| CAFFE2_SPECIALIZED_CUDA_SET(float); |
| CAFFE2_SPECIALIZED_CUDA_SET(double); |
| CAFFE2_SPECIALIZED_CUDA_SET(bool); |
| CAFFE2_SPECIALIZED_CUDA_SET(int8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(int16_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(float16); |
| CAFFE2_SPECIALIZED_CUDA_SET(int); |
| CAFFE2_SPECIALIZED_CUDA_SET(int64_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(char); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint8_t); |
| CAFFE2_SPECIALIZED_CUDA_SET(uint16_t); |
| #undef CAFFE2_SPECIALIZED_CUDA_SET |
| |
| namespace { |
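| // Rescales curand's unit-interval uniform samples in place so that they span |
| // the requested min/max range. |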
| template <typename T> |
| __global__ void |
| UniformShift(const size_t N, const float min, const float max, T* x) { |
| float scale = max - min; |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min); |
| } |
| } |
| |
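| // Folds raw 32-bit random draws into integers in [min, max] via a modulo |
| // reduction, reinterpreting the output buffer as signed ints. |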
| __global__ void |
| UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) { |
| int* x_int = reinterpret_cast<int*>(x); |
| int range = (max - min + 1); |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| x_int[i] = min + static_cast<int>(x[i] % range); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void RandUniform<float, CUDAContext>( |
| const size_t n, |
| const float min, |
| const float max, |
| float* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerateUniform(context->curand_generator(), r, n)); |
| UniformShift<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<double, CUDAContext>( |
| const size_t n, |
| const double min, |
| const double max, |
| double* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE( |
| curandGenerateUniformDouble(context->curand_generator(), r, n)); |
| UniformShift<double> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, min, max, r); |
| } |
| |
| template <> |
| void RandUniform<int, CUDAContext>( |
| const size_t n, |
| const int min, |
| const int max, |
| int* r, |
| CUDAContext* context) { |
| CURAND_ENFORCE(curandGenerate( |
| context->curand_generator(), reinterpret_cast<unsigned int*>(r), n)); |
| UniformIntFit<<< |
| CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| n, min, max, reinterpret_cast<unsigned int*>(r)); |
| } |
| |
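| // curandGenerateNormal requires an even number of outputs, so when n is odd the |
| // last element is filled on the host and n - 1 is returned for the device call. |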
| template <typename T> |
| size_t HandleOddLengthRandGaussian( |
| const size_t n, |
| const T mean, |
| const T std, |
| T* r, |
| CUDAContext* context) { |
| if (n % 2 == 1) { |
| std::default_random_engine generator; |
| std::normal_distribution<T> distribution(mean, std); |
| const T random_value = distribution(generator); |
| math::Set<T, CUDAContext>(1, random_value, r + (n - 1), context); |
| return n - 1; |
| } |
| return n; |
| } |
| |
| template <> |
| void RandGaussian<float, CUDAContext>( |
| const size_t n, |
| const float mean, |
| const float std, |
| float* r, |
| CUDAContext* context) { |
| // If n is odd, we add a random Gaussian value at the end manually |
| // and generate n-1 random values using curandGenerateNormal. |
| // curandGenerateNormal requires n to be even. |
| const size_t even_n = |
| HandleOddLengthRandGaussian<float>(n, mean, std, r, context); |
| CURAND_ENFORCE( |
| curandGenerateNormal(context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| void RandGaussian<double, CUDAContext>( |
| const size_t n, |
| const double mean, |
| const double std, |
| double* r, |
| CUDAContext* context) { |
| const size_t even_n = |
| HandleOddLengthRandGaussian<double>(n, mean, std, r, context); |
| CURAND_ENFORCE(curandGenerateNormalDouble( |
| context->curand_generator(), r, even_n, mean, std)); |
| } |
| |
| template <> |
| void Dot<float, CUDAContext>( |
| const int n, |
| const float* a, |
| const float* b, |
| float* y, |
| CUDAContext* context) { |
| float result; |
| CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result)); |
| context->Copy<float, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
| template <> |
| void Dot<float16, CUDAContext>( |
| const int n, |
| const float16* a, |
| const float16* b, |
| float16* y, |
| CUDAContext* context) { |
| float16 result; |
| // execute with 32-bit math |
| CUBLAS_CHECK(cublasDotEx( |
| context->cublas_handle(), |
| n, |
| a, |
| CUDA_R_16F, |
| 1, |
| b, |
| CUDA_R_16F, |
| 1, |
| &result, |
| CUDA_R_16F, |
| CUDA_R_32F)); |
| context->Copy<float16, CPUContext, CUDAContext>(1, &result, y); |
| } |
| |
| // A previous version of caffe2 used Thrust but it turns out that thrust |
| // reduction has an implicit scratch space allocation and deallocation, which |
| // may interfere with NCCL and create a deadlock. Hence we are using a custom |
| // reduction here. |
| #define SUM_KERNEL_NTHREADS 128 |
| template <typename T> |
| __global__ void SumKernel(const int N, const T* X, T* Y, bool square) { |
| const int idx = threadIdx.x; |
| __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS]; |
| |
| reduction_buffer[idx] = 0; |
| |
| // A multilevel reduction. |
| // N -> 128 |
| if (!square) { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| reduction_buffer[idx] += convert::To<T, float>(X[i]); |
| } |
| } else { |
| for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) { |
| float Xi = convert::To<T, float>(X[i]); |
| reduction_buffer[idx] += Xi * Xi; |
| } |
| } |
| __syncthreads(); |
| // 128 -> 32 |
| if (idx < 32) { |
| reduction_buffer[idx] += reduction_buffer[idx + 32] + |
| reduction_buffer[idx + 64] + reduction_buffer[idx + 96]; |
| } |
| __syncthreads(); |
| // 32 -> 1 |
| if (idx == 0) { |
| float tmp = 0; |
| for (int i = 0; i < 32; ++i) { |
| tmp += reduction_buffer[i]; |
| } |
| *Y = convert::To<float, T>(tmp); |
| } |
| } |
| |
| // According to the benchmarks script |
| // caffe2/caffe2/experiments/python/device_reduce_sum_bench.py, |
| // device reduce is slower for N <= 10000. |
| #define DEVICE_REDUCE_SIZE_THRESHOLD 10000 |
| |
| namespace { |
| |
| template <typename T> |
| __global__ void SumConvertKernel(float* sum, T* dest) { |
| *dest = convert::To<float, T>(*sum); |
| } |
| |
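| // Runs cub::DeviceReduce::Sum over an arbitrary input iterator, sizing the |
| // scratch tensor from cub's workspace query; when `dest` is null, one extra |
| // element is appended to the scratch buffer and used to hold the result. |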
| template <typename T, typename IterT> |
| void SumGenericIter( |
| const int N, |
| IterT it, |
| T*& dest, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| size_t memRequired = 0; |
| cub::DeviceReduce::Sum( |
| nullptr, memRequired, it, dest, N, context->cuda_stream()); |
| auto buffer_size = |
| static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); |
| if (!dest) { |
| // allocate one more T at the end of scratch for dest |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size + 1}); |
| dest = scratch_ptr->template mutable_data<T>() + buffer_size; |
| } else { |
| scratch_ptr->Resize(std::vector<TIndex>{buffer_size}); |
| } |
| cub::DeviceReduce::Sum( |
| static_cast<void*>(scratch_ptr->template mutable_data<T>()), |
| memRequired, |
| it, |
| dest, |
| N, |
| context->cuda_stream()); |
| } |
| } // namespace |
| |
| template <> |
| void Sum<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<float>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| } |
| } |
| |
| template <> |
| void Sum<int32_t, CUDAContext>( |
| const int N, |
| const int32_t* x, |
| int32_t* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SumGenericIter<int32_t>(N, x, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, false); |
| } |
| } |
| |
| namespace { |
| template <typename T> |
| struct FloatTransform { |
| inline __host__ __device__ float operator()(const T v) const { |
| return convert::To<T, float>(v); |
| } |
| }; |
| } // namespace |
| |
| #define CAFFE2_MATH_SUM_FUNC(T) \ |
| template <> \ |
| void Sum<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \ |
| x, transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, false); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUM_FUNC(float16) |
| #undef CAFFE2_MATH_SUM_FUNC |
| |
| namespace { |
| template <typename T> |
| struct SqrTransform { |
| inline __host__ __device__ T operator()(const T v) const { |
| return v * v; |
| } |
| }; |
| } // namespace |
| |
| template <> |
| void SumSqr<float, CUDAContext>( |
| const int N, |
| const float* x, |
| float* y, |
| CUDAContext* context, |
| Tensor<CUDAContext>* scratch_ptr) { |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { |
| SqrTransform<float> transform; |
| cub::TransformInputIterator<float, SqrTransform<float>, const float*> it( |
| x, transform); |
| SumGenericIter<float>(N, it, y, context, scratch_ptr); |
| } else { |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( |
| N, x, y, true); |
| } |
| } |
| |
| #define CAFFE2_MATH_SUMSQR_FUNC(T) \ |
| template <> \ |
| void SumSqr<T, CUDAContext>( \ |
| const int N, \ |
| const T* x, \ |
| T* y, \ |
| CUDAContext* context, \ |
| Tensor<CUDAContext>* scratch_ptr) { \ |
| if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \ |
| FloatTransform<T> float_transform; \ |
| cub::TransformInputIterator<float, FloatTransform<T>, const T*> \ |
| float_it(x, float_transform); \ |
| SqrTransform<float> sqr_transform; \ |
| cub::TransformInputIterator< \ |
| float, \ |
| SqrTransform<float>, \ |
| decltype(float_it)> \ |
| it(float_it, sqr_transform); \ |
| float* sum = nullptr; \ |
| SumGenericIter<float>(N, it, sum, context, scratch_ptr); \ |
| SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \ |
| } else { \ |
| SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \ |
| N, x, y, true); \ |
| } \ |
| } |
| |
| CAFFE2_MATH_SUMSQR_FUNC(float16) |
| #undef CAFFE2_MATH_SUMSQR_FUNC |
| #undef DEVICE_REDUCE_SIZE_THRESHOLD |
| |
| namespace { |
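| // For each row i of the N x D matrix x, copies the element at column idx[i] |
| // into y[i]. |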
| template <typename T> |
| __global__ void |
| SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = x[i * D + idx[i]]; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Select<float, CUDAContext>( |
| const int N, |
| const int D, |
| const float* x, |
| const int* idx, |
| float* y, |
| CUDAContext* context) { |
| SelectKernel<float> |
| <<<CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, D, x, idx, y); |
| } |
| |
| template <> |
| void Select<float16, CUDAContext>( |
| const int N, |
| const int D, |
| const float16* x, |
| const int* idx, |
| float16* y, |
| CUDAContext* context) { |
| SelectKernel<float16> |
| <<<CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, D, x, idx, y); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha); |
| } |
| } |
| |
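| // Variant of ScaleKernel that reads alpha through a device pointer instead of |
| // taking it by value from the host. |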
| template <typename T> |
| __global__ void |
| ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = x[i] * (*alpha); |
| } |
| } |
| |
| template <typename T> |
| __global__ void PowKernel(const int n, const T* x, const T exponent, T* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = powf(x[i], exponent); |
| } |
| } |
| |
| // fp16 specialization |
| template <> |
| __global__ void ScaleKernelDeviceAlpha( |
| const int n, |
| const float* alpha, |
| const float16* x, |
| float16* y) { |
| CUDA_1D_KERNEL_LOOP(i, n) { |
| y[i] = convert::To<float, float16>( |
| convert::To<float16, float>(x[i]) * (*alpha)); |
| } |
| } |
| |
| } // namespace |
| |
| template <> |
| void Powx<float, CUDAContext>( |
| const int N, |
| const float* a, |
| const float b, |
| float* y, |
| CUDAContext* context) { |
| PowKernel<<< |
| CAFFE_GET_BLOCKS(N), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, a, b, y); |
| } |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| ScaleKernel<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float16, CUDAContext>( |
| const int n, |
| const float alpha, |
| const float16* x, |
| float16* y, |
| CUDAContext* context) { |
| ScaleKernel<float16> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Scale<float16, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float16* x, |
| float16* y, |
| CUDAContext* context) { |
| ScaleKernelDeviceAlpha<float16> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, x, y); |
| } |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int N, |
| const float alpha, |
| const float* X, |
| float* Y, |
| CUDAContext* context) { |
| CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1)); |
| } |
| |
| template <> |
| void Axpy<double, CUDAContext>( |
| const int N, |
| const float alpha, |
| const double* X, |
| double* Y, |
| CUDAContext* context) { |
| double alpha_d{alpha}; |
| CUBLAS_ENFORCE( |
| cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1)); |
| } |
| |
| template <> |
| void Axpy<float16, CUDAContext>( |
| const int N, |
| const float alpha, |
| const float16* X, |
| float16* Y, |
| CUDAContext* context) { |
| CUBLAS_CHECK(cublasAxpyEx( |
| context->cublas_handle(), |
| N, |
| &alpha, |
| CUDA_R_16F, |
| X, |
| CUDA_R_16F, |
| 1, |
| Y, |
| CUDA_R_16F, |
| 1, |
| CUDA_R_32F)); |
| } |
| |
| namespace { |
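| // Y += (*a) * X with a device-resident scalar; the arithmetic is done in float |
| // so that float16 inputs are handled as well. |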
| template <typename T> |
| __global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] = convert::Get<T>( |
| convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index])); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpy<float, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float* X, |
| float* Y, |
| CUDAContext* context) { |
| AxpyKernel<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| template <> |
| void Axpy<float16, CUDAContext>( |
| const int n, |
| const float* alpha, |
| const float16* X, |
| float16* Y, |
| CUDAContext* context) { |
| AxpyKernel<float16> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, alpha, X, Y); |
| } |
| |
| namespace { |
| template <typename T> |
| __global__ void |
| AxpbyKernel(const int n, const T a, const T* x, const T b, T* y) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| y[index] = x[index] * a + y[index] * b; |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Axpby<float, CUDAContext>( |
| const int n, |
| const float a, |
| const float* x, |
| const float b, |
| float* y, |
| CUDAContext* context) { |
| AxpbyKernel<float> |
| <<<CAFFE_GET_BLOCKS(n), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(n, a, x, b, y); |
| } |
| |
| namespace { |
| |
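| // Im2Col for NCHW layout: each thread fills the column-buffer entries for one |
| // (channel, output row, output column) position, copying the corresponding |
| // kernel_h x kernel_w input patch and writing zeros for out-of-bounds pixels. |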
| template <typename T> |
| __global__ void Im2ColNCHWCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* img_data, |
| T* col_data) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| const int w_out = index % output_w; |
| const int h_index = index / output_w; |
| const int h_out = h_index % output_h; |
| const int channel_in = h_index / output_h; |
| const int channel_out = channel_in * kernel_h * kernel_w; |
| const int h_in = h_out * stride_h - pad_t; |
| const int w_in = w_out * stride_w - pad_l; |
| const int output_size = output_h * output_w; |
| T* col_data_ptr = |
| col_data + (channel_out * output_h + h_out) * output_w + w_out; |
| const T* img_data_ptr = |
| img_data + (channel_in * input_h + h_in) * input_w + w_in; |
| int dh = 0; |
| for (int i = 0; i < kernel_h; ++i) { |
| int dw = 0; |
| for (int j = 0; j < kernel_w; ++j) { |
| const int h = h_in + dh; |
| const int w = w_in + dw; |
| #if __CUDA_ARCH__ >= 350 |
| *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) |
| ? __ldg(img_data_ptr + dh * input_w + dw) |
| : 0; |
| #else |
| *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) |
| ? img_data_ptr[dh * input_w + dw] |
| : 0; |
| #endif |
| col_data_ptr += output_size; |
| dw += dilation_w; |
| } |
| dh += dilation_h; |
| } |
| } |
| } |
| |
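| // Im2Col for NHWC layout: as above, but threads are indexed by |
| // (output row, output column, channel) and successive patch elements are |
| // `channels` apart in the column buffer. |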
| template <typename T> |
| __global__ void Im2ColNHWCCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_w, |
| const int channels, |
| const T* img_data, |
| T* col_data) { |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| const int channel_in = index % channels; |
| const int w_out = index / channels % output_w; |
| const int h_out = index / channels / output_w; |
| const int h_in = h_out * stride_h - pad_t; |
| const int w_in = w_out * stride_w - pad_l; |
| T* col_data_ptr = col_data + |
| (h_out * output_w + w_out) * channels * kernel_h * kernel_w + |
| channel_in; |
| int dh = 0; |
| for (int i = 0; i < kernel_h; ++i) { |
| int dw = 0; |
| for (int j = 0; j < kernel_w; ++j) { |
| const int h = h_in + dh; |
| const int w = w_in + dw; |
| #if __CUDA_ARCH__ >= 350 |
| *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) |
| ? __ldg(img_data + (h * input_w + w) * channels + channel_in) |
| : 0; |
| #else |
| *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w) |
| ? img_data[(h * input_w + w) * channels + channel_in] |
| : 0; |
| #endif |
| col_data_ptr += channels; |
| dw += dilation_w; |
| } |
| dh += dilation_h; |
| } |
| } |
| } |
| |
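// Col2Im for NCHW data: each thread accumulates into one image element the
// contributions of every patch position in col_data that covers it.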
| template <typename T> |
| __global__ void Col2ImNCHWCUDAKernel( |
| const int n, |
| const int input_h, |
| const int input_w, |
| const int patch_h, |
| const int patch_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* col_data, |
| T* img_data) { |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| const int w = index % input_w + pad_l; |
| const int h = index / input_w % input_h + pad_t; |
| const int c = index / (input_h * input_w); |
| |
    // Compute the range of column (output) positions whose patches cover
    // this input element.
| const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| const int w_col_end = min(w / stride_w + 1, output_w); |
| const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| const int h_col_end = min(h / stride_h + 1, output_h); |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = (h - h_col * stride_h); |
| int w_k = (w - w_col * stride_w); |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| const int col_data_index = |
| (((c * patch_h + h_k) * patch_w + w_k) * output_h + h_col) * |
| output_w + |
| w_col; |
| #if __CUDA_ARCH__ >= 350 |
| val += __ldg(col_data + col_data_index); |
| #else |
| val += col_data[col_data_index]; |
| #endif |
| } |
| } |
| } |
| img_data[index] = val; |
| } |
| } |
| |
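// Col2Im for NHWC data: each thread accumulates into one image element the
// contributions of every patch position in col_data that covers it.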
| template <typename T> |
| __global__ void Col2ImNHWCCUDAKernel( |
| const int n, |
| const int input_w, |
| const int channels, |
| const int patch_h, |
| const int patch_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int stride_h, |
| const int stride_w, |
| const int output_h, |
| const int output_w, |
| const T* col_data, |
| T* img_data) { |
| const int dpatch_h = dilation_h * (patch_h - 1) + 1; |
| const int dpatch_w = dilation_w * (patch_w - 1) + 1; |
| |
| CUDA_1D_KERNEL_LOOP(index, n) { |
| T val = 0; |
| const int c = index % channels; |
| const int w = index / channels % input_w + pad_l; |
| const int h = index / channels / input_w + pad_t; |
    // Compute the range of column (output) positions whose patches cover
    // this input element.
| const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1; |
| const int w_col_end = min(w / stride_w + 1, output_w); |
| const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1; |
| const int h_col_end = min(h / stride_h + 1, output_h); |
| const int channels_col = patch_h * patch_w * channels; |
| |
| for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { |
| for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { |
| int h_k = h - h_col * stride_h; |
| int w_k = w - w_col * stride_w; |
| if (h_k % dilation_h == 0 && w_k % dilation_w == 0) { |
| h_k /= dilation_h; |
| w_k /= dilation_w; |
| const int c_col = (h_k * patch_w + w_k) * channels + c; |
| #if __CUDA_ARCH__ >= 350 |
| val += __ldg( |
| col_data + (h_col * output_w + w_col) * channels_col + c_col); |
| #else |
| val += col_data[(h_col * output_w + w_col) * channels_col + c_col]; |
| #endif |
| } |
| } |
| } |
| img_data[index] = val; |
| } |
| } |
| |
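// N-dimensional Im2Col/Col2Im for NCHW data, selected at compile time via
// kCol2Im. In Col2Im mode the contributions are accumulated with atomicAdd,
// so Y_data must be zero-initialized before the kernel is launched.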
| template <typename T, int N, bool kCol2Im> |
| __global__ void Im2ColNdNCHWCUDAKernel( |
| const int outer_size, |
| const int inner_size, |
| const int kernel_size, |
| SimpleArray<int, N + 1> img_shape, |
| SimpleArray<int, N + 1> col_shape, |
| SimpleArray<int, N> kernel_shape, |
| SimpleArray<int, N> stride, |
| SimpleArray<int, N> dilation, |
| SimpleArray<int, N> pad, |
| const T* X_data, |
| T* Y_data) { |
| int d_offset[N]; |
| int d_iter[N]; |
| for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { |
| int offset_i = i; |
| #pragma unroll |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| d_offset[d_i] = offset_i % kernel_shape.data[d_i]; |
| offset_i /= kernel_shape.data[d_i]; |
| } |
| for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { |
| int offset_j = j; |
| #pragma unroll |
| for (int d_i = N - 1; d_i >= 0; --d_i) { |
| d_iter[d_i] = offset_j % col_shape.data[d_i + 1]; |
| offset_j /= col_shape.data[d_i + 1]; |
| } |
| const int col_index = i * inner_size + j; |
| int img_index = i / kernel_size; |
| bool is_padding = false; |
| #pragma unroll |
| for (int d_i = 0; d_i < N; ++d_i) { |
| const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] + |
| d_offset[d_i] * dilation.data[d_i]; |
| is_padding |= d_img < 0 || d_img >= img_shape.data[d_i + 1]; |
| img_index = img_index * img_shape.data[d_i + 1] + d_img; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : __ldg(X_data + img_index); |
| } else if (!is_padding) { |
| atomicAdd(Y_data + img_index, __ldg(X_data + col_index)); |
| } |
| #else |
| if (!kCol2Im) { |
| Y_data[col_index] = is_padding ? 0 : X_data[img_index]; |
| } else if (!is_padding) { |
| atomicAdd(Y_data + img_index, X_data[col_index]); |
| } |
| #endif |
| } |
| } |
| } |
| |
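// Host-side launcher for the N-dimensional Im2Col kernel: copies the shape,
// stride, dilation and pad arrays into fixed-size SimpleArrays that can be
// passed to the kernel by value.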
| template <typename T, int N> |
| void Im2ColNdNCHWCUDAImpl( |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
    const T* img_data,
    T* col_data,
| CUDAContext* context) { |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| SimpleArray<int, N + 1> img_shape_array; |
| SimpleArray<int, N + 1> col_shape_array; |
| SimpleArray<int, N> kernel_shape_array; |
| SimpleArray<int, N> stride_array; |
| SimpleArray<int, N> dilation_array; |
| SimpleArray<int, N> pad_array; |
| std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int)); |
| std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int)); |
| std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int)); |
| std::memcpy(stride_array.data, stride, N * sizeof(int)); |
| std::memcpy(dilation_array.data, dilation, N * sizeof(int)); |
| std::memcpy(pad_array.data, pad, N * sizeof(int)); |
| Im2ColNdNCHWCUDAKernel<T, N, false> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, |
| inner_size, |
| kernel_size, |
| img_shape_array, |
| col_shape_array, |
| kernel_shape_array, |
| stride_array, |
| dilation_array, |
| pad_array, |
| img_data, |
| col_data); |
| } |
| |
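// Host-side launcher for the N-dimensional Col2Im kernel: zeroes the image
// buffer first, since the kernel accumulates column contributions with
// atomicAdd.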
| template <typename T, int N> |
| void Col2ImNdNCHWCUDAImpl( |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
    const T* col_data,
    T* img_data,
| CUDAContext* context) { |
| const int outer_size = col_shape[0]; |
| const int inner_size = col_size / outer_size; |
| const int kernel_size = std::accumulate( |
| kernel_shape, kernel_shape + N, 1, std::multiplies<int>()); |
| SimpleArray<int, N + 1> img_shape_array; |
| SimpleArray<int, N + 1> col_shape_array; |
| SimpleArray<int, N> kernel_shape_array; |
| SimpleArray<int, N> stride_array; |
| SimpleArray<int, N> dilation_array; |
| SimpleArray<int, N> pad_array; |
| std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int)); |
| std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int)); |
| std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int)); |
| std::memcpy(stride_array.data, stride, N * sizeof(int)); |
| std::memcpy(dilation_array.data, dilation, N * sizeof(int)); |
| std::memcpy(pad_array.data, pad, N * sizeof(int)); |
| Set<T, CUDAContext>(img_size, 0, img_data, context); |
| Im2ColNdNCHWCUDAKernel<T, N, true> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, |
| inner_size, |
| kernel_size, |
| img_shape_array, |
| col_shape_array, |
| kernel_shape_array, |
| stride_array, |
| dilation_array, |
| pad_array, |
| col_data, |
| img_data); |
| } |
| |
| } // namespace |
| |
| template <> |
| void Im2Col<float, CUDAContext, StorageOrder::NCHW>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context) { |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = channels * output_h * output_w; |
| Im2ColNCHWCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| img_data, |
| col_data); |
| } |
| |
| template <> |
| void Im2Col<float, CUDAContext, StorageOrder::NHWC>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context) { |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = output_h * output_w * channels; |
| Im2ColNHWCCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_w, |
| channels, |
| img_data, |
| col_data); |
| } |
| |
| template <> |
| void Col2Im<float, CUDAContext, StorageOrder::NCHW>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context) { |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = channels * height * width; |
| Col2ImNCHWCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| height, |
| width, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| col_data, |
| img_data); |
| } |
| |
| template <> |
| void Col2Im<float, CUDAContext, StorageOrder::NHWC>( |
| const int channels, |
| const int height, |
| const int width, |
| const int kernel_h, |
| const int kernel_w, |
| const int dilation_h, |
| const int dilation_w, |
| const int pad_t, |
| const int pad_l, |
| const int pad_b, |
| const int pad_r, |
| const int stride_h, |
| const int stride_w, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context) { |
| const int dkernel_h = dilation_h * (kernel_h - 1) + 1; |
| const int dkernel_w = dilation_w * (kernel_w - 1) + 1; |
| const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1; |
| const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1; |
| const int num_kernels = height * width * channels; |
| Col2ImNHWCCUDAKernel<float> |
| <<<CAFFE_GET_BLOCKS(num_kernels), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| num_kernels, |
| width, |
| channels, |
| kernel_h, |
| kernel_w, |
| dilation_h, |
| dilation_w, |
| pad_t, |
| pad_l, |
| stride_h, |
| stride_w, |
| output_h, |
| output_w, |
| col_data, |
| img_data); |
| } |
| |
| template <> |
| void Im2ColNd<float, CUDAContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* img_data, |
| float* col_data, |
| CUDAContext* context) { |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( |
| N, |
| Im2ColNdNCHWCUDAImpl, |
| float, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| img_data, |
| col_data, |
| context); |
| } |
| |
| template <> |
| void Col2ImNd<float, CUDAContext, StorageOrder::NCHW>( |
| const int N, |
| const int img_size, |
| const int col_size, |
| const int* img_shape, |
| const int* col_shape, |
| const int* kernel_shape, |
| const int* stride, |
| const int* dilation, |
| const int* pad, |
| const float* col_data, |
| float* img_data, |
| CUDAContext* context) { |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( |
| N, |
| Col2ImNdNCHWCUDAImpl, |
| float, |
| img_size, |
| col_size, |
| img_shape, |
| col_shape, |
| kernel_shape, |
| stride, |
| dilation, |
| pad, |
| col_data, |
| img_data, |
| context); |
| } |
| |
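// Copies an M x N matrix of itemsize-byte elements between device buffers
// with leading dimensions lda and ldb using cudaMemcpy2DAsync; typed
// (non-trivial) copies are not supported in the CUDA context.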
| template <> |
| void CopyMatrix<CUDAContext>( |
| const size_t itemsize, |
| const int M, |
| const int N, |
| const void* A, |
| const int lda, |
| void* B, |
| const int ldb, |
| CUDAContext* context, |
| TypeMeta::TypedCopy copy) { |
| CAFFE_ENFORCE(!copy, "Copy constructor is not supported in CUDA context"); |
| cudaMemcpy2DAsync( |
| B, |
| ldb * itemsize, |
| A, |
| lda * itemsize, |
| N * itemsize, |
| M, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| |
| template <> |
| void CopyVector<float, CUDAContext>( |
| const int N, |
| const float* src, |
| float* dst, |
| CUDAContext* context) { |
| if (src != dst && N > 0) { |
| cudaMemcpyAsync( |
| dst, |
| src, |
| sizeof(float) * N, |
| cudaMemcpyDeviceToDevice, |
| context->cuda_stream()); |
| } |
| } |
| |
| namespace { |
| |
| template <typename T> |
| using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>; |
| |
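// Reduces each row of a rows x cols matrix with the given reducer, using one
// thread block per row and a CUB block reduce for the final combine.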
| template <typename T, class Reducer> |
| __global__ void RowwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < rows; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < cols; j += blockDim.x) { |
| val = reducer(X[i * cols + j], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val; |
| } |
| __syncthreads(); |
| } |
| } |
| |
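// Reduces each column of a rows x cols matrix with the given reducer, using
// one thread block per column.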
| template <typename T, class Reducer> |
| __global__ void ColwiseReduceKernel( |
| const int rows, |
| const int cols, |
| const Reducer reducer, |
| const T init, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < cols; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < rows; j += blockDim.x) { |
| val = reducer(X[j * cols + i], val); |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \ |
| template <> \ |
| void RowwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| RowwiseReduceKernel<<< \ |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX |
| |
| #define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \ |
| template <> \ |
| void ColwiseMax<T, CUDAContext>( \ |
| const int N, const int D, const T* x, T* y, CUDAContext* context) { \ |
| ColwiseReduceKernel<<< \ |
| std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \ |
| CAFFE_CUDA_NUM_THREADS, \ |
| 0, \ |
| context->cuda_stream()>>>( \ |
| N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX |
| |
| namespace { |
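// Computes y[i] = max(x[i], alpha) element-wise.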
| __global__ void |
| maximum_kernel(const int N, const float alpha, const float* x, float* y) { |
| CUDA_1D_KERNEL_LOOP(i, N) { |
| y[i] = fmaxf(x[i], alpha); |
| } |
| } |
| } // namespace |
| |
| template <> |
| void Maximum( |
| const int N, |
| const float alpha, |
| const float* x, |
| float* y, |
| CUDAContext* context) { |
| maximum_kernel<<< |
| std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(N, alpha, x, y); |
| } |
| |
| namespace { |
| |
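// Generic strided reduction: each block reduces one outer slice of the
// logically transposed input, using X_strides to map transposed indices back
// to the original memory layout.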
| template <typename T, class Reducer, int D> |
| __global__ void ReduceTensorCUDAKernel( |
| const int outer_size, |
| const int inner_size, |
| SimpleArray<int, D> X_strides, |
| SimpleArray<int, D> Y_dims, |
| const Reducer reducer, |
| const T init, |
| const T* X, |
| T* Y) { |
| __shared__ typename BlockReduce<T>::TempStorage temp_storage; |
| for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { |
| T val = init; |
| for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { |
| int X_index = 0; |
| int Y_index = i * inner_size + j; |
| #pragma unroll |
| for (int d = D - 1; d >= 0; --d) { |
| X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d]; |
| Y_index /= Y_dims.data[d]; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| val = reducer(val, __ldg(X + X_index)); |
| #else |
| val = reducer(val, X[X_index]); |
| #endif |
| } |
| val = BlockReduce<T>(temp_storage).Reduce(val, reducer); |
| if (threadIdx.x == 0) { |
| Y[i] = val; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| template <typename T, class Reducer, int D> |
| void ReduceTensorCUDAImpl( |
| const int outer_size, |
| const int inner_size, |
| const int* dims, |
| const int* axes, |
| const Reducer& reducer, |
| const T& init, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides; |
| SimpleArray<int, D> Y_dims; |
| utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); |
| for (int i = 0; i < D; ++i) { |
| Y_dims.data[i] = dims[axes[i]]; |
| } |
| ReduceTensorCUDAKernel<T, Reducer, D> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, inner_size, X_strides, Y_dims, reducer, init, X, Y); |
| } |
| |
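// Reduces X over the given axes by logically moving them to the innermost
// positions. If the reduced axes are already the trailing dimensions, the
// data for each output element is contiguous and the rowwise kernel is used
// directly.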
| template <typename T, class Reducer> |
| void ReduceTensorCUDA( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const Reducer& reducer, |
| const T& init, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| std::vector<int> transpose_axes(num_dims); |
| utils::ComputeTransposeAxesForReduceOp( |
| num_dims, num_axes, axes, transpose_axes.data()); |
| const int pivot = num_dims - num_axes; |
| int outer_size = 1; |
| for (int i = 0; i < pivot; ++i) { |
| outer_size *= dims[transpose_axes[i]]; |
| } |
| int inner_size = 1; |
| for (int i = pivot; i < num_dims; ++i) { |
| inner_size *= dims[transpose_axes[i]]; |
| } |
| if (transpose_axes[pivot] == pivot) { |
| RowwiseReduceKernel<T> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, inner_size, reducer, init, X, Y); |
| return; |
| } |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2( |
| num_dims, |
| ReduceTensorCUDAImpl, |
| T, |
| Reducer, |
| outer_size, |
| inner_size, |
| dims, |
| transpose_axes.data(), |
| reducer, |
| init, |
| X, |
| Y, |
| context); |
| } |
| |
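// Computes the mean over the given axes as a sum reduction followed by a
// scale with the reciprocal of the number of reduced elements.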
| template <typename T> |
| void ReduceMeanCUDAImpl( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| ReduceTensorCUDA( |
| num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); |
| const int X_size = |
| std::accumulate(dims, dims + num_dims, 1, std::multiplies<int>()); |
| int scale = 1; |
| for (int i = 0; i < num_axes; ++i) { |
| scale *= dims[axes[i]]; |
| } |
| const int Y_size = X_size / scale; |
| Scale<T, CUDAContext>( |
| Y_size, 1.0f / static_cast<float>(scale), Y, Y, context); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \ |
| template <> \ |
| void ReduceMin<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| ReduceTensorCUDA( \ |
| num_dims, \ |
| dims, \ |
| num_axes, \ |
| axes, \ |
| cub::Min(), \ |
| std::numeric_limits<T>::max(), \ |
| X, \ |
| Y, \ |
| context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int32_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int64_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(float) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \ |
| template <> \ |
| void ReduceMax<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| ReduceTensorCUDA( \ |
| num_dims, \ |
| dims, \ |
| num_axes, \ |
| axes, \ |
| cub::Max(), \ |
| std::numeric_limits<T>::lowest(), \ |
| X, \ |
| Y, \ |
| context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int32_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int64_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \ |
| template <> \ |
| void ReduceSum<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| ReduceTensorCUDA( \ |
| num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(float) |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM |
| |
| #define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \ |
| template <> \ |
| void ReduceMean<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| ReduceMeanCUDAImpl<T>(num_dims, dims, num_axes, axes, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN |
| |
| namespace { |
| |
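// Broadcasts X into Y: broadcast dimensions carry a stride of 0, so the same
// input element is reused across those output dimensions.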
| template <typename T, int D> |
| __global__ void BroadcastCUDAKernel( |
| const int Y_size, |
| const SimpleArray<int, D> X_strides, |
| const SimpleArray<int, D> Y_dims, |
| const T* X, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(Y_index, Y_size) { |
| int X_index = 0; |
| int Y_index_val = Y_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| X_index += X_strides.data[i] == 0 |
| ? 0 |
| : (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; |
| Y_index_val /= Y_dims.data[i]; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| Y[Y_index] = __ldg(X + X_index); |
| #else |
| Y[Y_index] = X[X_index]; |
| #endif |
| } |
| } |
| |
| template <typename T, int D> |
| void BroadcastCUDAImpl( |
| const int X_ndim, |
| const int* X_dims, |
| const int* Y_dims, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides_array; |
| SimpleArray<int, D> Y_dims_array; |
| const int d = D - X_ndim; |
| std::fill(X_strides_array.data, X_strides_array.data + d, 0); |
| int cur_stride = 1; |
| for (int i = D - 1; i >= d; --i) { |
| CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]); |
| X_strides_array.data[i] = X_dims[i - d] == 1 ? 0 : cur_stride; |
| cur_stride *= X_dims[i - d]; |
| } |
| std::copy_n(Y_dims, D, Y_dims_array.data); |
| const int Y_size = |
| std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>()); |
| BroadcastCUDAKernel<T, D> |
| <<<CAFFE_GET_BLOCKS(Y_size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(Y_size, X_strides_array, Y_dims_array, X, Y); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \ |
| template <> \ |
| void Broadcast<T, CUDAContext>( \ |
| const int X_ndim, \ |
| const int* X_dims, \ |
| const int Y_ndim, \ |
| const int* Y_dims, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \ |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ |
| Y_ndim, BroadcastCUDAImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int32_t) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int64_t) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(float) |
| CAFFE2_SPECIALIZED_CUDA_BROADCAST(double) |
| #undef CAFFE2_SPECIALIZED_CUDA_BROADCAST |
| |
| namespace { |
| |
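// Computes the per-row mean and variance of a rows x cols matrix in a single
// pass, using Var[X] = E[X^2] - E[X]^2 and one thread block per row.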
| template <typename T> |
| __global__ void RowwiseMomentsCUDAKernel( |
| const int rows, |
| const int cols, |
| const T* X, |
| T* mean, |
| T* variance) { |
| __shared__ typename BlockReduce<T>::TempStorage m_storage; |
| __shared__ typename BlockReduce<T>::TempStorage v_storage; |
| for (int i = blockIdx.x; i < rows; i += gridDim.x) { |
| T m_val = 0; |
| T v_val = 0; |
| for (int j = threadIdx.x; j < cols; j += blockDim.x) { |
| const int X_index = i * cols + j; |
| #if __CUDA_ARCH__ >= 350 |
| m_val += __ldg(X + X_index); |
| v_val += __ldg(X + X_index) * __ldg(X + X_index); |
| #else |
| m_val += X[X_index]; |
| v_val += X[X_index] * X[X_index]; |
| #endif |
| } |
| m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum()); |
| v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum()); |
| if (threadIdx.x == 0) { |
| mean[i] = m_val / static_cast<T>(cols); |
| variance[i] = v_val / static_cast<T>(cols) - mean[i] * mean[i]; |
| } |
| __syncthreads(); |
| } |
| } |
| |
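// Strided variant of the moments kernel for reductions over arbitrary
// (transposed) axes.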
| template <typename T, int D> |
| __global__ void MomentsCUDAKernel( |
| const int outer_size, |
| const int inner_size, |
| SimpleArray<int, D> X_strides, |
| SimpleArray<int, D> Y_dims, |
| const T* X, |
| T* mean, |
| T* variance) { |
| __shared__ typename BlockReduce<T>::TempStorage m_storage; |
| __shared__ typename BlockReduce<T>::TempStorage v_storage; |
| for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { |
| T m_val = 0; |
| T v_val = 0; |
| for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { |
| int X_index = 0; |
| int Y_index = i * inner_size + j; |
| #pragma unroll |
      for (int d = D - 1; d >= 0; --d) {
        X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d];
        Y_index /= Y_dims.data[d];
      }
| #if __CUDA_ARCH__ >= 350 |
| m_val += __ldg(X + X_index); |
| v_val += __ldg(X + X_index) * __ldg(X + X_index); |
| #else |
| m_val += X[X_index]; |
| v_val += X[X_index] * X[X_index]; |
| #endif |
| } |
| m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum()); |
| v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum()); |
| if (threadIdx.x == 0) { |
| mean[i] = m_val / static_cast<T>(inner_size); |
| variance[i] = v_val / static_cast<T>(inner_size) - mean[i] * mean[i]; |
| } |
| __syncthreads(); |
| } |
| } |
| |
| template <typename T, int D> |
| void MomentsCUDAImpl( |
| const int outer_size, |
| const int inner_size, |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* mean, |
| T* variance, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides; |
| SimpleArray<int, D> Y_dims; |
| utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); |
| for (int i = 0; i < D; ++i) { |
| Y_dims.data[i] = dims[axes[i]]; |
| } |
| MomentsCUDAKernel<T, D> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>( |
| outer_size, inner_size, X_strides, Y_dims, X, mean, variance); |
| } |
| |
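// Mirrors ReduceTensorCUDA: moves the reduced axes to the innermost positions
// and falls back to the rowwise kernel when the reduced data is already
// contiguous.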
| template <typename T> |
| void MomentsCUDA( |
| const int num_dims, |
| const int* dims, |
| const int num_axes, |
| const int* axes, |
| const T* X, |
| T* mean, |
| T* variance, |
| CUDAContext* context) { |
| CAFFE_ENFORCE_LE(num_axes, num_dims); |
| std::vector<int> transpose_axes(num_dims); |
| utils::ComputeTransposeAxesForReduceOp( |
| num_dims, num_axes, axes, transpose_axes.data()); |
| const int pivot = num_dims - num_axes; |
| int outer_size = 1; |
| for (int i = 0; i < pivot; ++i) { |
| outer_size *= dims[transpose_axes[i]]; |
| } |
| int inner_size = 1; |
| for (int i = pivot; i < num_dims; ++i) { |
| inner_size *= dims[transpose_axes[i]]; |
| } |
| if (transpose_axes[pivot] == pivot) { |
| RowwiseMomentsCUDAKernel<T> |
| <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(outer_size, inner_size, X, mean, variance); |
| return; |
| } |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( |
| num_dims, |
| MomentsCUDAImpl, |
| T, |
| outer_size, |
| inner_size, |
| dims, |
| transpose_axes.data(), |
| X, |
| mean, |
| variance, |
| context); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \ |
| template <> \ |
| void Moments<T, CUDAContext>( \ |
| const int num_dims, \ |
| const int* dims, \ |
| const int num_axes, \ |
| const int* axes, \ |
| const T* X, \ |
| T* mean, \ |
| T* variance, \ |
| CUDAContext* context) { \ |
| MomentsCUDA<T>( \ |
| num_dims, dims, num_axes, axes, X, mean, variance, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_MOMENTS(float) |
| #undef CAFFE2_SPECIALIZED_CUDA_MOMENTS |
| |
| namespace { |
| |
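// Permutes X into Y: each output index is mapped back to its source element
// through the transposed strides.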
| template <typename T, int D> |
| __global__ void TransposeCUDAKernel( |
| const int size, |
| const SimpleArray<int, D> X_strides, |
| const SimpleArray<int, D> Y_dims, |
| const T* X, |
| T* Y) { |
| CUDA_1D_KERNEL_LOOP(Y_index, size) { |
| int X_index = 0; |
| int Y_index_val = Y_index; |
| #pragma unroll |
| for (int i = D - 1; i >= 0; --i) { |
| X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i]; |
| Y_index_val /= Y_dims.data[i]; |
| } |
| #if __CUDA_ARCH__ >= 350 |
| Y[Y_index] = __ldg(X + X_index); |
| #else |
| Y[Y_index] = X[X_index]; |
| #endif |
| } |
| } |
| |
| template <typename T, int D> |
| void TransposeCUDAImpl( |
| const int* dims, |
| const int* axes, |
| const T* X, |
| T* Y, |
| CUDAContext* context) { |
| SimpleArray<int, D> X_strides; |
| SimpleArray<int, D> Y_dims; |
| utils::ComputeTransposedStrides(D, dims, axes, X_strides.data); |
| int size = 1; |
| for (int i = 0; i < D; ++i) { |
| Y_dims.data[i] = dims[axes[i]]; |
| size *= dims[i]; |
| } |
| TransposeCUDAKernel<T, D> |
| <<<CAFFE_GET_BLOCKS(size), |
| CAFFE_CUDA_NUM_THREADS, |
| 0, |
| context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y); |
| } |
| |
| } // namespace |
| |
| #define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \ |
| template <> \ |
| void Transpose<T, CUDAContext>( \ |
| const int ndim, \ |
| const int* dims, \ |
| const int* axes, \ |
| const T* X, \ |
| T* Y, \ |
| CUDAContext* context) { \ |
| DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \ |
| ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \ |
| } |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int) |
| CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex) |
| #undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE |
| |
| } // namespace math |
| } // namespace caffe2 |