// caffe2/utils/math/elementwise.cu - platform/external/pytorch - Git at Google

 #include "caffe2/utils/math/elementwise.h"

 #include <thrust/functional.h>

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/utils/conversions.h"
 #include "caffe2/utils/math/half_utils.h"
 #include "caffe2/utils/math/utils.h"

 namespace caffe2 {
 namespace math {

 namespace {

 // Generates the element-wise unary kernel Func##CUDAKernel for element type T.
 // Every thread derives one flat index from its block and thread ids and, when
 // that index is below N, writes DeviceFunc(X[index]) to Y[index].
 // The per-block stride is CAFFE_CUDA_NUM_THREADS (not blockDim.x), so the
 // kernel has to be launched with exactly that many threads per block.
 #define DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(T, Func, DeviceFunc)  \
   __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) {      \
     const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \
     if (index >= N) {                                                    \
       return; /* thread lies past the end of the input */                \
     }                                                                    \
     Y[index] = DeviceFunc(X[index]);                                     \
   }
 // Instantiations of the unary kernel generator. Each line yields one
 // <Func>CUDAKernel; kernels sharing a name (Erf, Cube, Neg, Sign, Inv) are
 // overloads that differ only in their element type.
 //
 // float-only kernels forwarding to the single-precision device math functions
 // (Sqr uses utils::Square<float>).
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Exp, expf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Log, logf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cos, cosf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Acos, acosf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sin, sinf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Asin, asinf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tan, tanf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Atan, atanf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sinh, sinhf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cosh, coshf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tanh, tanhf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Abs, fabsf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqr, utils::Square<float>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqrt, sqrtf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Rsqrt, rsqrtf)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cbrt, cbrtf)
 // Erf: erff for float, erf for double.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Erf, erff)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Erf, erf)
 // Cube: utils::Cube<T> for int32, int64, float and double.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int32_t,
     Cube,
     utils::Cube<std::int32_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int64_t,
     Cube,
     utils::Cube<std::int64_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cube, utils::Cube<float>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Cube, utils::Cube<double>)
 // Not: bool only, via utils::Not<bool>.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(bool, Not, utils::Not<bool>)
 // Neg: utils::Negate<T> for int32, int64, float and double.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int32_t,
     Neg,
     utils::Negate<std::int32_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int64_t,
     Neg,
     utils::Negate<std::int64_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Neg, utils::Negate<float>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Neg, utils::Negate<double>)
 // Sign: utils::Sign<T> for int32, int64, float and double.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int32_t,
     Sign,
     utils::Sign<std::int32_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
     std::int64_t,
     Sign,
     utils::Sign<std::int64_t>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sign, utils::Sign<float>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Sign, utils::Sign<double>)
 // Inv: utils::Inv<T> for float and double.
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Inv, utils::Inv<float>)
 DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Inv, utils::Inv<double>)
 // The generator macro is not used past this point.
 #undef DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION

 // Element-wise sincos kernel. Each thread owns at most one index; for an index
 // below N it hands X[index] to c10::cuda::compat::sincos together with the
 // output slots S + index and C + index (presumably sine and cosine, following
 // the usual sincos argument order).
 // The input is read through __ldg when compiling for CUDA arch >= 350 or for
 // the HIP platform, and through a plain load otherwise.
 // The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
 // threads per block.
 template <typename T>
 __global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
   const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (index >= N) {
     return;
   }
 #if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
   const T x = __ldg(X + index);
 #else
   const T x = X[index];
 #endif
   c10::cuda::compat::sincos(x, S + index, C + index);
 }

 // Element-wise binary kernel: for every index below N, stores
 // func(A[index], B[index]) into C[index]. Func is any callable usable in
 // device code that accepts two T values and yields something assignable to T.
 // The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
 // threads per block.
 template <typename T, class Func>
 __global__ void SimpleBinaryCUDAKernel(
     const int N,
     const Func func,
     const T* A,
     const T* B,
     T* C) {
   const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (index >= N) {
     return;
   }
   C[index] = func(A[index], B[index]);
 }

 // Element-wise comparison kernel: for every index below N, stores the result
 // of comp(A[index], B[index]) into the bool output C[index].
 // The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
 // threads per block.
 template <typename T, class Comp>
 __global__ void SimpleCompareCUDAKernel(
     const int N,
     const Comp comp,
     const T* A,
     const T* B,
     bool* C) {
   const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
   if (index >= N) {
     return;
   }
   C[index] = comp(A[index], B[index]);
 }

 } // namespace

 // Defines the Func<T, CUDAContext> specialization for an element-wise unary
 // operation by launching Func##CUDAKernel on the context's CUDA stream.
 //   N       - number of elements; nothing is launched when N <= 0.
 //   X, Y    - input and output pointers handed straight to the kernel.
 //   context - supplies the stream via context->cuda_stream().
 // The grid has DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of
 // CAFFE_CUDA_NUM_THREADS threads each.
 // After the launch, cudaGetLastError() is checked so that an invalid launch
 // configuration raises through CUDA_ENFORCE instead of being silently dropped.
 // Faults that occur while the kernel executes are not detected here.
 #define DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func)           \
   template <>                                                \
   CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(              \
       const int N, const T* X, T* Y, CUDAContext* context) { \
     if (N > 0) {                                             \
       const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);        \
       Func##CUDAKernel<<<                                    \
           M,                                                 \
           CAFFE_CUDA_NUM_THREADS,                            \
           0,                                                 \
           context->cuda_stream()>>>(N, X, Y);                \
       CUDA_ENFORCE(cudaGetLastError());                      \
     }                                                        \
   }
 // Func<T, CUDAContext> specializations for the unary operations. Each entry
 // expands to a launcher for the matching <Func>CUDAKernel overload of type T.
 //
 // float-only operations.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt)
 // Erf: float and double.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf)
 // Cube: float, double, int32, int64.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Cube)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Cube)
 // Not: bool only.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not)
 // Neg: float, double, int32, int64.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg)
 // Sign: float, double, int32, int64.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign)
 // Inv: float and double.
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv)
 DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv)
 // The launcher macro is not used past this point.
 #undef DEFINE_SIMPLE_CUDA_UNARY_FUNCTION

 // Defines SinCos<T, CUDAContext>: launches SinCosCUDAKernel<T> on the
 // context's CUDA stream for N input elements X with outputs S and C.
 // Nothing is launched when N <= 0. The grid has
 // DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads.
 // cudaGetLastError() is checked right after the launch so that an invalid
 // launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
 // Faults that occur while the kernel executes are not detected here.
 #define CAFFE2_SPECIALIZED_CUDA_SINCOS(T)                             \
   template <>                                                         \
   CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>(                     \
       const int N, const T* X, T* S, T* C, CUDAContext* context) {    \
     if (N > 0) {                                                      \
       const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS);                 \
       SinCosCUDAKernel<T>                                             \
           <<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
               N, X, S, C);                                            \
       CUDA_ENFORCE(cudaGetLastError());                               \
     }                                                                 \
   }
 // SinCos is provided for float and double.
 CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
 CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
 #undef CAFFE2_SPECIALIZED_CUDA_SINCOS

 // Defines the Func<T, CUDAContext> specialization for an element-wise binary
 // operation. DeviceFunc is a functor object passed by value to
 // SimpleBinaryCUDAKernel, which applies it to A and B and writes to C.
 // Nothing is launched when N <= 0. The grid has
 // DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads and
 // runs on context->cuda_stream().
 // cudaGetLastError() is checked right after the launch so that an invalid
 // launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
 // Faults that occur while the kernel executes are not detected here.
 #define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc)        \
   template <>                                                            \
   CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(                          \
       const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
     if (N > 0) {                                                         \
       const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);                    \
       SimpleBinaryCUDAKernel<<<                                          \
           M,                                                             \
           CAFFE_CUDA_NUM_THREADS,                                        \
           0,                                                             \
           context->cuda_stream()>>>(N, DeviceFunc, A, B, C);             \
       CUDA_ENFORCE(cudaGetLastError());                                  \
     }                                                                    \
   }
 // Func<T, CUDAContext> specializations for the binary operations. Arithmetic
 // on built-in types uses thrust functors; at::Half uses the utils::Half*
 // functors.
 //
 // Add: int32, int64, float, double, at::Half.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     Add,
     thrust::plus<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     Add,
     thrust::plus<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
 // Sub: int32, int64, float, double, at::Half.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     Sub,
     thrust::minus<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     Sub,
     thrust::minus<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
 // Mul: int32, int64, float, double, at::Half.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     Mul,
     thrust::multiplies<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     Mul,
     thrust::multiplies<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
 // Div: int32, int64, float, double, at::Half.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     Div,
     thrust::divides<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     Div,
     thrust::divides<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
 // Min / Max: float and double only.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
 // Logical operations on bool. Xor is expressed with thrust::bit_xor<bool>.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
 // Bitwise And / Or / Xor: bool, int32, int64.
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     BitwiseAnd,
     thrust::bit_and<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     BitwiseAnd,
     thrust::bit_and<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     BitwiseOr,
     thrust::bit_or<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     BitwiseOr,
     thrust::bit_or<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int32_t,
     BitwiseXor,
     thrust::bit_xor<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
     std::int64_t,
     BitwiseXor,
     thrust::bit_xor<std::int64_t>())
 // The launcher macro is not used past this point.
 #undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

 // Defines the Func<T, CUDAContext> specialization for an element-wise
 // comparison. DeviceComp is a functor object passed by value to
 // SimpleCompareCUDAKernel, which applies it to A and B and writes bool results
 // to C.
 // Nothing is launched when N <= 0. The grid has
 // DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads and
 // runs on context->cuda_stream().
 // cudaGetLastError() is checked right after the launch so that an invalid
 // launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
 // Faults that occur while the kernel executes are not detected here.
 #define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp)          \
   template <>                                                               \
   CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(                             \
       const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
     if (N > 0) {                                                            \
       const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);                       \
       SimpleCompareCUDAKernel<<<                                            \
           M,                                                                \
           CAFFE_CUDA_NUM_THREADS,                                           \
           0,                                                                \
           context->cuda_stream()>>>(N, DeviceComp, A, B, C);                \
       CUDA_ENFORCE(cudaGetLastError());                                     \
     }                                                                       \
   }
 // Func<T, CUDAContext> specializations for the comparison operations. Every
 // comparison is provided for bool, int32, int64, float and double and uses the
 // corresponding thrust comparison functor.
 //
 // EQ: thrust::equal_to.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     EQ,
     thrust::equal_to<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     EQ,
     thrust::equal_to<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
 // NE: thrust::not_equal_to.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     NE,
     thrust::not_equal_to<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     NE,
     thrust::not_equal_to<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     double,
     NE,
     thrust::not_equal_to<double>())
 // LT: thrust::less.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     LT,
     thrust::less<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     LT,
     thrust::less<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
 // LE: thrust::less_equal.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     LE,
     thrust::less_equal<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     LE,
     thrust::less_equal<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
 // GT: thrust::greater.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     GT,
     thrust::greater<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     GT,
     thrust::greater<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
 // GE: thrust::greater_equal.
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int32_t,
     GE,
     thrust::greater_equal<std::int32_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     std::int64_t,
     GE,
     thrust::greater_equal<std::int64_t>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
 DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
     double,
     GE,
     thrust::greater_equal<double>())
 // The launcher macro is not used past this point.
 #undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION

 } // namespace math
 } // namespace caffe2
	#include "caffe2/utils/math/elementwise.h"

	#include <thrust/functional.h>

	#include "caffe2/core/context_gpu.h"
	#include "caffe2/utils/conversions.h"
	#include "caffe2/utils/math/half_utils.h"
	#include "caffe2/utils/math/utils.h"

	namespace caffe2 {
	namespace math {

	namespace {

	// Generates the element-wise unary kernel Func##CUDAKernel for element type T.
	// Every thread derives one flat index from its block and thread ids and, when
	// that index is below N, writes DeviceFunc(X[index]) to Y[index].
	// The per-block stride is CAFFE_CUDA_NUM_THREADS (not blockDim.x), so the
	// kernel has to be launched with exactly that many threads per block.
	#define DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(T, Func, DeviceFunc) \
	  __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
	    const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \
	    if (index >= N) { \
	      return; /* thread lies past the end of the input */ \
	    } \
	    Y[index] = DeviceFunc(X[index]); \
	  }
	// Instantiations of the unary kernel generator. Each line yields one
	// <Func>CUDAKernel; kernels sharing a name (Erf, Cube, Neg, Sign, Inv) are
	// overloads that differ only in their element type.
	//
	// float-only kernels forwarding to the single-precision device math functions
	// (Sqr uses utils::Square<float>).
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Exp, expf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Log, logf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cos, cosf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Acos, acosf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sin, sinf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Asin, asinf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tan, tanf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Atan, atanf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sinh, sinhf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cosh, coshf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tanh, tanhf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Abs, fabsf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqr, utils::Square<float>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqrt, sqrtf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Rsqrt, rsqrtf)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cbrt, cbrtf)
	// Erf: erff for float, erf for double.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Erf, erff)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Erf, erf)
	// Cube: utils::Cube<T> for int32, int64, float and double.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int32_t,
	Cube,
	utils::Cube<std::int32_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int64_t,
	Cube,
	utils::Cube<std::int64_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cube, utils::Cube<float>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Cube, utils::Cube<double>)
	// Not: bool only, via utils::Not<bool>.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(bool, Not, utils::Not<bool>)
	// Neg: utils::Negate<T> for int32, int64, float and double.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int32_t,
	Neg,
	utils::Negate<std::int32_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int64_t,
	Neg,
	utils::Negate<std::int64_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Neg, utils::Negate<float>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Neg, utils::Negate<double>)
	// Sign: utils::Sign<T> for int32, int64, float and double.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int32_t,
	Sign,
	utils::Sign<std::int32_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
	std::int64_t,
	Sign,
	utils::Sign<std::int64_t>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sign, utils::Sign<float>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Sign, utils::Sign<double>)
	// Inv: utils::Inv<T> for float and double.
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Inv, utils::Inv<float>)
	DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Inv, utils::Inv<double>)
	// The generator macro is not used past this point.
	#undef DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION

	// Element-wise sincos kernel. Each thread owns at most one index i; for i < N
	// it hands X[i] to c10::cuda::compat::sincos together with the output slots
	// S + i and C + i (presumably sine and cosine, following the usual sincos
	// argument order).
	// The input is read through __ldg when compiling for CUDA arch >= 350 or for
	// the HIP platform, and through a plain load otherwise.
	// The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
	// threads per block.
	template <typename T>
	__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
	  const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
	  if (i < N) {
	// The condition must use a plain logical-or; the escaped form "\|\|" is not a
	// valid preprocessor expression.
	#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
	    c10::cuda::compat::sincos(__ldg(X + i), S + i, C + i);
	#else
	    c10::cuda::compat::sincos(X[i], S + i, C + i);
	#endif
	  }
	}

	// Element-wise binary kernel: for every index below N, stores
	// func(A[index], B[index]) into C[index]. Func is any callable usable in
	// device code that accepts two T values and yields something assignable to T.
	// The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
	// threads per block.
	template <typename T, class Func>
	__global__ void SimpleBinaryCUDAKernel(
	    const int N,
	    const Func func,
	    const T* A,
	    const T* B,
	    T* C) {
	  const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
	  if (index >= N) {
	    return;
	  }
	  C[index] = func(A[index], B[index]);
	}

	// Element-wise comparison kernel: for every index below N, stores the result
	// of comp(A[index], B[index]) into the bool output C[index].
	// The index stride is CAFFE_CUDA_NUM_THREADS, so the launch must use that many
	// threads per block.
	template <typename T, class Comp>
	__global__ void SimpleCompareCUDAKernel(
	    const int N,
	    const Comp comp,
	    const T* A,
	    const T* B,
	    bool* C) {
	  const int index = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
	  if (index >= N) {
	    return;
	  }
	  C[index] = comp(A[index], B[index]);
	}

	} // namespace

	// Defines the Func<T, CUDAContext> specialization for an element-wise unary
	// operation by launching Func##CUDAKernel on the context's CUDA stream.
	//   N       - number of elements; nothing is launched when N <= 0.
	//   X, Y    - input and output pointers handed straight to the kernel.
	//   context - supplies the stream via context->cuda_stream().
	// The grid has DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of
	// CAFFE_CUDA_NUM_THREADS threads each.
	// After the launch, cudaGetLastError() is checked so that an invalid launch
	// configuration raises through CUDA_ENFORCE instead of being silently dropped.
	// Faults that occur while the kernel executes are not detected here.
	#define DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func) \
	  template <> \
	  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
	      const int N, const T* X, T* Y, CUDAContext* context) { \
	    if (N > 0) { \
	      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
	      Func##CUDAKernel<<< \
	          M, \
	          CAFFE_CUDA_NUM_THREADS, \
	          0, \
	          context->cuda_stream()>>>(N, X, Y); \
	      CUDA_ENFORCE(cudaGetLastError()); \
	    } \
	  }
	// Func<T, CUDAContext> specializations for the unary operations. Each entry
	// expands to a launcher for the matching <Func>CUDAKernel overload of type T.
	//
	// float-only operations.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt)
	// Erf: float and double.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf)
	// Cube: float, double, int32, int64.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Cube)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Cube)
	// Not: bool only.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not)
	// Neg: float, double, int32, int64.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg)
	// Sign: float, double, int32, int64.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign)
	// Inv: float and double.
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv)
	DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv)
	// The launcher macro is not used past this point.
	#undef DEFINE_SIMPLE_CUDA_UNARY_FUNCTION

	// Defines SinCos<T, CUDAContext>: launches SinCosCUDAKernel<T> on the
	// context's CUDA stream for N input elements X with outputs S and C.
	// Nothing is launched when N <= 0. The grid has
	// DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads.
	// cudaGetLastError() is checked right after the launch so that an invalid
	// launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
	// Faults that occur while the kernel executes are not detected here.
	#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
	  template <> \
	  CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>( \
	      const int N, const T* X, T* S, T* C, CUDAContext* context) { \
	    if (N > 0) { \
	      const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
	      SinCosCUDAKernel<T> \
	          <<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
	              N, X, S, C); \
	      CUDA_ENFORCE(cudaGetLastError()); \
	    } \
	  }
	// SinCos is provided for float and double.
	CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
	CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
	#undef CAFFE2_SPECIALIZED_CUDA_SINCOS

	// Defines the Func<T, CUDAContext> specialization for an element-wise binary
	// operation. DeviceFunc is a functor object passed by value to
	// SimpleBinaryCUDAKernel, which applies it to A and B and writes to C.
	// Nothing is launched when N <= 0. The grid has
	// DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads and
	// runs on context->cuda_stream().
	// cudaGetLastError() is checked right after the launch so that an invalid
	// launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
	// Faults that occur while the kernel executes are not detected here.
	#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc) \
	  template <> \
	  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
	      const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
	    if (N > 0) { \
	      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
	      SimpleBinaryCUDAKernel<<< \
	          M, \
	          CAFFE_CUDA_NUM_THREADS, \
	          0, \
	          context->cuda_stream()>>>(N, DeviceFunc, A, B, C); \
	      CUDA_ENFORCE(cudaGetLastError()); \
	    } \
	  }
	// Func<T, CUDAContext> specializations for the binary operations. Arithmetic
	// on built-in types uses thrust functors; at::Half uses the utils::Half*
	// functors.
	//
	// Add: int32, int64, float, double, at::Half.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	Add,
	thrust::plus<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	Add,
	thrust::plus<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
	// Sub: int32, int64, float, double, at::Half.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	Sub,
	thrust::minus<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	Sub,
	thrust::minus<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
	// Mul: int32, int64, float, double, at::Half.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	Mul,
	thrust::multiplies<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	Mul,
	thrust::multiplies<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
	// Div: int32, int64, float, double, at::Half.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	Div,
	thrust::divides<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	Div,
	thrust::divides<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
	// Min / Max: float and double only.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
	// Logical operations on bool. Xor is expressed with thrust::bit_xor<bool>.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
	// Bitwise And / Or / Xor: bool, int32, int64.
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	BitwiseAnd,
	thrust::bit_and<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	BitwiseAnd,
	thrust::bit_and<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	BitwiseOr,
	thrust::bit_or<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	BitwiseOr,
	thrust::bit_or<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int32_t,
	BitwiseXor,
	thrust::bit_xor<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
	std::int64_t,
	BitwiseXor,
	thrust::bit_xor<std::int64_t>())
	// The launcher macro is not used past this point.
	#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

	// Defines the Func<T, CUDAContext> specialization for an element-wise
	// comparison. DeviceComp is a functor object passed by value to
	// SimpleCompareCUDAKernel, which applies it to A and B and writes bool results
	// to C.
	// Nothing is launched when N <= 0. The grid has
	// DivUp(N, CAFFE_CUDA_NUM_THREADS) blocks of CAFFE_CUDA_NUM_THREADS threads and
	// runs on context->cuda_stream().
	// cudaGetLastError() is checked right after the launch so that an invalid
	// launch configuration raises through CUDA_ENFORCE instead of going unnoticed.
	// Faults that occur while the kernel executes are not detected here.
	#define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp) \
	  template <> \
	  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
	      const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
	    if (N > 0) { \
	      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
	      SimpleCompareCUDAKernel<<< \
	          M, \
	          CAFFE_CUDA_NUM_THREADS, \
	          0, \
	          context->cuda_stream()>>>(N, DeviceComp, A, B, C); \
	      CUDA_ENFORCE(cudaGetLastError()); \
	    } \
	  }
	// Func<T, CUDAContext> specializations for the comparison operations. Every
	// comparison is provided for bool, int32, int64, float and double and uses the
	// corresponding thrust comparison functor.
	//
	// EQ: thrust::equal_to.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	EQ,
	thrust::equal_to<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	EQ,
	thrust::equal_to<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
	// NE: thrust::not_equal_to.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	NE,
	thrust::not_equal_to<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	NE,
	thrust::not_equal_to<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	double,
	NE,
	thrust::not_equal_to<double>())
	// LT: thrust::less.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	LT,
	thrust::less<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	LT,
	thrust::less<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
	// LE: thrust::less_equal.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	LE,
	thrust::less_equal<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	LE,
	thrust::less_equal<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
	// GT: thrust::greater.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	GT,
	thrust::greater<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	GT,
	thrust::greater<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
	// GE: thrust::greater_equal.
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int32_t,
	GE,
	thrust::greater_equal<std::int32_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	std::int64_t,
	GE,
	thrust::greater_equal<std::int64_t>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
	DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
	double,
	GE,
	thrust::greater_equal<double>())
	// The launcher macro is not used past this point.
	#undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION

	} // namespace math
	} // namespace caffe2