// caffe2/operators/elementwise_op.cc - platform/external/pytorch - Git at Google

 #include "caffe2/operators/elementwise_op.h"

 namespace caffe2 {

 // Defines the struct Eigen<name>Functor and registers it as the CPU operator
 // `name` through BinaryElementwiseOp.
 //
 // For arithmetic operators, Eigen provides a good way to vectorize even
 // when broadcasting, so the entry points are written as Eigen expressions.
 //
 // Macro parameters:
 //   name        - operator name; also pasted into the functor's struct name.
 //   eigen_op    - function-like macro taking two operands; it is applied to
 //                 Eigen expressions built over the raw buffers.
 //   input_type  - forwarded unchanged to BinaryElementwiseOp.
 //   output_type - forwarded unchanged to BinaryElementwiseOp.
 //
 // Entry points of the generated functor (the CPUContext* is unused):
 //   Run<b_is_scalar>(n, a, b, out, ctx)
 //     out[0..n) = a[0..n) op b[0..n), or a[0..n) op b[0] when b_is_scalar
 //     is non-zero.
 //   RunWithBroadcast(a, b, out, pre, n, ctx)
 //     Maps a and out as (n x pre) arrays and combines every column with the
 //     length-n vector b.
 //   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
 //     For each of the `pre` slices of n * post elements, maps the slice as a
 //     (post x n) array and combines every row with the length-n row vector b.
 //
 // NOTE(review): the storage order of EigenArrayMap / ConstEigenArrayMap is
 // defined outside this file -- presumably column-major; confirm.
 //
 // The slice loop counts with size_t, matching its bound, so a `pre` above
 // INT_MAX cannot overflow a signed counter. Documentation is kept outside the
 // macro body because a `//` comment on a continued line would swallow the
 // line that follows it.
 #define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type)               \
   struct Eigen##name##Functor {                                              \
     template <int b_is_scalar, typename T, typename R>                       \
     inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
       if (b_is_scalar) {                                                     \
         EigenVectorArrayMap<R>(out, n) =                                     \
             eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0]));           \
       } else {                                                               \
         EigenVectorArrayMap<R>(out, n) = eigen_op(                           \
             (ConstEigenVectorArrayMap<T>(a, n)),                             \
             (ConstEigenVectorArrayMap<T>(b, n)));                            \
       }                                                                      \
     }                                                                        \
     template <typename T, typename R>                                        \
     void RunWithBroadcast(                                                   \
         const T* a,                                                          \
         const T* b,                                                          \
         R* out,                                                              \
         size_t pre,                                                          \
         size_t n,                                                            \
         CPUContext*) {                                                       \
       EigenArrayMap<R>(out, n, pre) = eigen_op(                              \
           (ConstEigenArrayMap<T>(a, n, pre).colwise()),                      \
           (ConstEigenVectorArrayMap<T>(b, n)));                              \
     }                                                                        \
     template <typename T, typename R>                                        \
     void RunWithBroadcast2(                                                  \
         const T* a,                                                          \
         const T* b,                                                          \
         R* out,                                                              \
         size_t pre,                                                          \
         size_t n,                                                            \
         size_t post,                                                         \
         CPUContext*) {                                                       \
       const size_t slice_size = n * post;                                    \
       for (size_t i = 0; i < pre; ++i) {                                     \
         const size_t offset = i * slice_size;                                \
         EigenArrayMap<R>(out + offset, post, n) = eigen_op(                  \
             (ConstEigenArrayMap<T>(a + offset, post, n).rowwise()),          \
             (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n)));   \
       }                                                                      \
     }                                                                        \
   };                                                                         \
   REGISTER_CPU_OPERATOR(                                                     \
       name,                                                                  \
       BinaryElementwiseOp<                                                   \
           input_type,                                                        \
           CPUContext,                                                        \
           Eigen##name##Functor,                                              \
           output_type>)

 // Defines the struct Naive<name>Functor and registers it as the CPU operator
 // `name` through BinaryElementwiseOp.
 //
 // For some comparison and logical operators, eigen does not have vectorized
 // math so we need to improvise with plain element-by-element loops.
 //
 // Macro parameters:
 //   name        - operator name; also pasted into the functor's struct name.
 //   op          - function-like macro taking two scalar operands.
 //   input_type  - forwarded unchanged to BinaryElementwiseOp.
 //   output_type - forwarded unchanged to BinaryElementwiseOp.
 //
 // Entry points of the generated functor (the CPUContext* is unused):
 //   Run<b_is_scalar>(n, a, b, out, ctx)
 //     out[i] = op(a[i], b[i]) for i in [0, n); b[0] replaces b[i] when
 //     b_is_scalar is non-zero.
 //   RunWithBroadcast(a, b, out, pre, n, ctx)
 //     out[i * n + j] = op(a[i * n + j], b[j]) for i in [0, pre), j in [0, n).
 //   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
 //     out[(i * n + j) * post + k] = op(a[(i * n + j) * post + k], b[j]) for
 //     i in [0, pre), j in [0, n), k in [0, post).
 //
 // All counters are size_t, matching their bounds, so extents above INT_MAX
 // cannot overflow a signed counter. Documentation is kept outside the macro
 // body because a `//` comment on a continued line would swallow the line
 // that follows it.
 #define NAIVE_FUNCTOR(name, op, input_type, output_type)                       \
   struct Naive##name##Functor {                                                \
     template <int b_is_scalar, typename T, typename R>                         \
     inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) {   \
       for (size_t i = 0; i < n; ++i) {                                         \
         out[i] = op(a[i], b[b_is_scalar ? 0 : i]);                             \
       }                                                                        \
     }                                                                          \
     template <typename T, typename R>                                          \
     void RunWithBroadcast(                                                     \
         const T* a,                                                            \
         const T* b,                                                            \
         R* out,                                                                \
         size_t pre,                                                            \
         size_t n,                                                              \
         CPUContext*) {                                                         \
       for (size_t i = 0; i < pre; ++i) {                                       \
         const size_t row = i * n;                                              \
         for (size_t j = 0; j < n; ++j) {                                       \
           out[row + j] = op(a[row + j], b[j]);                                 \
         }                                                                      \
       }                                                                        \
     }                                                                          \
     template <typename T, typename R>                                          \
     void RunWithBroadcast2(                                                    \
         const T* a,                                                            \
         const T* b,                                                            \
         R* out,                                                                \
         size_t pre,                                                            \
         size_t n,                                                              \
         size_t post,                                                           \
         CPUContext*) {                                                         \
       for (size_t i = 0; i < pre; ++i) {                                       \
         for (size_t j = 0; j < n; ++j) {                                       \
           const size_t base = (i * n + j) * post;                              \
           for (size_t k = 0; k < post; ++k) {                                  \
             out[base + k] = op(a[base + k], b[j]);                             \
           }                                                                    \
         }                                                                      \
       }                                                                        \
     }                                                                          \
   };                                                                           \
   REGISTER_CPU_OPERATOR(                                                       \
       name,                                                                    \
       BinaryElementwiseOp<                                                     \
           input_type,                                                          \
           CPUContext,                                                          \
           Naive##name##Functor,                                                \
           output_type>)

 // Arithmetic operators registered through EIGEN_FUNCTOR. Each EIGEN_* helper
 // below is the two-operand expression handed over as `eigen_op`; the helpers
 // are removed again once the four operators have been registered.
 // See the operations supported here:
 // https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html
 #define EIGEN_ADD(x, y) ((x) + (y))
 #define EIGEN_SUB(x, y) ((x) - (y))
 #define EIGEN_MUL(x, y) ((x) * (y))
 #define EIGEN_DIV(x, y) ((x) / (y))

 EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput);
 EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput);
 EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput);
 EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, SameTypeAsInput);

 #undef EIGEN_ADD
 #undef EIGEN_SUB
 #undef EIGEN_MUL
 #undef EIGEN_DIV

 // Comparison and logical operators registered through NAIVE_FUNCTOR. Every
 // one of them is registered with FixedType<bool> as its output type. The
 // NAIVE_* helpers are the two-operand expressions handed over as `op` and are
 // removed again once all operators have been registered.

 // Ordering comparisons, registered for NumericTypes.
 #define NAIVE_LT(x, y) ((x) < (y))
 #define NAIVE_LE(x, y) ((x) <= (y))
 #define NAIVE_GT(x, y) ((x) > (y))
 #define NAIVE_GE(x, y) ((x) >= (y))
 // Equality, registered for IntTypes.
 #define NAIVE_EQ(x, y) ((x) == (y))
 // Logical operators, spelled with the bitwise operators and registered for
 // BoolTypes.
 #define NAIVE_AND(x, y) ((x) & (y))
 #define NAIVE_OR(x, y) ((x) | (y))
 #define NAIVE_XOR(x, y) ((x) ^ (y))

 NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType<bool>);
 NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType<bool>);
 NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType<bool>);
 NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType<bool>);
 NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntTypes, FixedType<bool>);
 NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType<bool>);
 NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType<bool>);
 NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType<bool>);

 #undef NAIVE_LT
 #undef NAIVE_LE
 #undef NAIVE_GT
 #undef NAIVE_GE
 #undef NAIVE_EQ
 #undef NAIVE_AND
 #undef NAIVE_OR
 #undef NAIVE_XOR

 // Element-wise logical negation, registered below as the CPU operator `Not`
 // for BoolTypes.
 struct NotFunctor {
   // Stores !x[k] into y[k] for every k in [0, n), front to back. Nothing is
   // written when n is not positive. The CPUContext* argument is ignored.
   inline void operator()(const int n, const bool* x, bool* y, CPUContext*) {
     int idx = 0;
     while (idx < n) {
       y[idx] = !x[idx];
       ++idx;
     }
   }
 };
 REGISTER_CPU_OPERATOR(
     Not, UnaryElementwiseOp<BoolTypes, CPUContext, NotFunctor>);

 // CPU float specialization that produces both gradients of an element-wise
 // division.
 //
 // Inputs:  0 = Y, 1 = Z, 2 = dZ.
 // Outputs: 0 = dX, 1 = dY, both resized like Y.
 // For every element i in [0, Y.size()):
 //   dX[i] = dZ[i] / Y[i]
 //   dY[i] = -(dZ[i] * Z[i]) / Y[i]
 // Always returns true.
 //
 // NOTE(review): these formulas match a forward pass Z = X / Y with dZ the
 // gradient arriving at Z -- confirm against the operator schema.
 // NOTE(review): Z and dZ are read up to Y.size() without a size check;
 // presumably all three inputs share one shape -- confirm with the caller.
 template <>
 bool DivGradientOp<float, CPUContext>::RunOnDevice() {
   auto& Y = Input(0);
   auto& Z = Input(1);
   auto& dZ = Input(2);
   auto* dX = Output(0);
   auto* dY = Output(1);
   DCHECK_GT(Y.size(), 0);
   DCHECK_GT(Z.size(), 0);
   dX->ResizeLike(Y);
   dY->ResizeLike(Y);

   const float* Ydata = Y.data<float>();
   const float* Zdata = Z.data<float>();
   const float* dZdata = dZ.data<float>();
   float* dXdata = dX->mutable_data<float>();
   float* dYdata = dY->mutable_data<float>();

   // Read the element count once and count with the same type size() returns,
   // so the counter can never be narrower than the bound it is compared to.
   const auto num_elements = Y.size();
   #pragma omp parallel for
   for (decltype(Y.size()) i = 0; i < num_elements; ++i) {
     dXdata[i] = dZdata[i] / Ydata[i];
     dYdata[i] = - (dZdata[i] * Zdata[i]) / Ydata[i];
   }
   return true;
 }

 REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp<float, CPUContext>);

 }  // namespace caffe2
	#include "caffe2/operators/elementwise_op.h"

	namespace caffe2 {

	// Defines the struct Eigen<name>Functor and registers it as the CPU operator
	// `name` through BinaryElementwiseOp.
	//
	// For arithmetic operators, Eigen provides a good way to vectorize even
	// when broadcasting, so the entry points are written as Eigen expressions.
	//
	// Macro parameters:
	//   name        - operator name; also pasted into the functor's struct name.
	//   eigen_op    - function-like macro taking two operands; it is applied to
	//                 Eigen expressions built over the raw buffers.
	//   input_type  - forwarded unchanged to BinaryElementwiseOp.
	//   output_type - forwarded unchanged to BinaryElementwiseOp.
	//
	// Entry points of the generated functor (the CPUContext* is unused):
	//   Run<b_is_scalar>(n, a, b, out, ctx)
	//     out[0..n) = a[0..n) op b[0..n), or a[0..n) op b[0] when b_is_scalar
	//     is non-zero.
	//   RunWithBroadcast(a, b, out, pre, n, ctx)
	//     Maps a and out as (n x pre) arrays and combines every column with the
	//     length-n vector b.
	//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
	//     For each of the `pre` slices of n * post elements, maps the slice as a
	//     (post x n) array and combines every row with the length-n row vector b.
	//
	// NOTE(review): the storage order of EigenArrayMap / ConstEigenArrayMap is
	// defined outside this file -- presumably column-major; confirm.
	//
	// The slice loop counts with size_t, matching its bound, so a `pre` above
	// INT_MAX cannot overflow a signed counter. Documentation is kept outside the
	// macro body because a `//` comment on a continued line would swallow the
	// line that follows it.
	#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \
	  struct Eigen##name##Functor { \
	    template <int b_is_scalar, typename T, typename R> \
	    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
	      if (b_is_scalar) { \
	        EigenVectorArrayMap<R>(out, n) = \
	            eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0])); \
	      } else { \
	        EigenVectorArrayMap<R>(out, n) = eigen_op( \
	            (ConstEigenVectorArrayMap<T>(a, n)), \
	            (ConstEigenVectorArrayMap<T>(b, n))); \
	      } \
	    } \
	    template <typename T, typename R> \
	    void RunWithBroadcast( \
	        const T* a, \
	        const T* b, \
	        R* out, \
	        size_t pre, \
	        size_t n, \
	        CPUContext*) { \
	      EigenArrayMap<R>(out, n, pre) = eigen_op( \
	          (ConstEigenArrayMap<T>(a, n, pre).colwise()), \
	          (ConstEigenVectorArrayMap<T>(b, n))); \
	    } \
	    template <typename T, typename R> \
	    void RunWithBroadcast2( \
	        const T* a, \
	        const T* b, \
	        R* out, \
	        size_t pre, \
	        size_t n, \
	        size_t post, \
	        CPUContext*) { \
	      const size_t slice_size = n * post; \
	      for (size_t i = 0; i < pre; ++i) { \
	        const size_t offset = i * slice_size; \
	        EigenArrayMap<R>(out + offset, post, n) = eigen_op( \
	            (ConstEigenArrayMap<T>(a + offset, post, n).rowwise()), \
	            (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n))); \
	      } \
	    } \
	  }; \
	  REGISTER_CPU_OPERATOR( \
	      name, \
	      BinaryElementwiseOp< \
	          input_type, \
	          CPUContext, \
	          Eigen##name##Functor, \
	          output_type>)

	// Defines the struct Naive<name>Functor and registers it as the CPU operator
	// `name` through BinaryElementwiseOp.
	//
	// For some comparison and logical operators, eigen does not have vectorized
	// math so we need to improvise with plain element-by-element loops.
	//
	// Macro parameters:
	//   name        - operator name; also pasted into the functor's struct name.
	//   op          - function-like macro taking two scalar operands.
	//   input_type  - forwarded unchanged to BinaryElementwiseOp.
	//   output_type - forwarded unchanged to BinaryElementwiseOp.
	//
	// Entry points of the generated functor (the CPUContext* is unused):
	//   Run<b_is_scalar>(n, a, b, out, ctx)
	//     out[i] = op(a[i], b[i]) for i in [0, n); b[0] replaces b[i] when
	//     b_is_scalar is non-zero.
	//   RunWithBroadcast(a, b, out, pre, n, ctx)
	//     out[i * n + j] = op(a[i * n + j], b[j]) for i in [0, pre), j in [0, n).
	//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
	//     out[(i * n + j) * post + k] = op(a[(i * n + j) * post + k], b[j]) for
	//     i in [0, pre), j in [0, n), k in [0, post).
	//
	// All counters are size_t, matching their bounds, so extents above INT_MAX
	// cannot overflow a signed counter. Documentation is kept outside the macro
	// body because a `//` comment on a continued line would swallow the line
	// that follows it.
	#define NAIVE_FUNCTOR(name, op, input_type, output_type) \
	  struct Naive##name##Functor { \
	    template <int b_is_scalar, typename T, typename R> \
	    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
	      for (size_t i = 0; i < n; ++i) { \
	        out[i] = op(a[i], b[b_is_scalar ? 0 : i]); \
	      } \
	    } \
	    template <typename T, typename R> \
	    void RunWithBroadcast( \
	        const T* a, \
	        const T* b, \
	        R* out, \
	        size_t pre, \
	        size_t n, \
	        CPUContext*) { \
	      for (size_t i = 0; i < pre; ++i) { \
	        const size_t row = i * n; \
	        for (size_t j = 0; j < n; ++j) { \
	          out[row + j] = op(a[row + j], b[j]); \
	        } \
	      } \
	    } \
	    template <typename T, typename R> \
	    void RunWithBroadcast2( \
	        const T* a, \
	        const T* b, \
	        R* out, \
	        size_t pre, \
	        size_t n, \
	        size_t post, \
	        CPUContext*) { \
	      for (size_t i = 0; i < pre; ++i) { \
	        for (size_t j = 0; j < n; ++j) { \
	          const size_t base = (i * n + j) * post; \
	          for (size_t k = 0; k < post; ++k) { \
	            out[base + k] = op(a[base + k], b[j]); \
	          } \
	        } \
	      } \
	    } \
	  }; \
	  REGISTER_CPU_OPERATOR( \
	      name, \
	      BinaryElementwiseOp< \
	          input_type, \
	          CPUContext, \
	          Naive##name##Functor, \
	          output_type>)

	// Arithmetic operators registered through EIGEN_FUNCTOR. Each EIGEN_* helper
	// below is the two-operand expression handed over as `eigen_op`; the helpers
	// are removed again once the four operators have been registered.
	// See the operations supported here:
	// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html
	#define EIGEN_ADD(x, y) ((x) + (y))
	#define EIGEN_SUB(x, y) ((x) - (y))
	#define EIGEN_MUL(x, y) ((x) * (y))
	#define EIGEN_DIV(x, y) ((x) / (y))

	EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput);
	EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput);
	EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput);
	EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, SameTypeAsInput);

	#undef EIGEN_ADD
	#undef EIGEN_SUB
	#undef EIGEN_MUL
	#undef EIGEN_DIV

	// Comparison and logical operators registered through NAIVE_FUNCTOR. Every
	// one of them is registered with FixedType<bool> as its output type. The
	// NAIVE_* helpers are the two-operand expressions handed over as `op` and are
	// removed again right after the operator that uses them is registered.

	// Ordering comparisons, registered for NumericTypes.
	#define NAIVE_LT(x, y) ((x) < (y))
	NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType<bool>);
	#undef NAIVE_LT
	#define NAIVE_LE(x, y) ((x) <= (y))
	NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType<bool>);
	#undef NAIVE_LE
	#define NAIVE_GT(x, y) ((x) > (y))
	NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType<bool>);
	#undef NAIVE_GT
	#define NAIVE_GE(x, y) ((x) >= (y))
	NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType<bool>);
	#undef NAIVE_GE

	// Equality, registered for IntTypes.
	#define NAIVE_EQ(x, y) ((x) == (y))
	NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntTypes, FixedType<bool>);
	#undef NAIVE_EQ

	// Logical operators, spelled with the bitwise operators and registered for
	// BoolTypes.
	#define NAIVE_AND(x, y) ((x) & (y))
	NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType<bool>);
	#undef NAIVE_AND
	// Plain `|`: a backslash in front of the operator is not valid C++.
	#define NAIVE_OR(x, y) ((x) | (y))
	NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType<bool>);
	#undef NAIVE_OR
	#define NAIVE_XOR(x, y) ((x) ^ (y))
	NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType<bool>);
	#undef NAIVE_XOR

	// Element-wise logical negation, registered below as the CPU operator `Not`
	// for BoolTypes.
	struct NotFunctor {
	  // Stores !x[k] into y[k] for every k in [0, n), front to back. Nothing is
	  // written when n is not positive. The CPUContext* argument is ignored.
	  inline void operator()(const int n, const bool* x, bool* y, CPUContext*) {
	    int idx = 0;
	    while (idx < n) {
	      y[idx] = !x[idx];
	      ++idx;
	    }
	  }
	};
	REGISTER_CPU_OPERATOR(
	    Not, UnaryElementwiseOp<BoolTypes, CPUContext, NotFunctor>);

	// CPU float specialization that produces both gradients of an element-wise
	// division.
	//
	// Inputs:  0 = Y, 1 = Z, 2 = dZ.
	// Outputs: 0 = dX, 1 = dY, both resized like Y.
	// For every element i in [0, Y.size()):
	//   dX[i] = dZ[i] / Y[i]
	//   dY[i] = -(dZ[i] * Z[i]) / Y[i]
	// Always returns true.
	//
	// NOTE(review): these formulas match a forward pass Z = X / Y with dZ the
	// gradient arriving at Z -- confirm against the operator schema.
	// NOTE(review): Z and dZ are read up to Y.size() without a size check;
	// presumably all three inputs share one shape -- confirm with the caller.
	template <>
	bool DivGradientOp<float, CPUContext>::RunOnDevice() {
	  auto& Y = Input(0);
	  auto& Z = Input(1);
	  auto& dZ = Input(2);
	  auto* dX = Output(0);
	  auto* dY = Output(1);
	  DCHECK_GT(Y.size(), 0);
	  DCHECK_GT(Z.size(), 0);
	  dX->ResizeLike(Y);
	  dY->ResizeLike(Y);

	  const float* Ydata = Y.data<float>();
	  const float* Zdata = Z.data<float>();
	  const float* dZdata = dZ.data<float>();
	  float* dXdata = dX->mutable_data<float>();
	  float* dYdata = dY->mutable_data<float>();

	  // Read the element count once and count with the same type size() returns,
	  // so the counter can never be narrower than the bound it is compared to.
	  const auto num_elements = Y.size();
	  #pragma omp parallel for
	  for (decltype(Y.size()) i = 0; i < num_elements; ++i) {
	    dXdata[i] = dZdata[i] / Ydata[i];
	    dYdata[i] = - (dZdata[i] * Zdata[i]) / Ydata[i];
	  }
	  return true;
	}

	REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp<float, CPUContext>);

	} // namespace caffe2