blob: 164581b627de3125392b7c240f900382ca8b9a79 [file] [log] [blame]
#include "caffe2/operators/elementwise_op.h"
namespace caffe2 {
// EIGEN_FUNCTOR(name, eigen_op, input_type, output_type)
//
// Defines the struct Eigen<name>Functor and registers
// BinaryElementwiseOp<input_type, CPUContext, Eigen<name>Functor, output_type>
// as the CPU operator called `name`. For arithmetic operators, Eigen provides
// a good way to vectorize even when broadcasting.
//
// Macro parameters:
//   name        - operator name; also pasted into the functor struct name.
//   eigen_op    - function-like macro applied to two Eigen array expressions
//                 (or to an array expression and the scalar b[0]).
//   input_type  - forwarded unchanged to BinaryElementwiseOp.
//   output_type - forwarded unchanged to BinaryElementwiseOp.
//
// Generated members (the CPUContext* argument is never used):
//   Run<b_is_scalar>(n, a, b, out, ctx)
//       Combines the n elements of a with b. When b_is_scalar is nonzero only
//       b[0] is read; otherwise b is read as n elements.
//   RunWithBroadcast(a, b, out, pre, n, ctx)
//       a and out are mapped as an (n x pre) array, b as n elements, and b is
//       combined with a through a.colwise().
//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
//       For each of the `pre` slices of n * post elements, the slice is mapped
//       as a (post x n) array and combined with the n elements of b through
//       rowwise().
//   NOTE(review): the element order implied by these maps depends on the
//   storage order of the project's EigenArrayMap alias (presumably Eigen's
//   default, column-major) - confirm against its definition.
#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \
  struct Eigen##name##Functor { \
    template <int b_is_scalar, typename T, typename R> \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      if (b_is_scalar) { \
        /* Scalar right-hand side: only b[0] is read. */ \
        EigenVectorArrayMap<R>(out, n) = \
            eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0])); \
      } else { \
        EigenVectorArrayMap<R>(out, n) = eigen_op( \
            (ConstEigenVectorArrayMap<T>(a, n)), \
            (ConstEigenVectorArrayMap<T>(b, n))); \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        CPUContext*) { \
      EigenArrayMap<R>(out, n, pre) = eigen_op( \
          (ConstEigenArrayMap<T>(a, n, pre).colwise()), \
          (ConstEigenVectorArrayMap<T>(b, n))); \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast2( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        size_t post, \
        CPUContext*) { \
      /* The counter has the same type as its bound (size_t), so there is */ \
      /* no signed/unsigned comparison and no int overflow for large pre. */ \
      for (size_t i = 0; i < pre; ++i) { \
        EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op( \
            (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()), \
            (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n))); \
      } \
    } \
  }; \
  REGISTER_CPU_OPERATOR( \
      name, \
      BinaryElementwiseOp< \
          input_type, \
          CPUContext, \
          Eigen##name##Functor, \
          output_type>)
// NAIVE_FUNCTOR(name, op, input_type, output_type)
//
// Defines the struct Naive<name>Functor and registers
// BinaryElementwiseOp<input_type, CPUContext, Naive<name>Functor, output_type>
// as the CPU operator called `name`. For some comparison and logical
// operators, Eigen does not have vectorized math, so plain loops are used.
//
// Macro parameters:
//   name        - operator name; also pasted into the functor struct name.
//   op          - function-like macro (or callable) applied to one element
//                 of a and one element of b.
//   input_type  - forwarded unchanged to BinaryElementwiseOp.
//   output_type - forwarded unchanged to BinaryElementwiseOp.
//
// Generated members (the CPUContext* argument is never used):
//   Run<b_is_scalar>(n, a, b, out, ctx)
//       out[i] = op(a[i], b[i]) for i in [0, n); when b_is_scalar is nonzero
//       b[0] is used for every i.
//   RunWithBroadcast(a, b, out, pre, n, ctx)
//       out[i * n + j] = op(a[i * n + j], b[j]) for i in [0, pre),
//       j in [0, n).
//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
//       With idx = (i * n + j) * post + k, out[idx] = op(a[idx], b[j]) for
//       i in [0, pre), j in [0, n), k in [0, post).
//
// All loop counters are size_t, matching the type of their bounds; this
// avoids signed/unsigned comparisons and int overflow on very large inputs.
#define NAIVE_FUNCTOR(name, op, input_type, output_type) \
  struct Naive##name##Functor { \
    template <int b_is_scalar, typename T, typename R> \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      for (size_t i = 0; i < n; ++i) { \
        out[i] = op(a[i], b[b_is_scalar ? 0 : i]); \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        CPUContext*) { \
      for (size_t i = 0; i < pre; ++i) { \
        for (size_t j = 0; j < n; ++j) { \
          out[i * n + j] = op(a[i * n + j], b[j]); \
        } \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast2( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        size_t post, \
        CPUContext*) { \
      for (size_t i = 0; i < pre; ++i) { \
        for (size_t j = 0; j < n; ++j) { \
          for (size_t k = 0; k < post; ++k) { \
            out[(i * n + j) * post + k] = \
                op(a[(i * n + j) * post + k], b[j]); \
          } \
        } \
      } \
    } \
  }; \
  REGISTER_CPU_OPERATOR( \
      name, \
      BinaryElementwiseOp< \
          input_type, \
          CPUContext, \
          Naive##name##Functor, \
          output_type>)
// Arithmetic operators Add, Sub, Mul and Div, generated through EIGEN_FUNCTOR.
// Each EIGEN_* helper macro wraps one C++ operator, is passed to EIGEN_FUNCTOR
// as the operation to apply, and is undefined again right after use so it
// does not leak into the rest of the translation unit.
// All four are instantiated with NumericTypes and SameTypeAsInput.
// See the operations supported here:
// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html
// Add: x + y
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput);
#undef EIGEN_ADD
// Sub: x - y
#define EIGEN_SUB(x, y) ((x) - (y))
EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput);
#undef EIGEN_SUB
// Mul: x * y
#define EIGEN_MUL(x, y) ((x) * (y))
EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput);
#undef EIGEN_MUL
// Div: x / y
#define EIGEN_DIV(x, y) ((x) / (y))
EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, SameTypeAsInput);
#undef EIGEN_DIV
// Comparison operators LT, LE, GT, GE and EQ, generated through NAIVE_FUNCTOR.
// Each NAIVE_* helper macro wraps one C++ comparison operator and is undefined
// again right after use. All are instantiated with FixedType<bool> as the
// output type argument. LT/LE/GT/GE use NumericTypes as the input type
// argument, whereas EQ uses IntTypes.
// LT: x < y
#define NAIVE_LT(x, y) ((x) < (y))
NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType<bool>);
#undef NAIVE_LT
// LE: x <= y
#define NAIVE_LE(x, y) ((x) <= (y))
NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType<bool>);
#undef NAIVE_LE
// GT: x > y
#define NAIVE_GT(x, y) ((x) > (y))
NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType<bool>);
#undef NAIVE_GT
// GE: x >= y
#define NAIVE_GE(x, y) ((x) >= (y))
NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType<bool>);
#undef NAIVE_GE
// EQ: x == y (note: IntTypes, not NumericTypes)
#define NAIVE_EQ(x, y) ((x) == (y))
NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntTypes, FixedType<bool>);
#undef NAIVE_EQ
// Logical operators And, Or and Xor, generated through NAIVE_FUNCTOR.
// The helper macros use the bitwise operators &, | and ^ on their operands
// and are undefined again right after use. All three are instantiated with
// BoolTypes as the input type argument and FixedType<bool> as the output
// type argument.
// And: x & y
#define NAIVE_AND(x, y) ((x) & (y))
NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType<bool>);
#undef NAIVE_AND
// Or: x | y
#define NAIVE_OR(x, y) ((x) | (y))
NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType<bool>);
#undef NAIVE_OR
// Xor: x ^ y
#define NAIVE_XOR(x, y) ((x) ^ (y))
NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType<bool>);
#undef NAIVE_XOR
// Functor that computes the elementwise logical negation of a bool buffer.
struct NotFunctor {
  // Stores !x[i] into y[i] for every i in [0, n), walking both buffers from
  // front to back. Nothing is read or written when n <= 0. The CPUContext
  // pointer is accepted but never used.
  inline void operator()(const int n, const bool* x, bool* y, CPUContext*) {
    const bool* src = x;
    bool* dst = y;
    for (int remaining = n; remaining > 0; --remaining) {
      *dst = !(*src);
      ++src;
      ++dst;
    }
  }
};
// Registers the CPU operator named Not as UnaryElementwiseOp instantiated
// with BoolTypes, CPUContext and NotFunctor.
REGISTER_CPU_OPERATOR(
Not,
UnaryElementwiseOp<BoolTypes, CPUContext, NotFunctor>);
// CPU/float specialization of DivGradientOp.
//
// Inputs:  0 = Y, 1 = Z, 2 = dZ.
// Outputs: 0 = dX, 1 = dY; both are resized like Y.
// For every index i in [0, Y.size()):
//   dX[i] = dZ[i] / Y[i]
//   dY[i] = -(dZ[i] * Z[i]) / Y[i]
// NOTE(review): these formulas match the gradient of Z = X / Y with respect
// to X and Y - confirm against the gradient maker that wires the inputs.
//
// Z and dZ are indexed over the full range of Y, so their sizes are checked
// (debug builds only, via DCHECK) to equal Y's size before any element is
// read. Always returns true.
template <>
bool DivGradientOp<float, CPUContext>::RunOnDevice() {
  const auto& Y = Input(0);
  const auto& Z = Input(1);
  const auto& dZ = Input(2);
  auto* dX = Output(0);
  auto* dY = Output(1);
  DCHECK_GT(Y.size(), 0);
  DCHECK_GT(Z.size(), 0);
  // The loop below reads Zdata[i] and dZdata[i] for every i < Y.size().
  DCHECK_EQ(Z.size(), Y.size());
  DCHECK_EQ(dZ.size(), Y.size());
  dX->ResizeLike(Y);
  dY->ResizeLike(Y);
  const float* Ydata = Y.data<float>();
  const float* Zdata = Z.data<float>();
  const float* dZdata = dZ.data<float>();
  float* dXdata = dX->mutable_data<float>();
  float* dYdata = dY->mutable_data<float>();
  // Use the tensor's own size type for the bound and the counter: no
  // narrowing to int, and the bound is evaluated once instead of per
  // iteration.
  using IndexType = decltype(Y.size());
  const IndexType count = Y.size();
#pragma omp parallel for
  for (IndexType i = 0; i < count; ++i) {
    dXdata[i] = dZdata[i] / Ydata[i];
    dYdata[i] = -(dZdata[i] * Zdata[i]) / Ydata[i];
  }
  return true;
}
REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp<float, CPUContext>);
} // namespace caffe2