#include "caffe2/operators/elementwise_op.h"

namespace caffe2 {

// For arithmetic operators, Eigen provides a good way to vectorize even
// when broadcasting.
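//
// Each generated functor exposes three entry points that BinaryElementwiseOp
// dispatches to, depending on the broadcast configuration:
//   Run(n, a, b, out, ctx)
//       A and B both hold n elements, or B is a single scalar when the
//       b_is_scalar template argument is nonzero.
//   RunWithBroadcast(a, b, out, pre, n, ctx)
//       A is viewed as a (pre, n) array and B as a length-n vector broadcast
//       along the leading axis: out[i * n + j] = a[i * n + j] op b[j].
//   RunWithBroadcast2(a, b, out, pre, n, post, ctx)
//       A is viewed as a (pre, n, post) array and B as a length-n vector
//       broadcast along the outer axes:
//       out[(i * n + j) * post + k] = a[(i * n + j) * post + k] op b[j].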
#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \
  struct Eigen##name##Functor { \
    template <int b_is_scalar, typename T, typename R> \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      if (b_is_scalar) { \
        EigenVectorArrayMap<R>(out, n) = \
            eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0])); \
      } else { \
        EigenVectorArrayMap<R>(out, n) = eigen_op( \
            (ConstEigenVectorArrayMap<T>(a, n)), \
            (ConstEigenVectorArrayMap<T>(b, n))); \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        CPUContext*) { \
      EigenArrayMap<R>(out, n, pre) = eigen_op( \
          (ConstEigenArrayMap<T>(a, n, pre).colwise()), \
          (ConstEigenVectorArrayMap<T>(b, n))); \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast2( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        size_t post, \
        CPUContext*) { \
      for (int i = 0; i < pre; ++i) { \
        EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op( \
            (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()), \
            (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n))); \
      } \
    } \
  }; \
  REGISTER_CPU_OPERATOR( \
      name, \
      BinaryElementwiseOp< \
          input_type, \
          CPUContext, \
          Eigen##name##Functor, \
          output_type>)

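// As a concrete illustration (a sketch of the expansion, not literal
// preprocessor output), EIGEN_FUNCTOR(Add, EIGEN_ADD, ...) below generates
// approximately:
//
//   struct EigenAddFunctor {
//     template <int b_is_scalar, typename T, typename R>
//     inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) {
//       // Non-scalar branch shown: out[i] = a[i] + b[i], vectorized by Eigen.
//       EigenVectorArrayMap<R>(out, n) =
//           ConstEigenVectorArrayMap<T>(a, n) +
//           ConstEigenVectorArrayMap<T>(b, n);
//     }
//     // ... RunWithBroadcast and RunWithBroadcast2 are generated analogously.
//   };
//   REGISTER_CPU_OPERATOR(
//       Add,
//       BinaryElementwiseOp<NumericTypes, CPUContext, EigenAddFunctor,
//                           SameTypeAsInput>);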

// For some comparison and logical operators, Eigen does not provide
// vectorized implementations, so we fall back to naive per-element loops.
#define NAIVE_FUNCTOR(name, op, input_type, output_type) \
  struct Naive##name##Functor { \
    template <int b_is_scalar, typename T, typename R> \
    inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
      for (int i = 0; i < n; ++i) { \
        out[i] = op(a[i], b[b_is_scalar ? 0 : i]); \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        CPUContext*) { \
      for (int i = 0; i < pre; ++i) { \
        for (int j = 0; j < n; ++j) { \
          out[i * n + j] = op(a[i * n + j], b[j]); \
        } \
      } \
    } \
    template <typename T, typename R> \
    void RunWithBroadcast2( \
        const T* a, \
        const T* b, \
        R* out, \
        size_t pre, \
        size_t n, \
        size_t post, \
        CPUContext*) { \
      for (int i = 0; i < pre; ++i) { \
        for (int j = 0; j < n; ++j) { \
          for (int k = 0; k < post; ++k) { \
            out[(i * n + j) * post + k] = op(a[(i * n + j) * post + k], b[j]); \
          } \
        } \
      } \
    } \
  }; \
  REGISTER_CPU_OPERATOR( \
      name, \
      BinaryElementwiseOp< \
          input_type, \
          CPUContext, \
          Naive##name##Functor, \
          output_type>)

// See the operations supported here:
// https://eigen.tuxfamily.org/dox-devel/group__QuickRefPage.html
#define EIGEN_ADD(x, y) ((x) + (y))
EIGEN_FUNCTOR(Add, EIGEN_ADD, NumericTypes, SameTypeAsInput);
#undef EIGEN_ADD
#define EIGEN_SUB(x, y) ((x) - (y))
EIGEN_FUNCTOR(Sub, EIGEN_SUB, NumericTypes, SameTypeAsInput);
#undef EIGEN_SUB
#define EIGEN_MUL(x, y) ((x) * (y))
EIGEN_FUNCTOR(Mul, EIGEN_MUL, NumericTypes, SameTypeAsInput);
#undef EIGEN_MUL
#define EIGEN_DIV(x, y) ((x) / (y))
EIGEN_FUNCTOR(Div, EIGEN_DIV, NumericTypes, SameTypeAsInput);
#undef EIGEN_DIV

#define NAIVE_LT(x, y) ((x) < (y))
NAIVE_FUNCTOR(LT, NAIVE_LT, NumericTypes, FixedType<bool>);
#undef NAIVE_LT
#define NAIVE_LE(x, y) ((x) <= (y))
NAIVE_FUNCTOR(LE, NAIVE_LE, NumericTypes, FixedType<bool>);
#undef NAIVE_LE
#define NAIVE_GT(x, y) ((x) > (y))
NAIVE_FUNCTOR(GT, NAIVE_GT, NumericTypes, FixedType<bool>);
#undef NAIVE_GT
#define NAIVE_GE(x, y) ((x) >= (y))
NAIVE_FUNCTOR(GE, NAIVE_GE, NumericTypes, FixedType<bool>);
#undef NAIVE_GE
#define NAIVE_EQ(x, y) ((x) == (y))
NAIVE_FUNCTOR(EQ, NAIVE_EQ, IntTypes, FixedType<bool>);
#undef NAIVE_EQ
#define NAIVE_AND(x, y) ((x) & (y))
NAIVE_FUNCTOR(And, NAIVE_AND, BoolTypes, FixedType<bool>);
#undef NAIVE_AND
#define NAIVE_OR(x, y) ((x) | (y))
NAIVE_FUNCTOR(Or, NAIVE_OR, BoolTypes, FixedType<bool>);
#undef NAIVE_OR
#define NAIVE_XOR(x, y) ((x) ^ (y))
NAIVE_FUNCTOR(Xor, NAIVE_XOR, BoolTypes, FixedType<bool>);
#undef NAIVE_XOR

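// A further comparison operator could be registered with the same pattern.
// For example (illustrative only; "NE" and NAIVE_NE are not defined in this
// file and are not registered here):
//
//   #define NAIVE_NE(x, y) ((x) != (y))
//   NAIVE_FUNCTOR(NE, NAIVE_NE, IntTypes, FixedType<bool>);
//   #undef NAIVE_NE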

struct NotFunctor {
  inline void operator()(const int n, const bool* x, bool* y, CPUContext*) {
    for (int i = 0; i < n; ++i) {
      y[i] = !x[i];
    }
  }
};
REGISTER_CPU_OPERATOR(
    Not,
    UnaryElementwiseOp<BoolTypes, CPUContext, NotFunctor>);

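// Gradient of elementwise division. Given Z = X / Y:
//   dL/dX = dL/dZ * (1 / Y)       ->  dXdata[i] = dZdata[i] / Ydata[i]
//   dL/dY = dL/dZ * (-X / Y^2)
//         = -(dL/dZ) * Z / Y      ->  dYdata[i] = -(dZdata[i] * Zdata[i]) / Ydata[i]
// so the gradient needs only Y, Z, and dZ as inputs; X itself is not required.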
template <>
bool DivGradientOp<float, CPUContext>::RunOnDevice() {
  auto& Y = Input(0);
  auto& Z = Input(1);
  auto& dZ = Input(2);
  auto* dX = Output(0);
  auto* dY = Output(1);
  DCHECK_GT(Y.size(), 0);
  DCHECK_GT(Z.size(), 0);
  dX->ResizeLike(Y);
  dY->ResizeLike(Y);

  const float* Ydata = Y.data<float>();
  const float* Zdata = Z.data<float>();
  const float* dZdata = dZ.data<float>();
  float* dXdata = dX->mutable_data<float>();
  float* dYdata = dY->mutable_data<float>();
#pragma omp parallel for
  for (int i = 0; i < Y.size(); ++i) {
    dXdata[i] = dZdata[i] / Ydata[i];
    dYdata[i] = -(dZdata[i] * Zdata[i]) / Ydata[i];
  }
  return true;
}

REGISTER_CPU_OPERATOR(DivGradient, DivGradientOp<float, CPUContext>);

} // namespace caffe2