caffe2/operators/elementwise_op.h - platform/external/pytorch - Git at Google

 /**
  * Copyright (c) 2016-present, Facebook, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
 #define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

 #include "caffe2/core/common_omp.h"
 #include "caffe2/core/context.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/core/operator.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/utils/math.h"

 namespace caffe2 {

 using NumericTypes = TensorTypes<int32_t, int64_t, float, double>;
 using IntTypes = TensorTypes<int32_t, int64_t>;
 using BoolTypes = TensorTypes<bool>;

 struct SameTypeAsInput {
   template <typename T>
   using type = T;
 };

 template <typename R>
 struct FixedType {
   template <typename T>
   using type = R;
 };

 template <
     typename InputTypes,
     class Context,
     class Functor,
     class TypeMap = SameTypeAsInput>
 class UnaryElementwiseWithArgsOp : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws), functor_(*this) {}

   bool RunOnDevice() override {
     return DispatchHelper<InputTypes>::call(this, Input(0));
   }

   template <typename T>
   bool DoRunWithType() {
     auto& input = Input(0);
     auto* output = Output(0);
     output->ResizeLike(input);
     using R = typename TypeMap::template type<T>;
     functor_(
         input.size(),
         input.template data<T>(),
         output->template mutable_data<R>(),
         &context_);
     return true;
   }

  private:
   Functor functor_;
 };

 /**
  * WithDefaultConstructor is a functor that can be used as the functor of an
  * UnaryElementwiseWithArgsOp. It simply forwards the operator() call into
  * another functor that doesn't accept arguments in its constructor.
  */
 template <typename Functor>
 struct WithDefaultConstructor {
   explicit WithDefaultConstructor(OperatorBase& /*op*/) {}

   template <typename In, typename Out, typename Context>
   void operator()(int n, const In* in, Out* out, Context* c) {
     Functor()(n, in, out, c);
   }
 };

 /**
  * UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the
  * difference that it takes a functor with default constructor, e.g. that does
  * not need to take into consideration any arguments during operator creation.
  */
 template <
     typename InputTypes,
     class Context,
     class Functor,
     class OutputType = SameTypeAsInput>
 using UnaryElementwiseOp = UnaryElementwiseWithArgsOp<
     InputTypes,
     Context,
     WithDefaultConstructor<Functor>,
     OutputType>;

 /**
  * Performs a binary operation (e.g. +, - or /) with optional broadcast support.
  *
  * Functor specifies actual operation to be performed.
  *
  * If AllowBroadcast=false tensors has to be of exactly the same shape.
  *
  * If AllowBroadcast=true it support limited broadcasting of the right-hand-side
  * argument to match the shape of left-hand-side argument. Only suffix matching
  * is supported for now, 1-dim expansion doesn't work yet. More precisely
  * tensors A and B can be operated on iff
  *   `shape(A)[-len(shape(B)):] == * shape(B)`
  */
 template <
     typename InputTypes,
     class Context,
     class Functor,
     class TypeMap = SameTypeAsInput>
 class BinaryElementwiseOp : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;

   BinaryElementwiseOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0),
         OP_SINGLE_ARG(int, "axis", axis_, -1),
         OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
         OP_SINGLE_ARG(string, "order", order_, "NCHW"),
         functor_() {
     // Figure out the correct axis to use.
     if (enable_broadcast_) {
       if (axis_ != -1) {
         // Get axis from an explicit axis argument.
         CAFFE_ENFORCE_EQ(
             axis_str_.size(),
             0,
             "Args axis and axis_str cannot be used simultaneously.");
       } else if (axis_str_.size()) {
         // Get the axis index semantically.
         CAFFE_ENFORCE_EQ(
             axis_str_.size(), 1, "Unsupported axis string", axis_str_);
         size_t semantic_axis_ = order_.find(axis_str_);
         CAFFE_ENFORCE_NE(
             semantic_axis_,
             string::npos,
             "Unrecognizable axis string ",
             axis_str_,
             " from order string ",
             order_);
         axis_ = semantic_axis_;
       }
     } else {
       CAFFE_ENFORCE(
           axis_ == -1 && axis_str_.size() == 0,
           "Do not specify axis or axis_str if broadcast is not enabled.");
     }
   }

   bool RunOnDevice() override {
     return DispatchHelper<InputTypes>::call(this, Input(0));
   }

   template <typename T>
   bool DoRunWithType() {
     const auto& A = Input(0);
     const auto& B = Input(1);
     auto* C = Output(0);
     CAFFE_ENFORCE(
         &B != C || !enable_broadcast_,
         "In-place is allowed only with the first tensor when broadcasting");
     C->ResizeLike(A);
     const T* Adata = A.template data<T>();
     const T* Bdata = B.template data<T>();
     auto* Cdata =
         C->template mutable_data<typename TypeMap::template type<T>>();
     if (!enable_broadcast_) {
       CAFFE_ENFORCE_EQ(
           A.dims(),
           B.dims(),
           "Dimension mismatch - did you forget to set broadcast=1?");
       functor_.template Run<false>(A.size(), Adata, Bdata, Cdata, &context_);
     } else if (B.size() == 1) {
       functor_.template Run<true>(A.size(), Adata, Bdata, Cdata, &context_);
     } else {
       CAFFE_ENFORCE_GT(
           A.ndim(),
           B.ndim(),
           "If you are doing broadcasting, input1 should have "
           "a smaller number of dimensions.");
       const int axis = (axis_ == -1 ? A.ndim() - B.ndim() : axis_);
       CAFFE_ENFORCE(
           axis >= 0 && axis < A.ndim(),
           "Broadcast axis should be in the range of the number "
           "of dimensions of the first input.");
       size_t pre = 1, n = 1, post = 1;
       for (int i = 0; i < axis; ++i) {
         pre *= A.dim(i);
       }
       for (int i = 0; i < B.ndim(); ++i) {
         CAFFE_ENFORCE_EQ(
             A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
         n *= B.dim(i);
       }
       for (int i = axis + B.ndim(); i < A.ndim(); ++i) {
         post *= A.dim(i);
       }
       if (post == 1) {
         functor_.RunWithBroadcast(Adata, Bdata, Cdata, pre, n, &context_);
       } else {
         functor_.RunWithBroadcast2(
             Adata, Bdata, Cdata, pre, n, post, &context_);
       }
     }
     return true;
   }

  private:
   bool enable_broadcast_;
   int axis_;
   string axis_str_;
   string order_;
   Functor functor_;
 };

 template <typename Functor>
 struct WithoutBroadcast {
   template <bool b_is_scalar, typename T, typename R, typename Context>
   inline void Run(size_t n, const T* a, const T* b, R* out, Context* c) {
     if (b_is_scalar) {
       CAFFE_THROW("Broadcast not supported.");
     } else {
       Functor().Run(n, a, b, out, c);
     }
   }
   template <typename T, typename R, typename Context>
   inline void RunWithBroadcast(
       const T* /*a*/,
       const T* /*b*/,
       R* /*out*/,
       size_t /*pre*/,
       size_t /*n*/,
       Context*) {
     CAFFE_NOT_IMPLEMENTED;
   }
   template <typename T, typename R, typename Context>
   inline void RunWithBroadcast2(
       const T* /*a*/,
       const T* /*b*/,
       R* /*out*/,
       size_t /*pre*/,
       size_t /*n*/,
       size_t /*post*/,
       Context*) {
     CAFFE_NOT_IMPLEMENTED;
   }
 };

 // Gradient operator for elementwise division.
 template <class Context>
 class DivGradientOp final : public Operator<Context> {
  public:
   USE_SIMPLE_CTOR_DTOR(DivGradientOp);
   USE_OPERATOR_CONTEXT_FUNCTIONS;

   bool RunOnDevice() override;
 };

 namespace SRLHelper {

 template <typename T>
 void sum2one(const T* a, T* y, size_t n);

 template <typename T>
 void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*);

 template <typename T>
 void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*);

 template <typename T>
 void RunWithBroadcast2(
     const T* a,
     T* y,
     size_t pre,
     size_t n,
     size_t post,
     CPUContext*);

 } // namespace SRLHelper

 // Sum reduction operator that is used for computing the gradient in cases
 // where the forward op is in broadcast mode.
 template <class Context>
 class SumReduceLikeOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
   SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
         OP_SINGLE_ARG(int, "axis", axis_, -1),
         OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
         OP_SINGLE_ARG(string, "order", order_, "NCHW") {
     if (axis_ != -1) {
       // Get axis from an explicit axis argument.
       CAFFE_ENFORCE_EQ(
           axis_str_.size(),
           0,
           "Args axis and axis_str cannot be used simultaneously.");
     } else if (axis_str_.size()) {
       // Get the axis index semantically.
       CAFFE_ENFORCE_EQ(
           axis_str_.size(), 1, "Unsupported axis string", axis_str_);
       size_t semantic_axis = order_.find(axis_str_);
       CAFFE_ENFORCE_NE(
           semantic_axis,
           string::npos,
           "Unrecognizable axis string ",
           axis_str_,
           " from order string ",
           order_);
       axis_ = semantic_axis;
     }
   }

   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0));
   }

   template <typename T>
   bool DoRunWithType();

  private:
   int axis_;
   string axis_str_;
   string order_;
   Tensor<Context> ones_;
   Tensor<Context> sum_buffer_;
 };

 template <class Context>
 bool DivGradientOp<Context>::RunOnDevice() {
   auto& Y = Input(0);
   auto& Z = Input(1);
   auto& dZ = Input(2);
   auto* dX = Output(0);
   auto* dY = Output(1);
   CAFFE_ENFORCE_GT(Y.size(), 0);
   CAFFE_ENFORCE_GT(Z.size(), 0);
   dX->ResizeLike(Y);
   dY->ResizeLike(Y);

   const float* Ydata = Y.template data<float>();
   const float* Zdata = Z.template data<float>();
   const float* dZdata = dZ.template data<float>();
   float* dXdata = dX->template mutable_data<float>();
   float* dYdata = dY->template mutable_data<float>();

   ElementWiseDivide(context_, Y.size(), dXdata, dYdata, dZdata, Ydata, Zdata);
   return true;
 }

 // For arithmetic operators, Eigen provides a good way to vectorize even
 // when broadcasting.
 #define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type)               \
   struct Eigen##name##Functor {                                              \
     template <int b_is_scalar, typename T, typename R>                       \
     inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
       if (b_is_scalar) {                                                     \
         EigenVectorArrayMap<R>(out, n) =                                     \
             eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0]));           \
       } else {                                                               \
         EigenVectorArrayMap<R>(out, n) = eigen_op(                           \
             (ConstEigenVectorArrayMap<T>(a, n)),                             \
             (ConstEigenVectorArrayMap<T>(b, n)));                            \
       }                                                                      \
     }                                                                        \
     template <typename T, typename R>                                        \
     void RunWithBroadcast(                                                   \
         const T* a,                                                          \
         const T* b,                                                          \
         R* out,                                                              \
         size_t pre,                                                          \
         size_t n,                                                            \
         CPUContext*) {                                                       \
       EigenArrayMap<R>(out, n, pre) = eigen_op(                              \
           (ConstEigenArrayMap<T>(a, n, pre).colwise()),                      \
           (ConstEigenVectorArrayMap<T>(b, n)));                              \
     }                                                                        \
     template <typename T, typename R>                                        \
     void RunWithBroadcast2(                                                  \
         const T* a,                                                          \
         const T* b,                                                          \
         R* out,                                                              \
         size_t pre,                                                          \
         size_t n,                                                            \
         size_t post,                                                         \
         CPUContext*) {                                                       \
       for (int i = 0; i < pre; ++i) {                                        \
         EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op(            \
             (ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()),    \
             (Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n)));   \
       }                                                                      \
     }                                                                        \
   };                                                                         \
   REGISTER_CPU_OPERATOR(                                                     \
       name,                                                                  \
       BinaryElementwiseOp<                                                   \
           input_type,                                                        \
           CPUContext,                                                        \
           Eigen##name##Functor,                                              \
           output_type>)

 } // namespace caffe2

 #endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
	/**
	* Copyright (c) 2016-present, Facebook, Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#ifndef CAFFE2_OPERATORS_ELEMENTWISE_OP_H_
	#define CAFFE2_OPERATORS_ELEMENTWISE_OP_H_

	#include "caffe2/core/common_omp.h"
	#include "caffe2/core/context.h"
	#include "caffe2/core/logging.h"
	#include "caffe2/core/operator.h"
	#include "caffe2/core/tensor.h"
	#include "caffe2/utils/math.h"

	namespace caffe2 {

	using NumericTypes = TensorTypes<int32_t, int64_t, float, double>;
	using IntTypes = TensorTypes<int32_t, int64_t>;
	using BoolTypes = TensorTypes<bool>;

	struct SameTypeAsInput {
	template <typename T>
	using type = T;
	};

	template <typename R>
	struct FixedType {
	template <typename T>
	using type = R;
	};

	template <
	typename InputTypes,
	class Context,
	class Functor,
	class TypeMap = SameTypeAsInput>
	class UnaryElementwiseWithArgsOp : public Operator<Context> {
	public:
	USE_OPERATOR_CONTEXT_FUNCTIONS;
	UnaryElementwiseWithArgsOp(const OperatorDef& operator_def, Workspace* ws)
	: Operator<Context>(operator_def, ws), functor_(*this) {}

	bool RunOnDevice() override {
	return DispatchHelper<InputTypes>::call(this, Input(0));
	}

	template <typename T>
	bool DoRunWithType() {
	auto& input = Input(0);
	auto* output = Output(0);
	output->ResizeLike(input);
	using R = typename TypeMap::template type<T>;
	functor_(
	input.size(),
	input.template data<T>(),
	output->template mutable_data<R>(),
	&context_);
	return true;
	}

	private:
	Functor functor_;
	};

	/**
	* WithDefaultConstructor is a functor that can be used as the functor of an
	* UnaryElementwiseWithArgsOp. It simply forwards the operator() call into
	* another functor that doesn't accept arguments in its constructor.
	*/
	template <typename Functor>
	struct WithDefaultConstructor {
	explicit WithDefaultConstructor(OperatorBase& /op/) {}

	template <typename In, typename Out, typename Context>
	void operator()(int n, const In* in, Out* out, Context* c) {
	Functor()(n, in, out, c);
	}
	};

	/**
	* UnaryElementwiseOp is a wrapper around UnaryElementwiseWithArgsOp, with the
	* difference that it takes a functor with default constructor, e.g. that does
	* not need to take into consideration any arguments during operator creation.
	*/
	template <
	typename InputTypes,
	class Context,
	class Functor,
	class OutputType = SameTypeAsInput>
	using UnaryElementwiseOp = UnaryElementwiseWithArgsOp<
	InputTypes,
	Context,
	WithDefaultConstructor<Functor>,
	OutputType>;

	/**
	* Performs a binary operation (e.g. +, - or /) with optional broadcast support.
	*
	* Functor specifies actual operation to be performed.
	*
	* If AllowBroadcast=false tensors has to be of exactly the same shape.
	*
	* If AllowBroadcast=true it support limited broadcasting of the right-hand-side
	* argument to match the shape of left-hand-side argument. Only suffix matching
	* is supported for now, 1-dim expansion doesn't work yet. More precisely
	* tensors A and B can be operated on iff
	* `shape(A)[-len(shape(B)):] == * shape(B)`
	*/
	template <
	typename InputTypes,
	class Context,
	class Functor,
	class TypeMap = SameTypeAsInput>
	class BinaryElementwiseOp : public Operator<Context> {
	public:
	USE_OPERATOR_CONTEXT_FUNCTIONS;

	BinaryElementwiseOp(const OperatorDef& operator_def, Workspace* ws)
	: Operator<Context>(operator_def, ws),
	OP_SINGLE_ARG(bool, "broadcast", enable_broadcast_, 0),
	OP_SINGLE_ARG(int, "axis", axis_, -1),
	OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
	OP_SINGLE_ARG(string, "order", order_, "NCHW"),
	functor_() {
	// Figure out the correct axis to use.
	if (enable_broadcast_) {
	if (axis_ != -1) {
	// Get axis from an explicit axis argument.
	CAFFE_ENFORCE_EQ(
	axis_str_.size(),
	0,
	"Args axis and axis_str cannot be used simultaneously.");
	} else if (axis_str_.size()) {
	// Get the axis index semantically.
	CAFFE_ENFORCE_EQ(
	axis_str_.size(), 1, "Unsupported axis string", axis_str_);
	size_t semantic_axis_ = order_.find(axis_str_);
	CAFFE_ENFORCE_NE(
	semantic_axis_,
	string::npos,
	"Unrecognizable axis string ",
	axis_str_,
	" from order string ",
	order_);
	axis_ = semantic_axis_;
	}
	} else {
	CAFFE_ENFORCE(
	axis_ == -1 && axis_str_.size() == 0,
	"Do not specify axis or axis_str if broadcast is not enabled.");
	}
	}

	bool RunOnDevice() override {
	return DispatchHelper<InputTypes>::call(this, Input(0));
	}

	template <typename T>
	bool DoRunWithType() {
	const auto& A = Input(0);
	const auto& B = Input(1);
	auto* C = Output(0);
	CAFFE_ENFORCE(
	&B != C \|\| !enable_broadcast_,
	"In-place is allowed only with the first tensor when broadcasting");
	C->ResizeLike(A);
	const T* Adata = A.template data<T>();
	const T* Bdata = B.template data<T>();
	auto* Cdata =
	C->template mutable_data<typename TypeMap::template type<T>>();
	if (!enable_broadcast_) {
	CAFFE_ENFORCE_EQ(
	A.dims(),
	B.dims(),
	"Dimension mismatch - did you forget to set broadcast=1?");
	functor_.template Run<false>(A.size(), Adata, Bdata, Cdata, &context_);
	} else if (B.size() == 1) {
	functor_.template Run<true>(A.size(), Adata, Bdata, Cdata, &context_);
	} else {
	CAFFE_ENFORCE_GT(
	A.ndim(),
	B.ndim(),
	"If you are doing broadcasting, input1 should have "
	"a smaller number of dimensions.");
	const int axis = (axis_ == -1 ? A.ndim() - B.ndim() : axis_);
	CAFFE_ENFORCE(
	axis >= 0 && axis < A.ndim(),
	"Broadcast axis should be in the range of the number "
	"of dimensions of the first input.");
	size_t pre = 1, n = 1, post = 1;
	for (int i = 0; i < axis; ++i) {
	pre *= A.dim(i);
	}
	for (int i = 0; i < B.ndim(); ++i) {
	CAFFE_ENFORCE_EQ(
	A.dim(i + axis), B.dim(i), "Broadcast dimension mismatch.");
	n *= B.dim(i);
	}
	for (int i = axis + B.ndim(); i < A.ndim(); ++i) {
	post *= A.dim(i);
	}
	if (post == 1) {
	functor_.RunWithBroadcast(Adata, Bdata, Cdata, pre, n, &context_);
	} else {
	functor_.RunWithBroadcast2(
	Adata, Bdata, Cdata, pre, n, post, &context_);
	}
	}
	return true;
	}

	private:
	bool enable_broadcast_;
	int axis_;
	string axis_str_;
	string order_;
	Functor functor_;
	};

	template <typename Functor>
	struct WithoutBroadcast {
	template <bool b_is_scalar, typename T, typename R, typename Context>
	inline void Run(size_t n, const T* a, const T* b, R* out, Context* c) {
	if (b_is_scalar) {
	CAFFE_THROW("Broadcast not supported.");
	} else {
	Functor().Run(n, a, b, out, c);
	}
	}
	template <typename T, typename R, typename Context>
	inline void RunWithBroadcast(
	const T* /a/,
	const T* /b/,
	R* /out/,
	size_t /pre/,
	size_t /n/,
	Context*) {
	CAFFE_NOT_IMPLEMENTED;
	}
	template <typename T, typename R, typename Context>
	inline void RunWithBroadcast2(
	const T* /a/,
	const T* /b/,
	R* /out/,
	size_t /pre/,
	size_t /n/,
	size_t /post/,
	Context*) {
	CAFFE_NOT_IMPLEMENTED;
	}
	};

	// Gradient operator for elementwise division.
	template <class Context>
	class DivGradientOp final : public Operator<Context> {
	public:
	USE_SIMPLE_CTOR_DTOR(DivGradientOp);
	USE_OPERATOR_CONTEXT_FUNCTIONS;

	bool RunOnDevice() override;
	};

	namespace SRLHelper {

	template <typename T>
	void sum2one(const T* a, T* y, size_t n);

	template <typename T>
	void RunWithBroadcastFront(const T* a, T* y, size_t pre, size_t n, CPUContext*);

	template <typename T>
	void RunWithBroadcastBack(const T* a, T* y, size_t post, size_t n, CPUContext*);

	template <typename T>
	void RunWithBroadcast2(
	const T* a,
	T* y,
	size_t pre,
	size_t n,
	size_t post,
	CPUContext*);

	} // namespace SRLHelper

	// Sum reduction operator that is used for computing the gradient in cases
	// where the forward op is in broadcast mode.
	template <class Context>
	class SumReduceLikeOp final : public Operator<Context> {
	public:
	USE_OPERATOR_CONTEXT_FUNCTIONS;
	SumReduceLikeOp(const OperatorDef& operator_def, Workspace* ws)
	: Operator<Context>(operator_def, ws),
	OP_SINGLE_ARG(int, "axis", axis_, -1),
	OP_SINGLE_ARG(string, "axis_str", axis_str_, ""),
	OP_SINGLE_ARG(string, "order", order_, "NCHW") {
	if (axis_ != -1) {
	// Get axis from an explicit axis argument.
	CAFFE_ENFORCE_EQ(
	axis_str_.size(),
	0,
	"Args axis and axis_str cannot be used simultaneously.");
	} else if (axis_str_.size()) {
	// Get the axis index semantically.
	CAFFE_ENFORCE_EQ(
	axis_str_.size(), 1, "Unsupported axis string", axis_str_);
	size_t semantic_axis = order_.find(axis_str_);
	CAFFE_ENFORCE_NE(
	semantic_axis,
	string::npos,
	"Unrecognizable axis string ",
	axis_str_,
	" from order string ",
	order_);
	axis_ = semantic_axis;
	}
	}

	bool RunOnDevice() override {
	return DispatchHelper<TensorTypes<float, double>>::call(this, Input(0));
	}

	template <typename T>
	bool DoRunWithType();

	private:
	int axis_;
	string axis_str_;
	string order_;
	Tensor<Context> ones_;
	Tensor<Context> sum_buffer_;
	};

	template <class Context>
	bool DivGradientOp<Context>::RunOnDevice() {
	auto& Y = Input(0);
	auto& Z = Input(1);
	auto& dZ = Input(2);
	auto* dX = Output(0);
	auto* dY = Output(1);
	CAFFE_ENFORCE_GT(Y.size(), 0);
	CAFFE_ENFORCE_GT(Z.size(), 0);
	dX->ResizeLike(Y);
	dY->ResizeLike(Y);

	const float* Ydata = Y.template data<float>();
	const float* Zdata = Z.template data<float>();
	const float* dZdata = dZ.template data<float>();
	float* dXdata = dX->template mutable_data<float>();
	float* dYdata = dY->template mutable_data<float>();

	ElementWiseDivide(context_, Y.size(), dXdata, dYdata, dZdata, Ydata, Zdata);
	return true;
	}

	// For arithmetic operators, Eigen provides a good way to vectorize even
	// when broadcasting.
	#define EIGEN_FUNCTOR(name, eigen_op, input_type, output_type) \
	struct Eigen##name##Functor { \
	template <int b_is_scalar, typename T, typename R> \
	inline void Run(size_t n, const T* a, const T* b, R* out, CPUContext*) { \
	if (b_is_scalar) { \
	EigenVectorArrayMap<R>(out, n) = \
	eigen_op((ConstEigenVectorArrayMap<T>(a, n)), (b[0])); \
	} else { \
	EigenVectorArrayMap<R>(out, n) = eigen_op( \
	(ConstEigenVectorArrayMap<T>(a, n)), \
	(ConstEigenVectorArrayMap<T>(b, n))); \
	} \
	} \
	template <typename T, typename R> \
	void RunWithBroadcast( \
	const T* a, \
	const T* b, \
	R* out, \
	size_t pre, \
	size_t n, \
	CPUContext*) { \
	EigenArrayMap<R>(out, n, pre) = eigen_op( \
	(ConstEigenArrayMap<T>(a, n, pre).colwise()), \
	(ConstEigenVectorArrayMap<T>(b, n))); \
	} \
	template <typename T, typename R> \
	void RunWithBroadcast2( \
	const T* a, \
	const T* b, \
	R* out, \
	size_t pre, \
	size_t n, \
	size_t post, \
	CPUContext*) { \
	for (int i = 0; i < pre; ++i) { \
	EigenArrayMap<R>(out + i * n * post, post, n) = eigen_op( \
	(ConstEigenArrayMap<T>(a + i * n * post, post, n).rowwise()), \
	(Eigen::Map<const Eigen::Array<T, 1, Eigen::Dynamic>>(b, n))); \
	} \
	} \
	}; \
	REGISTER_CPU_OPERATOR( \
	name, \
	BinaryElementwiseOp< \
	input_type, \
	CPUContext, \
	Eigen##name##Functor, \
	output_type>)

	} // namespace caffe2

	#endif // CAFFE2_OPERATORS_ELEMENTWISE_OP_H_