#include "caffe2/operators/reduction_ops.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SumElements, SumElementsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SumElementsInt, SumElementsIntOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(SumSqrElements, SumSqrElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(
SumElementsGradient,
SumElementsGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RowwiseMax, MaxReductionOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(
RowwiseMaxGradient,
MaxReductionGradientOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(
ColwiseMaxGradient,
MaxReductionGradientOp<float, CPUContext, false>);
REGISTER_CPU_OPERATOR(ColwiseMax, MaxReductionOp<float, CPUContext, false>);
OPERATOR_SCHEMA(SumElements)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::FLOAT)
.SetDoc("Sums the elements of the input tensor.")
.Arg("average", "whether to average or not")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum");
OPERATOR_SCHEMA(SumElementsInt)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::INT32)
.SetDoc("Sums the integer elements of the input tensor.")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum");
SHOULD_NOT_DO_GRADIENT(SumElementsInt);
OPERATOR_SCHEMA(SumSqrElements)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::FLOAT)
.SetDoc("Sums the squares elements of the input tensor.")
.Arg("average", "whether to average or not")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum of squares");
OPERATOR_SCHEMA(SumElementsGradient).NumInputs(2).NumOutputs(1);
class GetSumElementsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SumElementsGradient",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SumElements, GetSumElementsGradient);
OPERATOR_SCHEMA(RowwiseMax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc("Compute row-wise max reduction of the input tensor.")
.Input(
0,
"X",
"A tenosr of dimensions batch_size x M x N to compute rowwise-max.")
.Output(0, "Y", "batch_size x M rowwise-max results matrix.");
OPERATOR_SCHEMA(RowwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetRowwiseMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"RowwiseMaxGradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(RowwiseMax, GetRowwiseMaxGradient);
OPERATOR_SCHEMA(ColwiseMax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc("Compute column-wise max reduction of the input tensor.")
.Input(
0,
"X",
"A tenosr of dimensions batch_size x M x N to compute colwise-max.")
.Output(0, "Y", "batch_size x N column-max results matrix.");
OPERATOR_SCHEMA(ColwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetColwiseMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"ColwiseMaxGradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(ColwiseMax, GetColwiseMaxGradient);
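// SumElementsGradient broadcasts the incoming scalar gradient to every
// element of X: d(sum(X))/dX_i = 1, so dX = dY for the plain sum, and
// dX = dY / numel(X) when average was set on the forward op. The
// one-element gradient tensor is first copied into a TensorCPU so its
// value can be read on the host.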
template <typename T, class Context>
bool SumElementsGradientOp<T, Context>::RunOnDevice()
// TODO: T21635077 fix float-divide-by-zero undefined behavior
#if defined(__has_feature)
#if __has_feature(__address_sanitizer__)
__attribute__((__no_sanitize__("float-divide-by-zero")))
#endif
#endif
{
auto& X = Input(0);
TensorCPU sum_grad = TensorCPU(Input(1));
auto* dX = Output(0);
dX->ResizeLike(X);
DCHECK_EQ(sum_grad.size(), 1);
math::Set<T, Context>(
dX->size(),
static_cast<T>(sum_grad.data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
dX->template mutable_data<T>(),
&context_);
return true;
}
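// MaxReductionGradient routes the upstream gradient back to the argmax
// positions: dX receives dY wherever X equals the corresponding row-wise
// (or column-wise) max, and zero elsewhere. Note that ties receive the
// gradient at every tied position.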
template <typename T, class Context, bool ROWWISE>
bool MaxReductionGradientOp<T, Context, ROWWISE>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
auto* dX = Output(0);
dX->ResizeLike(X);
CAFFE_ENFORCE_EQ(X.ndim(), 3);
const int batch_size = X.dim32(0);
const int M = X.dim32(1);
const int N = X.dim32(2);
const T* Xdata = X.template data<T>();
const T* Ydata = Y.template data<T>();
const T* dYdata = dY.template data<T>();
T* dXdata = dX->template mutable_data<T>();
const int input_size = M * N;
for (int i = 0; i < batch_size; ++i) {
const T* Xdata_i = Xdata + i * input_size;
T* dXdata_i = dXdata + i * input_size;
if (ROWWISE) {
const T* Ydata_i = Ydata + i * M;
const T* dYdata_i = dYdata + i * M;
for (int m = 0; m < M; ++m) {
const T* Xdata_m = Xdata_i + m * N;
T* dXdata_m = dXdata_i + m * N;
for (int n = 0; n < N; ++n) {
if (Xdata_m[n] == Ydata_i[m]) {
dXdata_m[n] = dYdata_i[m];
} else {
dXdata_m[n] = static_cast<T>(0);
}
}
}
} else {
const T* Ydata_i = Ydata + i * N;
const T* dYdata_i = dYdata + i * N;
for (int n = 0; n < N; ++n) {
for (int m = 0; m < M; ++m) {
const T* Xdata_m = Xdata_i + m * N;
T* dXdata_m = dXdata_i + m * N;
if (Xdata_m[n] == Ydata_i[n]) {
dXdata_m[n] = dYdata_i[n];
} else {
dXdata_m[n] = static_cast<T>(0);
}
}
}
}
}
return true;
}
} // namespace caffe2