#include "caffe2/operators/reduction_ops.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SumElements, SumElementsOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(SumElementsInt, SumElementsIntOp<int, CPUContext>);
REGISTER_CPU_OPERATOR(SumSqrElements, SumSqrElementsOp<CPUContext>);
REGISTER_CPU_OPERATOR(
SumElementsGradient,
SumElementsGradientOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(RowwiseMax, MaxReductionOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(
RowwiseMaxGradient,
MaxReductionGradientOp<float, CPUContext, true>);
REGISTER_CPU_OPERATOR(
ColwiseMaxGradient,
MaxReductionGradientOp<float, CPUContext, false>);
REGISTER_CPU_OPERATOR(ColwiseMax, MaxReductionOp<float, CPUContext, false>);
OPERATOR_SCHEMA(SumElements)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::FLOAT)
.SetDoc("Sums the elements of the input tensor.")
.Arg("average", "whether to average or not")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum");
OPERATOR_SCHEMA(SumElementsInt)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::INT32)
.SetDoc("Sums the integer elements of the input tensor.")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum");
SHOULD_NOT_DO_GRADIENT(SumElementsInt);
OPERATOR_SCHEMA(SumSqrElements)
.NumInputs(1)
.NumOutputs(1)
.ScalarType(TensorProto::FLOAT)
.SetDoc("Sums the squares elements of the input tensor.")
.Arg("average", "whether to average or not")
.Input(0, "X", "Tensor to sum up")
.Output(0, "sum", "Scalar sum of squares");
OPERATOR_SCHEMA(SumElementsGradient).NumInputs(2).NumOutputs(1);
class GetSumElementsGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"SumElementsGradient",
"",
vector<string>{I(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(SumElements, GetSumElementsGradient);
OPERATOR_SCHEMA(RowwiseMax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc("Compute row-wise max reduction of the input tensor.")
.Input(
0,
"X",
"A tenosr of dimensions batch_size x M x N to compute rowwise-max.")
.Output(0, "Y", "batch_size x M rowwise-max results matrix.");
OPERATOR_SCHEMA(RowwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetRowwiseMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"RowwiseMaxGradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(RowwiseMax, GetRowwiseMaxGradient);
OPERATOR_SCHEMA(ColwiseMax)
.NumInputs(1)
.NumOutputs(1)
.SetDoc("Compute column-wise max reduction of the input tensor.")
.Input(
0,
"X",
"A tenosr of dimensions batch_size x M x N to compute colwise-max.")
.Output(0, "Y", "batch_size x N column-max results matrix.");
OPERATOR_SCHEMA(ColwiseMaxGradient).NumInputs(3).NumOutputs(1);
class GetColwiseMaxGradient : public GradientMakerBase {
using GradientMakerBase::GradientMakerBase;
vector<OperatorDef> GetGradientDefs() override {
return SingleGradientDef(
"ColwiseMaxGradient",
"",
vector<string>{I(0), O(0), GO(0)},
vector<string>{GI(0)});
}
};
REGISTER_GRADIENT(ColwiseMax, GetColwiseMaxGradient);
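// SumElementsGradient broadcasts the incoming scalar gradient to every
// element of X: d(sum(X))/dX_i = 1, so dX = dY for the plain sum, and
// dX = dY / numel(X) when average was set on the forward op. The
// one-element gradient tensor is first copied into a TensorCPU so its
// value can be read on the host.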
template <typename T, class Context>
bool SumElementsGradientOp<T, Context>::RunOnDevice()
// TODO: T21635077 fix float-divide-by-zero undefined behavior
#if defined(__has_feature)
#if __has_feature(__address_sanitizer__)
__attribute__((__no_sanitize__("float-divide-by-zero")))
#endif
#endif
{
auto& X = Input(0);
TensorCPU sum_grad = TensorCPU(Input(1));
auto* dX = Output(0);
dX->ResizeLike(X);
DCHECK_EQ(sum_grad.size(), 1);
math::Set<T, Context>(
dX->size(),
static_cast<T>(sum_grad.data<T>()[0] * (average_ ? 1.0 / X.size() : 1)),
dX->template mutable_data<T>(),
&context_);
return true;
}
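// MaxReductionGradient routes the upstream gradient back to the argmax
// positions: dX receives dY wherever X equals the corresponding row-wise
// (or column-wise) max, and zero elsewhere. Note that ties receive the
// gradient at every tied position.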
template <typename T, class Context, bool ROWWISE>
bool MaxReductionGradientOp<T, Context, ROWWISE>::RunOnDevice() {
auto& X = Input(0);
auto& Y = Input(1);
auto& dY = Input(2);
auto* dX = Output(0);
dX->ResizeLike(X);
CAFFE_ENFORCE_EQ(X.ndim(), 3);
const int batch_size = X.dim32(0);
const int M = X.dim32(1);
const int N = X.dim32(2);
const T* Xdata = X.template data<T>();
const T* Ydata = Y.template data<T>();
const T* dYdata = dY.template data<T>();
T* dXdata = dX->template mutable_data<T>();
const int input_size = M * N;
for (int i = 0; i < batch_size; ++i) {
const T* Xdata_i = Xdata + i * input_size;
T* dXdata_i = dXdata + i * input_size;
if (ROWWISE) {
const T* Ydata_i = Ydata + i * M;
const T* dYdata_i = dYdata + i * M;
for (int m = 0; m < M; ++m) {
const T* Xdata_m = Xdata_i + m * N;
T* dXdata_m = dXdata_i + m * N;
for (int n = 0; n < N; ++n) {
if (Xdata_m[n] == Ydata_i[m]) {
dXdata_m[n] = dYdata_i[m];
} else {
dXdata_m[n] = static_cast<T>(0);
}
}
}
} else {
const T* Ydata_i = Ydata + i * N;
const T* dYdata_i = dYdata + i * N;
for (int n = 0; n < N; ++n) {
for (int m = 0; m < M; ++m) {
const T* Xdata_m = Xdata_i + m * N;
T* dXdata_m = dXdata_i + m * N;
if (Xdata_m[n] == Ydata_i[n]) {
dXdata_m[n] = dYdata_i[n];
} else {
dXdata_m[n] = static_cast<T>(0);
}
}
}
}
}
return true;
}
} // namespace caffe2