caffe2/operators/fully_connected_op.cc - platform/external/pytorch - Git at Google

 #include <functional>

 #include "caffe2/operators/fully_connected_op.h"

 namespace caffe2 {

 // Register the CPU implementations of the FC forward and gradient operators.
 REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<CPUContext>);
 REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<CPUContext>);

 // FCTransposed reuses the same operator template on the default engine,
 // passing `false` as the third template argument so the weight is not
 // transposed.
 REGISTER_CPU_OPERATOR(
     FCTransposed,
     FullyConnectedOp<
         CPUContext,
         DefaultEngine,
         false /* don't transpose weight */>);
 // Gradient counterpart of FCTransposed, instantiated with the same `false`
 // third template argument.
 REGISTER_CPU_OPERATOR(
     FCTransposedGradient,
     FullyConnectedGradientOp<
         CPUContext,
         DefaultEngine,
         false /* don't transpose weight */>);

 namespace {
 // Shape inference shared by the FC and FCTransposed schemas.
 //
 // in[0] is treated as X and in[1] as W. The single returned shape keeps the
 // dimensions of X that precede the canonical `axis` and appends one dimension
 // N. N is size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
 // size_to_dim_(axis_w, W) otherwise. If either input shape is unknown the
 // output is flagged as unknown instead.
 std::vector<TensorShape> FCShapeInference(
     const OperatorDef& def,
     const vector<TensorShape>& in,
     bool pretransposed_weight) {
   vector<TensorShape> out(1);

   // Without both input shapes there is nothing to infer.
   const bool shapes_known = !in[0].unknown_shape() && !in[1].unknown_shape();
   if (!shapes_known) {
     out[0].set_unknown_shape(true);
     return out;
   }

   ArgumentHelper helper(def);

   const auto x_axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis =
       canonical_axis_index_(x_axis, in[0].dims().size());
   const auto w_axis = helper.GetSingleArgument<int32_t>("axis_w", 1);
   const int canonical_axis_w =
       canonical_axis_index_(w_axis, in[1].dims().size());

   int N = 0;
   if (pretransposed_weight) {
     N = size_from_dim_(canonical_axis_w, GetDimsVector(in[1]));
   } else {
     N = size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
   }

   // Start from X's dimensions, truncate after `axis`, and store N there.
   vector<int> y_shape(in[0].dims().begin(), in[0].dims().end());
   CAFFE_ENFORCE_LE(canonical_axis + 1, y_shape.size());
   y_shape.resize(canonical_axis + 1);
   y_shape[canonical_axis] = N;
   out[0] = CreateTensorShape(y_shape, in[0].data_type());
   return out;
 }

 // Cost estimate shared by the FC and FCTransposed schemas.
 //
 // in[0] is treated as X and in[1] as W. With the canonical `axis`,
 // M = size_to_dim_(axis, X) and K = size_from_dim_(axis, X). N is
 // size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
 // size_to_dim_(axis_w, W) otherwise.
 //
 // M, K and N are held as uint64_t so that the products below are evaluated in
 // 64-bit unsigned arithmetic; with `int` operands 2 * K * M * N overflows
 // (undefined behaviour) for realistically sized layers.
 //
 // Returns a Cost with
 //   flops        = 2 * K * M * N + M * N
 //   bytes_moved  = M * N * sizeof(float)
 //   params_bytes = (K * N + N) * sizeof(float)
 OpSchema::Cost CostInferenceForFC(
     const OperatorDef& def,
     const vector<TensorShape>& in,
     bool pretransposed_weight) {
   struct OpSchema::Cost c;
   ArgumentHelper helper(def);

   auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
   const uint64_t M = size_to_dim_(canonical_axis, GetDimsVector(in[0]));
   const uint64_t K = size_from_dim_(canonical_axis, GetDimsVector(in[0]));
   auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
   const int canonical_axis_w =
       canonical_axis_index_(axis_w, in[1].dims().size());
   const uint64_t N = pretransposed_weight
       ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
       : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

   c.flops = 2 * K * M * N + M * N;
   c.bytes_moved = M * N * sizeof(float);
   c.params_bytes = (K * N + N) * sizeof(float);
   return c;
 }

 // Shape inference shared by the FCGradient and FCTransposedGradient schemas.
 //
 // in[0] is treated as X and in[1] as W. The result holds, in order:
 //   [0] dW - same dimensions and data type as W
 //   [1] db - one dimension of length N, with W's data type
 //   [2] dX - same dimensions and data type as X, only when the operator
 //            definition declares exactly three outputs
 // N is size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
 // size_to_dim_(axis_w, W) otherwise.
 std::vector<TensorShape> FCGradientShapeInference(
     const OperatorDef& def,
     const vector<TensorShape>& in,
     bool pretransposed_weight) {
   ArgumentHelper helper(def);

   const auto w_axis = helper.GetSingleArgument<int32_t>("axis_w", 1);
   const int canonical_axis_w =
       canonical_axis_index_(w_axis, in[1].dims().size());

   int N = 0;
   if (pretransposed_weight) {
     N = size_from_dim_(canonical_axis_w, GetDimsVector(in[1]));
   } else {
     N = size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
   }

   vector<TensorShape> out;

   // dW mirrors W.
   const vector<int> dW_shape(in[1].dims().begin(), in[1].dims().end());
   out.push_back(CreateTensorShape(dW_shape, in[1].data_type()));

   // db
   out.push_back(CreateTensorShape(vector<int>{N}, in[1].data_type()));

   // dX mirrors X, but is only produced when a third output is requested.
   if (def.output_size() == 3) {
     const vector<int> dX_shape(in[0].dims().begin(), in[0].dims().end());
     out.push_back(CreateTensorShape(dX_shape, in[0].data_type()));
   }
   return out;
 }

 // Cost estimate shared by the FCGradient and FCTransposedGradient schemas.
 //
 // in[0] is treated as X and in[1] as W. The output shapes come from
 // FCGradientShapeInference (dW, db and optionally dX). M, K and N are derived
 // from X and W exactly as in the forward cost function.
 //
 // M, K and N are held as uint64_t so that the products are evaluated in
 // 64-bit unsigned arithmetic; with `int` operands M * N * K overflows
 // (undefined behaviour) for realistically sized layers.
 //
 // Returns a Cost with
 //   flops        = 2 * (M * N * K + M * N)   (+ M * N * K when dX is produced)
 //   bytes_moved  = (|dW| + |db|) * sizeof(float)   (+ |dX| * sizeof(float))
 //   params_bytes = (K * N + N) * sizeof(float)
 // where |t| is the product of the dimensions of shape t.
 OpSchema::Cost CostInferenceForFCGradient(
     const OperatorDef& def,
     const vector<TensorShape>& in,
     bool pretransposed_weight) {
   struct OpSchema::Cost c;
   ArgumentHelper helper(def);
   const std::vector<TensorShape> out =
       FCGradientShapeInference(def, in, pretransposed_weight);

   // Both dW (out[0]) and db (out[1]) are read unconditionally below, so two
   // shapes are required, not just one.
   CAFFE_ENFORCE_LE(2, out.size());
   const TensorShape& dW = out[0];
   const TensorShape& db = out[1];

   auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
   const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
   const uint64_t M = size_to_dim_(canonical_axis, GetDimsVector(in[0]));
   const uint64_t K = size_from_dim_(canonical_axis, GetDimsVector(in[0]));
   auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
   const int canonical_axis_w =
       canonical_axis_index_(axis_w, in[1].dims().size());
   const uint64_t N = pretransposed_weight
       ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
       : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

   // Product of all dimensions of a shape (1 for a shape with no dimensions).
   const auto num_elements = [](const TensorShape& shape) {
     uint64_t count = 1;
     for (int i = 0; i < shape.dims().size(); i++) {
       count *= shape.dims(i);
     }
     return count;
   };

   const uint64_t size_dW = num_elements(dW);
   const uint64_t size_db = num_elements(db);

   c.flops = 2 * (M * N * K + M * N);
   c.bytes_moved = (size_dW + size_db) * sizeof(float);
   c.params_bytes = (K * N + N) * sizeof(float);

   // The optional third output (dX) adds its own compute and traffic.
   if (out.size() == 3) {
     const uint64_t size_dX = num_elements(out[2]);
     c.flops += M * N * K;
     c.bytes_moved += size_dX * sizeof(float);
   }
   return c;
 }

 } // namespace

 // Brings _1 / _2 into scope for the std::bind calls in the schemas below.
 using namespace std::placeholders;
 // Schema for FCTransposed: three inputs, one output. Shape and cost inference
 // are bound with pretransposed_weight = true.
 OPERATOR_SCHEMA(FCTransposed)
     .NumInputs(3)
     .NumOutputs(1)
     .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, true))
     .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, true))
     .SetDoc(R"DOC(
 Same as FC, but weight matrix is supposed to be already pretransposed.
 FCTransposed stands for calling BLAS with no noTrans, noTrans
 )DOC");

 // Schema for FC: three inputs (X, W, b), one output (Y). Shape and cost
 // inference are bound with pretransposed_weight = false, i.e. N is taken from
 // the dimensions of W that precede axis_w, so W is (N x K).
 OPERATOR_SCHEMA(FC)
     .NumInputs(3)
     .NumOutputs(1)
     .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
     .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
     .SetDoc(R"DOC(
 Computes the result of passing an input vector X into a fully
 connected layer with 2D weight matrix W and 1D bias vector b. That is,
 the layer computes Y = X * W^T + b, where X has size (M x K),
 W has size (N x K), b has size (N), and Y has size (M x N),
 where M is often the batch size.


 NOTE: X does not need to explicitly be a 2D vector; rather, it will be
 coerced into one. For an arbitrary n-dimensional tensor
 X \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}] where a_i \in N+ and k is
 the axis provided, then X will be coerced into a 2-dimensional tensor with
 dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default
 case where axis=1, this means the X tensor will be coerced into a 2D tensor
 of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size.
 In this situation, we must have a_0 = M and a_1 * ... * a_{n-1} = K.
 Lastly, even though b is a 1D vector of size N, it is copied/resized to
 be size (M x N) implicitly and added to each vector in the batch.
 Each of these dimensions must be matched correctly, or else the operator
 will throw errors.
 )DOC")
     .Arg(
         "axis",
         "(int32_t) default to 1; describes the axis of the inputs; "
         "defaults to one because the 0th axis most likely describes "
         "the batch_size")
     .Arg(
         "axis_w",
         "(int32_t) default to 1; describes the axis of the weight matrix W; "
         "defaults to one because the 0th axis most likely describes "
         "the output size N")
     .Arg("float16_compute", "Whether to use float-16 compute kernel")
     .Input(
         0,
         "X",
         "input tensor that's coerced into a 2D matrix of size (MxK) "
         "as described above")
     .Input(
         1,
         "W",
         "A tensor that is coerced into a 2D blob of size (NxK) "
         "containing fully connected weight matrix")
     .Input(2, "b", "1D blob containing bias vector")
     .Output(0, "Y", "2D output tensor")
     .InheritOnnxSchema("Gemm");

 // Schema for FCGradient: three inputs and two or three outputs. Shape and cost
 // inference are bound with pretransposed_weight = false.
 OPERATOR_SCHEMA(FCGradient)
     .NumInputs(3)
     .NumOutputs(2, 3)
     .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
     .CostInferenceFunction(
         std::bind(CostInferenceForFCGradient, _1, _2, false));
 // Schema for FCTransposedGradient: three inputs and two or three outputs.
 // Shape and cost inference are bound with pretransposed_weight = true, the
 // same flag the forward FCTransposed schema uses, so that N (and therefore the
 // db shape and the cost figures) is taken from the dimensions of W at and
 // after axis_w rather than from the ones before it.
 OPERATOR_SCHEMA(FCTransposedGradient)
     .NumInputs(3)
     .NumOutputs(2, 3)
     .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, true))
     .CostInferenceFunction(
         std::bind(CostInferenceForFCGradient, _1, _2, true));

 namespace {

 class GetFCGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;

   std::vector<OperatorDef> GetGradientDefs() override {
     CAFFE_ENFORCE_EQ(def_.input_size(), 3);
     CAFFE_ENFORCE(def_.type() == "FC" || def_.type() == "FCTransposed");
     return SingleGradientDef(
         def_.type() + "Gradient",
         "",
         vector<string>{I(0), I(1), GO(0)},
         vector<string>{GI(1), GI(2), GI(0)});
   }
 };

 // Both forward operators share one gradient maker; it derives the gradient
 // operator's type from the forward operator's type.
 REGISTER_GRADIENT(FC, GetFCGradient);
 REGISTER_GRADIENT(FCTransposed, GetFCGradient);

 } // namespace

 } // namespace caffe2
	#include <functional>

	#include "caffe2/operators/fully_connected_op.h"

	namespace caffe2 {

	// Register the CPU implementations of the FC forward and gradient operators.
	REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<CPUContext>);
	REGISTER_CPU_OPERATOR(FCGradient, FullyConnectedGradientOp<CPUContext>);

	// FCTransposed reuses the same operator template on the default engine,
	// passing `false` as the third template argument so the weight is not
	// transposed.
	REGISTER_CPU_OPERATOR(
	FCTransposed,
	FullyConnectedOp<
	CPUContext,
	DefaultEngine,
	false /* don't transpose weight */>);
	// Gradient counterpart of FCTransposed, instantiated with the same `false`
	// third template argument.
	REGISTER_CPU_OPERATOR(
	FCTransposedGradient,
	FullyConnectedGradientOp<
	CPUContext,
	DefaultEngine,
	false /* don't transpose weight */>);

	namespace {
	// Shape inference shared by the FC and FCTransposed schemas.
	//
	// in[0] is treated as X and in[1] as W. The single returned shape keeps the
	// dimensions of X that precede the canonical `axis` and appends one dimension
	// N. N is size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
	// size_to_dim_(axis_w, W) otherwise. If either input shape is unknown the
	// output is flagged as unknown instead.
	std::vector<TensorShape> FCShapeInference(
	    const OperatorDef& def,
	    const vector<TensorShape>& in,
	    bool pretransposed_weight) {
	  vector<TensorShape> out(1);

	  // Without both input shapes there is nothing to infer.
	  if (in[0].unknown_shape() || in[1].unknown_shape()) {
	    out[0].set_unknown_shape(true);
	    return out;
	  }

	  ArgumentHelper helper(def);

	  auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
	  const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
	  auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
	  const int canonical_axis_w =
	      canonical_axis_index_(axis_w, in[1].dims().size());
	  const int N = pretransposed_weight
	      ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
	      : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

	  // Start from X's dimensions, truncate after `axis`, and store N there.
	  vector<int> y_shape(in[0].dims().begin(), in[0].dims().end());
	  CAFFE_ENFORCE_LE(canonical_axis + 1, y_shape.size());
	  y_shape.resize(canonical_axis + 1);
	  y_shape[canonical_axis] = N;
	  out[0] = CreateTensorShape(y_shape, in[0].data_type());
	  return out;
	}

	// Cost estimate shared by the FC and FCTransposed schemas.
	//
	// in[0] is treated as X and in[1] as W. With the canonical `axis`,
	// M = size_to_dim_(axis, X) and K = size_from_dim_(axis, X). N is
	// size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
	// size_to_dim_(axis_w, W) otherwise.
	//
	// M, K and N are held as uint64_t so that the products below are evaluated in
	// 64-bit unsigned arithmetic; with `int` operands 2 * K * M * N overflows
	// (undefined behaviour) for realistically sized layers.
	//
	// Returns a Cost with
	//   flops        = 2 * K * M * N + M * N
	//   bytes_moved  = M * N * sizeof(float)
	//   params_bytes = (K * N + N) * sizeof(float)
	OpSchema::Cost CostInferenceForFC(
	    const OperatorDef& def,
	    const vector<TensorShape>& in,
	    bool pretransposed_weight) {
	  struct OpSchema::Cost c;
	  ArgumentHelper helper(def);

	  auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
	  const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
	  const uint64_t M = size_to_dim_(canonical_axis, GetDimsVector(in[0]));
	  const uint64_t K = size_from_dim_(canonical_axis, GetDimsVector(in[0]));
	  auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
	  const int canonical_axis_w =
	      canonical_axis_index_(axis_w, in[1].dims().size());
	  const uint64_t N = pretransposed_weight
	      ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
	      : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

	  c.flops = 2 * K * M * N + M * N;
	  c.bytes_moved = M * N * sizeof(float);
	  c.params_bytes = (K * N + N) * sizeof(float);
	  return c;
	}

	// Shape inference shared by the FCGradient and FCTransposedGradient schemas.
	//
	// in[0] is treated as X and in[1] as W. The result holds, in order:
	//   [0] dW - same dimensions and data type as W
	//   [1] db - one dimension of length N, with W's data type
	//   [2] dX - same dimensions and data type as X, only when the operator
	//            definition declares exactly three outputs
	// N is size_from_dim_(axis_w, W) when `pretransposed_weight` is true and
	// size_to_dim_(axis_w, W) otherwise.
	std::vector<TensorShape> FCGradientShapeInference(
	    const OperatorDef& def,
	    const vector<TensorShape>& in,
	    bool pretransposed_weight) {
	  ArgumentHelper helper(def);

	  const auto w_axis = helper.GetSingleArgument<int32_t>("axis_w", 1);
	  const int canonical_axis_w =
	      canonical_axis_index_(w_axis, in[1].dims().size());

	  int N = 0;
	  if (pretransposed_weight) {
	    N = size_from_dim_(canonical_axis_w, GetDimsVector(in[1]));
	  } else {
	    N = size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));
	  }

	  vector<TensorShape> out;

	  // dW mirrors W.
	  const vector<int> dW_shape(in[1].dims().begin(), in[1].dims().end());
	  out.push_back(CreateTensorShape(dW_shape, in[1].data_type()));

	  // db
	  out.push_back(CreateTensorShape(vector<int>{N}, in[1].data_type()));

	  // dX mirrors X, but is only produced when a third output is requested.
	  if (def.output_size() == 3) {
	    const vector<int> dX_shape(in[0].dims().begin(), in[0].dims().end());
	    out.push_back(CreateTensorShape(dX_shape, in[0].data_type()));
	  }
	  return out;
	}

	// Cost estimate shared by the FCGradient and FCTransposedGradient schemas.
	//
	// in[0] is treated as X and in[1] as W. The output shapes come from
	// FCGradientShapeInference (dW, db and optionally dX). M, K and N are derived
	// from X and W exactly as in the forward cost function.
	//
	// M, K and N are held as uint64_t so that the products are evaluated in
	// 64-bit unsigned arithmetic; with `int` operands M * N * K overflows
	// (undefined behaviour) for realistically sized layers.
	//
	// Returns a Cost with
	//   flops        = 2 * (M * N * K + M * N)   (+ M * N * K when dX is produced)
	//   bytes_moved  = (|dW| + |db|) * sizeof(float)   (+ |dX| * sizeof(float))
	//   params_bytes = (K * N + N) * sizeof(float)
	// where |t| is the product of the dimensions of shape t.
	OpSchema::Cost CostInferenceForFCGradient(
	    const OperatorDef& def,
	    const vector<TensorShape>& in,
	    bool pretransposed_weight) {
	  struct OpSchema::Cost c;
	  ArgumentHelper helper(def);
	  const std::vector<TensorShape> out =
	      FCGradientShapeInference(def, in, pretransposed_weight);

	  // Both dW (out[0]) and db (out[1]) are read unconditionally below, so two
	  // shapes are required, not just one.
	  CAFFE_ENFORCE_LE(2, out.size());
	  const TensorShape& dW = out[0];
	  const TensorShape& db = out[1];

	  auto axis = helper.GetSingleArgument<int32_t>("axis", 1);
	  const auto canonical_axis = canonical_axis_index_(axis, in[0].dims().size());
	  const uint64_t M = size_to_dim_(canonical_axis, GetDimsVector(in[0]));
	  const uint64_t K = size_from_dim_(canonical_axis, GetDimsVector(in[0]));
	  auto axis_w = helper.GetSingleArgument<int32_t>("axis_w", 1);
	  const int canonical_axis_w =
	      canonical_axis_index_(axis_w, in[1].dims().size());
	  const uint64_t N = pretransposed_weight
	      ? size_from_dim_(canonical_axis_w, GetDimsVector(in[1]))
	      : size_to_dim_(canonical_axis_w, GetDimsVector(in[1]));

	  // Product of all dimensions of a shape (1 for a shape with no dimensions).
	  const auto num_elements = [](const TensorShape& shape) {
	    uint64_t count = 1;
	    for (int i = 0; i < shape.dims().size(); i++) {
	      count *= shape.dims(i);
	    }
	    return count;
	  };

	  const uint64_t size_dW = num_elements(dW);
	  const uint64_t size_db = num_elements(db);

	  c.flops = 2 * (M * N * K + M * N);
	  c.bytes_moved = (size_dW + size_db) * sizeof(float);
	  c.params_bytes = (K * N + N) * sizeof(float);

	  // The optional third output (dX) adds its own compute and traffic.
	  if (out.size() == 3) {
	    const uint64_t size_dX = num_elements(out[2]);
	    c.flops += M * N * K;
	    c.bytes_moved += size_dX * sizeof(float);
	  }
	  return c;
	}

	} // namespace

	// Brings _1 / _2 into scope for the std::bind calls in the schemas below.
	using namespace std::placeholders;
	// Schema for FCTransposed: three inputs, one output. Shape and cost inference
	// are bound with pretransposed_weight = true.
	OPERATOR_SCHEMA(FCTransposed)
	    .NumInputs(3)
	    .NumOutputs(1)
	    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, true))
	    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, true))
	    .SetDoc(R"DOC(
	Same as FC, but weight matrix is supposed to be already pretransposed.
	FCTransposed stands for calling BLAS with no noTrans, noTrans
	)DOC");

	// Schema for FC: three inputs (X, W, b), one output (Y). Shape and cost
	// inference are bound with pretransposed_weight = false, i.e. N is taken from
	// the dimensions of W that precede axis_w, so W is (N x K).
	OPERATOR_SCHEMA(FC)
	    .NumInputs(3)
	    .NumOutputs(1)
	    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
	    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
	    .SetDoc(R"DOC(
	Computes the result of passing an input vector X into a fully
	connected layer with 2D weight matrix W and 1D bias vector b. That is,
	the layer computes Y = X * W^T + b, where X has size (M x K),
	W has size (N x K), b has size (N), and Y has size (M x N),
	where M is often the batch size.


	NOTE: X does not need to explicitly be a 2D vector; rather, it will be
	coerced into one. For an arbitrary n-dimensional tensor
	X \in [a_0, a_1, ...,a_{k-1}, a_k, ..., a_{n-1}] where a_i \in N+ and k is
	the axis provided, then X will be coerced into a 2-dimensional tensor with
	dimensions [a_0 * ... * a_{k-1}, a_k * ... * a_{n-1}]. For the default
	case where axis=1, this means the X tensor will be coerced into a 2D tensor
	of dimensions [a_0, a_1 * ... * a_{n-1}], where a_0 is often the batch size.
	In this situation, we must have a_0 = M and a_1 * ... * a_{n-1} = K.
	Lastly, even though b is a 1D vector of size N, it is copied/resized to
	be size (M x N) implicitly and added to each vector in the batch.
	Each of these dimensions must be matched correctly, or else the operator
	will throw errors.
	)DOC")
	    .Arg(
	        "axis",
	        "(int32_t) default to 1; describes the axis of the inputs; "
	        "defaults to one because the 0th axis most likely describes "
	        "the batch_size")
	    .Arg(
	        "axis_w",
	        "(int32_t) default to 1; describes the axis of the weight matrix W; "
	        "defaults to one because the 0th axis most likely describes "
	        "the output size N")
	    .Arg("float16_compute", "Whether to use float-16 compute kernel")
	    .Input(
	        0,
	        "X",
	        "input tensor that's coerced into a 2D matrix of size (MxK) "
	        "as described above")
	    .Input(
	        1,
	        "W",
	        "A tensor that is coerced into a 2D blob of size (NxK) "
	        "containing fully connected weight matrix")
	    .Input(2, "b", "1D blob containing bias vector")
	    .Output(0, "Y", "2D output tensor")
	    .InheritOnnxSchema("Gemm");

	// Schema for FCGradient: three inputs and two or three outputs. Shape and cost
	// inference are bound with pretransposed_weight = false.
	OPERATOR_SCHEMA(FCGradient)
	.NumInputs(3)
	.NumOutputs(2, 3)
	.TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
	.CostInferenceFunction(
	std::bind(CostInferenceForFCGradient, _1, _2, false));
	// Schema for FCTransposedGradient: three inputs and two or three outputs.
	// Shape and cost inference are bound with pretransposed_weight = true, the
	// same flag the forward FCTransposed schema uses, so that N (and therefore the
	// db shape and the cost figures) is taken from the dimensions of W at and
	// after axis_w rather than from the ones before it.
	OPERATOR_SCHEMA(FCTransposedGradient)
	    .NumInputs(3)
	    .NumOutputs(2, 3)
	    .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, true))
	    .CostInferenceFunction(
	        std::bind(CostInferenceForFCGradient, _1, _2, true));

	namespace {

	// Gradient maker for FC and FCTransposed.
	//
	// Emits a single gradient operator whose type is the forward operator's type
	// with "Gradient" appended. It consumes the forward inputs 0 and 1 plus the
	// gradient of forward output 0, and produces the gradients of forward inputs
	// 1, 2 and 0, in that order.
	class GetFCGradient : public GradientMakerBase {
	  using GradientMakerBase::GradientMakerBase;

	  std::vector<OperatorDef> GetGradientDefs() override {
	    CAFFE_ENFORCE_EQ(def_.input_size(), 3);
	    // Only the two forward operator types registered with this maker are
	    // accepted.
	    CAFFE_ENFORCE(def_.type() == "FC" || def_.type() == "FCTransposed");
	    return SingleGradientDef(
	        def_.type() + "Gradient",
	        "",
	        vector<string>{I(0), I(1), GO(0)},
	        vector<string>{GI(1), GI(2), GI(0)});
	  }
	};

	// Both forward operators share one gradient maker; it derives the gradient
	// operator's type from the forward operator's type.
	REGISTER_GRADIENT(FC, GetFCGradient);
	REGISTER_GRADIENT(FCTransposed, GetFCGradient);

	} // namespace

	} // namespace caffe2