#include "caffe2/operators/fully_connected_op.h"
#include <functional>
#include "caffe2/operators/fc_inference.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    FCGradient,
    FullyConnectedGradientOp<CPUContext>);
REGISTER_CPU_OPERATOR(
    FCTransposed,
    FullyConnectedOp<
        CPUContext,
        DefaultEngine,
        false /* don't transpose weight */>);
REGISTER_CPU_GRADIENT_OPERATOR(
    FCTransposedGradient,
    FullyConnectedGradientOp<
        CPUContext,
        DefaultEngine,
        false /* don't transpose weight */>);
using namespace std::placeholders;
OPERATOR_SCHEMA(FCTransposed)
    .NumInputs(3)
    .NumOutputs(1)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, true))
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, true))
    .SetDoc(R"DOC(
Same as FC, but the weight matrix is expected to be already pretransposed,
i.e. stored with shape $(K,N)$ rather than $(N,K)$. FCTransposed therefore
computes $Y = XW + b$, which corresponds to calling BLAS GEMM with
noTrans, noTrans for both operands.
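
For example, a minimal sketch (mirroring the FC example below and assuming the
same caffe2.python workspace API) that feeds a weight blob of shape $(K,N)$:

```
from caffe2.python import core, workspace
import numpy as np

op = core.CreateOperator("FCTransposed", ["X", "W", "b"], ["Y"])
workspace.FeedBlob("X", np.ones((1, 6), dtype=np.float32))        # M=1, K=6
workspace.FeedBlob("W", np.full((6, 1), 1/6., dtype=np.float32))  # K=6, N=1: pretransposed
workspace.FeedBlob("b", np.array([1.], dtype=np.float32))         # N=1
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y"))  # [[2.]]
```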
)DOC")
.InheritOnnxSchema();
OPERATOR_SCHEMA(FC)
    .NumInputs(3)
    .NumOutputs(1)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
    .SetDoc(R"DOC(
The FC operator computes an output $(Y)$ as a linear combination of the input data blob $(X)$ with a weight blob $(W)$ and bias blob $(b)$. More formally,
$$Y = XW^T+b$$
Here, $X$ is a matrix of shape $(M,K)$, $W$ is a matrix of shape $(N,K)$, $b$ is a vector of length $N$, and $Y$ is a matrix of shape $(M,N)$. $N$ can be thought of as the number of nodes in the layer, $M$ is the batch size, and $K$ is the number of features in an input observation.
*NOTE: $X$ does not need to explicitly be a 2-dimensional matrix; if it is not, it will be coerced into one. For an arbitrary $n$-dimensional tensor $X$ with shape $[a_0, a_1, \ldots, a_{k-1}, a_k, \ldots, a_{n-1}]$, where $a_i \in \mathbb{N}$ and $k$ is the $axis$ arg provided, $X$ will be coerced into a 2-dimensional tensor with dimensions $[a_0 * \ldots * a_{k-1}, a_k * \ldots * a_{n-1}]$. For the default case where $axis=1$, this means the $X$ tensor will be coerced into a 2D tensor of dimensions $[a_0, a_1 * \ldots * a_{n-1}]$, where $a_0$ is often the batch size. In this situation, we must have $a_0 = M$ and $a_1 * \ldots * a_{n-1} = K$. Lastly, even though $b$ is a vector of length $N$, it is copied and resized to shape $(M \times N)$ implicitly, then added to each vector in the batch.*
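
To make the coercion concrete, here is a minimal sketch (assuming the same caffe2.python workspace API used in the example below) that feeds a 3-dimensional $X$ and relies on the default $axis=1$:

```
from caffe2.python import core, workspace
import numpy as np

# X has shape [2, 3, 4]; with axis=1 it is coerced to [2, 12], so M=2 and K=12.
op = core.CreateOperator("FC", ["X", "W", "b"], ["Y"])
workspace.FeedBlob("X", np.random.rand(2, 3, 4).astype(np.float32))
workspace.FeedBlob("W", np.random.rand(5, 12).astype(np.float32))  # N=5, K=12
workspace.FeedBlob("b", np.zeros(5, dtype=np.float32))             # N=5
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").shape)  # (2, 5)
```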
Github Links:
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/fully_connected_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/fully_connected_op.cc
<details>
<summary> <b>Example</b> </summary>
**Code**
```
# In this example, our batch size is 1 (M=1), the input observation has
# 6 features (K=6), and the layer has one hidden node (N=1). The
# expected output is Y=7.
from caffe2.python import core, workspace
import numpy as np

workspace.ResetWorkspace()

op = core.CreateOperator(
    "FC",
    ["X", "W", "b"],
    ["Y"]
)

# Create X: MxK
data = np.array([1,2,3,4,5,6]).astype(np.float32)
data = data[np.newaxis,:]

# Create W: NxK
weights = np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.]).astype(np.float32)
weights = weights[np.newaxis,:]

# Create b: N
bias = np.array([1.]).astype(np.float32)

# Put the inputs into the workspace
workspace.FeedBlob("X", data)
workspace.FeedBlob("W", weights)
workspace.FeedBlob("b", bias)

# Run the operator
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))
```
**Result**
```
Y:
[[7.]]
```
</details>
)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Describes the axis of the input data $X$. Defaults to one because in the common case when the input $X$ has shape $(M,K)$, the first axis encodes the batch size.")
    .Arg(
        "axis_w",
        "*(type: int; default: 1)* Describes the axis of the input weight matrix $W$. Defaults to one because the first axis most likely describes the batch size.")
    .Arg(
        "float16_compute",
        "*(type: bool; default: False)* Whether to use the float-16 compute kernel.")
    .Input(
        0,
        "X",
        "Input blob to be coerced into a 2D matrix of shape $(M,K)$, where $M$ is the batch size and $K$ is the number of features in a single observation.")
    .Input(
        1,
        "W",
        "Input blob to be coerced into a 2D matrix of shape $(N,K)$ describing a fully connected weight matrix. Here, $K$ is the number of features in a single observation and $N$ is the number of nodes in the FC layer.")
    .Input(
        2,
        "b",
        "Input blob containing a vector of length $N$ which describes one bias for each node in the layer.")
    .Output(
        0,
        "Y",
        "Output blob containing a 2D output matrix of shape $(M,N)$, where $M$ is the batch size and $N$ is the number of nodes in the layer. The output is calculated as $Y=XW^T+b$.")
    .InheritOnnxSchema("Gemm");
GRADIENT_OPERATOR_SCHEMA(FCGradient)
    .NumInputs(3)
    .NumOutputs(2, 3)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
    .CostInferenceFunction(
        // NOLINTNEXTLINE(modernize-avoid-bind)
        std::bind(CostInferenceForFCGradient, _1, _2, false));
GRADIENT_OPERATOR_SCHEMA(FCTransposedGradient)
    .NumInputs(3)
    .NumOutputs(2, 3)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, true))
    .CostInferenceFunction(
        // NOLINTNEXTLINE(modernize-avoid-bind)
        std::bind(CostInferenceForFCGradient, _1, _2, true));
namespace {

class GetFCGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  std::vector<OperatorDef> GetGradientDefs() override {
    CAFFE_ENFORCE_EQ(def_.input_size(), 3);
    CAFFE_ENFORCE(def_.type() == "FC" || def_.type() == "FCTransposed");
    // The gradient op takes {X, W, dY} as inputs and emits {dW, db, dX};
    // dX (GI(0)) is listed last because it is only produced when requested,
    // hence NumOutputs(2, 3) on the gradient schemas above.
    return SingleGradientDef(
        def_.type() + "Gradient",
        "",
        vector<string>{I(0), I(1), GO(0)},
        vector<string>{GI(1), GI(2), GI(0)});
  }
};

REGISTER_GRADIENT(FC, GetFCGradient);
REGISTER_GRADIENT(FCTransposed, GetFCGradient);

} // namespace
} // namespace caffe2