|  | #include "caffe2/operators/fully_connected_op.h" | 
|  |  | 
|  | #include <functional> | 
|  |  | 
|  | #include "caffe2/operators/fc_inference.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
// CPU registrations for the fully-connected operator and its gradient.
// FC computes Y = X * W^T + b (see OPERATOR_SCHEMA(FC) below).
REGISTER_CPU_OPERATOR(FC, FullyConnectedOp<CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    FCGradient,
    FullyConnectedGradientOp<CPUContext>);

// FCTransposed: same computation, but the weight matrix is supplied already
// transposed, so the op is instantiated with the transpose-weight template
// flag set to false.
REGISTER_CPU_OPERATOR(
    FCTransposed,
    FullyConnectedOp<
        CPUContext,
        DefaultEngine,
        false /* don't transpose weight */>);
REGISTER_CPU_GRADIENT_OPERATOR(
    FCTransposedGradient,
    FullyConnectedGradientOp<
        CPUContext,
        DefaultEngine,
        false /* don't transpose weight */>);
|  |  | 
|  | using namespace std::placeholders; | 
|  | OPERATOR_SCHEMA(FCTransposed) | 
|  | .NumInputs(3) | 
|  | .NumOutputs(1) | 
|  | // NOLINTNEXTLINE(modernize-avoid-bind) | 
|  | .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, true)) | 
|  | // NOLINTNEXTLINE(modernize-avoid-bind) | 
|  | .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, true)) | 
|  | .SetDoc(R"DOC( | 
|  | Same as FC, but weight matrix is supposed to be already pretransposed. | 
|  | FCTransposed stands for calling blass with no noTrans, noTrans | 
|  | )DOC") | 
|  | .InheritOnnxSchema(); | 
|  |  | 
// Schema for FC: Y = X * W^T + b. The trailing `false` passed to the
// inference helpers indicates the weight is in the regular (not
// pre-transposed) layout, matching the FC registration above.
OPERATOR_SCHEMA(FC)
    .NumInputs(3)
    .NumOutputs(1)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .CostInferenceFunction(std::bind(CostInferenceForFC, _1, _2, false))
    .SetDoc(R"DOC(
The FC operator computes an output $(Y)$ as a linear combination of the input data blob $(X)$ with a weight blob $(W)$ and bias blob $(b)$. More formally,

$$Y = XW^T+b$$

Here, $X$ is a matrix of shape $(M,K)$, $W$ is a matrix of shape $(N,K)$, $b$ is a vector of length $N$, and $Y$ is a matrix of shape $(M,N)$. $N$ can be thought of as the number of nodes in the layer, $M$ is the batch size, and $K$ is the number of features in an input observation.

*NOTE: $X$ does not need to explicitly be a 2-dimensional matrix, however, if it is not it will be coerced into one. For an arbitrary $n$-dimensional tensor $X$, e.g. $[a_0, a_1, \ldots ,a_{k-1}, a_k, \ldots , a_{n-1}]$, where $a_i$ in $N$, and $k$ is the $axis$ arg provided, then $X$ will be coerced into a 2-dimensional tensor with dimensions $[a_0 * \ldots * a_{k-1}, a_k * \ldots * a_{n-1}]$. For the default case where axis=1, this means the $X$ tensor will be coerced into a 2D tensor of dimensions $[a_0, a_1 * \ldots * a_{n-1}]$, where $a_0$ is often the batch size. In this situation, we must have $a_0 = M$ and $a_1 * \ldots * a_{n-1} = K$. Lastly, even though $b$ is a vector of length $N$, it is copied and resized to shape $(M x N)$ implicitly, then added to each vector in the batch.*

Github Links:
- https://github.com/pytorch/pytorch/blob/main/caffe2/operators/fully_connected_op.h
- https://github.com/pytorch/pytorch/blob/main/caffe2/operators/fully_connected_op.cc

<details>

<summary> <b>Example</b> </summary>

**Code**

```

// In this example, our batch size is 1 (M=1), the input observation will have
//   6 features (K=6), and the layer will have one hidden node (N=1). The
//   expected output is Y=7.
workspace.ResetWorkspace()

op = core.CreateOperator(
"FC",
["X", "W", "b"],
["Y"]
)

// Create X: MxK
data = np.array([1,2,3,4,5,6]).astype(np.float32)
data = data[np.newaxis,:]

// Create W: NxK
weights = np.array(np.array([1,1/2.,1/3.,1/4.,1/5.,1/6.])).astype(np.float32)
weights = weights[np.newaxis,:]

// Create b: N
bias = np.array([1.]).astype(np.float32)

// Put the inputs into the workspace
workspace.FeedBlob("X", data)
workspace.FeedBlob("W", weights)
workspace.FeedBlob("b", bias)

// Run the operator
workspace.RunOperatorOnce(op)
print("Y:\n", workspace.FetchBlob("Y"))

```

**Result**

```

Y:
[[7.]]

```

</details>

)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Describes the axis of the input data $X$. Defaults to one because in the common case when the input $X$ has shape $(M,K)$, the first axis encodes the batch size.")
    .Arg(
        "axis_w",
        "*(type: int; default: 1)* Describes the axis of the input weight matrix $W$. Defaults to one because the first axis most likely describes the batch_size.")
    .Arg(
        "float16_compute",
        "*(type: bool; default: False)* Whether to use float-16 compute kernel.")
    .Input(
        0,
        "X",
        "Input blob to be coerced into a 2D matrix of shape $(M,K)$, where $M$ is the batch size and $K$ is the number of features in a single observation.")
    .Input(
        1,
        "W",
        "Input blob to be coerced into a 2D matrix of shape $(N,K)$ describing a fully connected weight matrix. Here, $K$ is the number of features in a single observation and $N$ is the number of nodes in the FC layer.")
    .Input(
        2,
        "b",
        "Input blob containing vector of length $N$ which describes one bias for each node in the layer.")
    .Output(
        0,
        "Y",
        "Output blob containing a 2D output matrix of shape $(M,N)$, where $M$ is the batch size and $N$ is the number of nodes in the layer. The output is calculated as $Y=XW^T+b$.")
    // Maps onto the ONNX Gemm operator when exporting.
    .InheritOnnxSchema("Gemm");
|  |  | 
// Gradient of FC: inputs are (X, W, dY); outputs are (dW, db) and optionally
// dX. The trailing `false` tells the inference helpers the weight is in the
// regular (not pre-transposed) layout, matching the forward FC schema.
GRADIENT_OPERATOR_SCHEMA(FCGradient)
    .NumInputs(3)
    .NumOutputs(2, 3)
    // NOLINTNEXTLINE(modernize-avoid-bind)
    .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false))
    .CostInferenceFunction(
        // NOLINTNEXTLINE(modernize-avoid-bind)
        std::bind(CostInferenceForFCGradient, _1, _2, false));
|  | GRADIENT_OPERATOR_SCHEMA(FCTransposedGradient) | 
|  | .NumInputs(3) | 
|  | .NumOutputs(2, 3) | 
|  | // NOLINTNEXTLINE(modernize-avoid-bind) | 
|  | .TensorInferenceFunction(std::bind(FCGradientShapeInference, _1, _2, false)) | 
|  | .CostInferenceFunction( | 
|  | // NOLINTNEXTLINE(modernize-avoid-bind) | 
|  | std::bind(CostInferenceForFCGradient, _1, _2, false)); | 
|  |  | 
|  | namespace { | 
|  |  | 
|  | class GetFCGradient : public GradientMakerBase { | 
|  | using GradientMakerBase::GradientMakerBase; | 
|  |  | 
|  | std::vector<OperatorDef> GetGradientDefs() override { | 
|  | CAFFE_ENFORCE_EQ(def_.input_size(), 3); | 
|  | CAFFE_ENFORCE(def_.type() == "FC" || def_.type() == "FCTransposed"); | 
|  | return SingleGradientDef( | 
|  | def_.type() + "Gradient", | 
|  | "", | 
|  | vector<string>{I(0), I(1), GO(0)}, | 
|  | vector<string>{GI(1), GI(2), GI(0)}); | 
|  | } | 
|  | }; | 
|  |  | 
|  | REGISTER_GRADIENT(FC, GetFCGradient); | 
|  | REGISTER_GRADIENT(FCTransposed, GetFCGradient); | 
|  |  | 
|  | } // namespace | 
|  |  | 
|  | } // namespace caffe2 |