|  | #include "caffe2/operators/softmax_op.h" | 
|  |  | 
|  | #include "caffe2/operators/softmax_utils.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | // Implementation for the CPU context. | 
|  | template <> | 
|  | bool SoftmaxOp<float, CPUContext>::RunOnDevice() { | 
|  | const auto& X = Input(0); | 
|  | const int canonical_axis = X.canonical_axis_index(axis_); | 
|  | const int N = X.size_to_dim(canonical_axis); | 
|  | const int D = X.size_from_dim(canonical_axis); | 
|  | auto* Y = Output(0, X.sizes(), at::dtype<float>()); | 
|  | const float* X_data = X.data<float>(); | 
|  | float* Y_data = Y->mutable_data<float>(); | 
|  | if (N == 0 || D == 0) { | 
|  | return true; | 
|  | } | 
|  | if (!scale_.defined()) { | 
|  | scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU)); | 
|  | } else if (scale_.numel() != N) { | 
|  | scale_.Resize(N); | 
|  | } | 
|  | softmax_utils::SoftmaxCPU<float>( | 
|  | N, D, false, X_data, Y_data, scale_.mutable_data<float>(), &context_); | 
|  | return true; | 
|  | } | 
|  |  | 
|  | // Implementation for the CPU context. | 
|  | template <> | 
|  | bool SoftmaxGradientOp<float, CPUContext>::RunOnDevice() { | 
|  | auto& Y = Input(0); | 
|  | auto& dY = Input(1); | 
|  |  | 
|  | const auto canonical_axis = Y.canonical_axis_index(axis_); | 
|  | const int64_t N = Y.size_to_dim(canonical_axis); | 
|  | const int64_t D = Y.size_from_dim(canonical_axis); | 
|  | // First, get scales | 
|  | if (!scale_.defined()) { | 
|  | scale_ = caffe2::empty({N}, at::dtype<float>().device(CPU)); | 
|  | } else if (scale_.numel() != N) { | 
|  | scale_.Resize(N); | 
|  | } | 
|  |  | 
|  | if (!sum_multiplier_.defined()) { | 
|  | sum_multiplier_ = caffe2::empty({D}, at::dtype<float>().device(CPU)); | 
|  | math::Set<float, CPUContext>( | 
|  | D, 1.f, sum_multiplier_.mutable_data<float>(), &context_); | 
|  | } else if (sum_multiplier_.numel() != D) { | 
|  | sum_multiplier_.Resize(D); | 
|  | math::Set<float, CPUContext>( | 
|  | D, 1.f, sum_multiplier_.mutable_data<float>(), &context_); | 
|  | } | 
|  |  | 
|  | auto* dX = Output(0, Y.sizes(), at::dtype<float>()); | 
|  | const float* Ydata = Y.data<float>(); | 
|  | const float* dYdata = dY.data<float>(); | 
|  | float* dXdata = dX->mutable_data<float>(); | 
|  | if (N == 0 || D == 0) { | 
|  | return true; | 
|  | } | 
|  | context_.CopySameDevice<float>(Y.numel(), dYdata, dXdata); | 
|  | float* scaledata = scale_.mutable_data<float>(); | 
|  | for (int i = 0; i < N; ++i) { | 
|  | math::Dot<float, CPUContext>( | 
|  | D, Ydata + i * D, dYdata + i * D, scaledata + i, &context_); | 
|  | } | 
|  | math::Gemm<float, CPUContext>( | 
|  | CblasNoTrans, | 
|  | CblasNoTrans, | 
|  | N, | 
|  | D, | 
|  | 1, | 
|  | -1, | 
|  | scaledata, | 
|  | sum_multiplier_.data<float>(), | 
|  | 1, | 
|  | dXdata, | 
|  | &context_); | 
|  | math::Mul<float, CPUContext>(Y.numel(), dXdata, Ydata, dXdata, &context_); | 
|  | return true; | 
|  | } | 
|  |  | 
// Register the float CPU specializations defined above under the operator
// names Softmax and SoftmaxGradient.
REGISTER_CPU_OPERATOR(Softmax, SoftmaxOp<float, CPUContext>);
REGISTER_CPU_GRADIENT_OPERATOR(
    SoftmaxGradient,
    SoftmaxGradientOp<float, CPUContext>);
|  |  | 
// Operator schema for Softmax: exactly one input (X) and one output (Y), with
// the output declared to have the same type and shape as the input. The DOC
// raw string is the user-facing documentation text passed to SetDoc; the Arg /
// Input / Output entries describe the `axis` argument and the two tensors.
// NOTE(review): InheritOnnxSchema() presumably ties this schema to the ONNX
// operator of the same name -- confirm against operator_schema.h.
OPERATOR_SCHEMA(Softmax)
    .NumInputs(1)
    .NumOutputs(1)
    .IdenticalTypeAndShape()
    .SetDoc(R"DOC(

Applies the Softmax function to an n-dimensional input Tensor rescaling them so
that the elements of the n-dimensional output Tensor lie in the range (0,1) and
sum to 1. The softmax operator is typically the last layer in a classifier network,
as its output can be interpreted as confidence probabilities of an input belonging
to each class. The input is a 2-D tensor (Tensor) of size (batch_size x
input_feature_dimensions). The output tensor has the same shape and contains the
softmax normalized values of the corresponding input. The softmax function is
defined as follows:

$$softmax(x_i) = \frac{\exp(x_i)}{\sum_{j} \exp(x_j)}$$

The input does not need to explicitly be a 2D vector; rather, it will be coerced
into one. For an arbitrary n-dimensional tensor `X` in
$[a_0, a_1, ..., a_{k-1}, a_k, ..., a_{n-1}]$, where k is the `axis` provided,
then `X` will be coerced into a 2-dimensional tensor with dimensions
$[(a_0 * ... * a_{k-1}), (a_k * ... * a_{n-1})]$. For the default case where
`axis`=1, the `X` tensor will be coerced into a 2D tensor of dimensions
$[a_0, (a_1 * ... * a_{n-1})]$, where $a_0$ is often the batch size. In this
situation, we must have $a_0 = N$ and $a_1 * ... * a_{n-1} = D$. Each of these
dimensions must be matched correctly, or else the operator will throw errors.

Github Links:

- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.h
- https://github.com/pytorch/pytorch/blob/master/caffe2/operators/softmax_op.cc


<details>

<summary> <b>Example</b> </summary>

**Code**

```
workspace.ResetWorkspace()

op = core.CreateOperator(
"Softmax",
["X"],
["Y"]
)

workspace.FeedBlob("X", np.random.randn(1, 5).astype(np.float32))
print("input:", workspace.FetchBlob("X"))
workspace.RunOperatorOnce(op)
print("softmax:", workspace.FetchBlob("Y"))

```

**Result**

```
input: [[ 0.0417839   0.61960053 -0.23150268 -0.64389366 -3.0000346 ]]
softmax: [[0.24422921 0.43525138 0.18582782 0.12303016 0.01166145]]

```

</details>



)DOC")
    .Arg(
        "axis",
        "*(type: int; default: 1)* Axis of the inputs when coerced to 2D matrix.")
    .Input(
        0,
        "X",
        "*(type: Tensor`<float>`)* Input tensor that's coerced into a 2D matrix of size (NxD) as described above.")
    .Output(
        0,
        "Y",
        "*(type: Tensor`<float>`)* The softmax normalized output tensor with the same shape as input tensor.")
    .InheritOnnxSchema();
|  |  | 
// Schema for SoftmaxGradient.
// Inputs:  Y (the forward softmax output) and dY (the gradient w.r.t. Y).
// Output:  dX (the gradient w.r.t. the forward input).
GRADIENT_OPERATOR_SCHEMA(SoftmaxGradient).NumInputs(2).NumOutputs(1);
|  |  | 
|  | class GetSoftmaxGradient : public GradientMakerBase { | 
|  | using GradientMakerBase::GradientMakerBase; | 
|  | vector<OperatorDef> GetGradientDefs() override { | 
|  | return SingleGradientDef( | 
|  | def_.type() + "Gradient", | 
|  | "", | 
|  | vector<string>{O(0), GO(0)}, | 
|  | vector<string>{GI(0)}); | 
|  | } | 
|  | }; | 
// Use GetSoftmaxGradient as the gradient maker for both Softmax and
// SoftmaxFp16. The maker builds the gradient op type as def_.type() +
// "Gradient", so (assuming def_.type() is the forward op's registered name)
// these resolve to SoftmaxGradient and SoftmaxFp16Gradient respectively.
REGISTER_GRADIENT(Softmax, GetSoftmaxGradient);
REGISTER_GRADIENT(SoftmaxFp16, GetSoftmaxGradient);
|  |  | 
|  | } // namespace caffe2 |