| #include "caffe2/operators/lengths_reducer_ops.h" | 
 | #include "caffe2/core/context.h" | 
 | #include "caffe2/core/operator.h" | 
 | #include "caffe2/operators/segment_reduction_op.h" | 
 | #include "caffe2/utils/math.h" | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 | // Use _STR option because the schema is declared using _STR version too in | 
 | // generic fashion. Otherwise it'd break schema declaration check. | 
 | // TODO(dzhulgakov): remove _STR when all lengths ops are off generic version. | 
 |  | 
 | using SparseLengthsSumOp = | 
 |     // NOLINTNEXTLINE(modernize-use-bool-literals) | 
 |     CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 0, 0>; | 
 | using SparseLengthsWeightedSumOp = | 
 |     // NOLINTNEXTLINE(modernize-use-bool-literals) | 
 |     CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 1, 0>; | 
 | using SparseLengthsMeanOp = | 
 |     // NOLINTNEXTLINE(modernize-use-bool-literals) | 
 |     CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 0, 1>; | 
 |  | 
 | REGISTER_CPU_OPERATOR(SparseLengthsSum, SparseLengthsSumOp); | 
 | REGISTER_CPU_OPERATOR(SparseLengthsWeightedSum, SparseLengthsWeightedSumOp); | 
 | REGISTER_CPU_OPERATOR(SparseLengthsMean, SparseLengthsMeanOp); | 
 |  | 
 | OPERATOR_SCHEMA(SparseLengthsPositionalWeightedSum) | 
 |     .NumInputs(4) | 
 |     .NumOutputs(1) | 
 |     .SetDoc(R"DOC( | 
 | Variation of SparseLengthsWeightedSum operator, where, for each row, | 
 | weights are accessed by indices [0..L-1], where L is the length of given row. | 
 | This is basically a fused operator of LengthsRangeFill + Gather + | 
 | SparseWeightedSum | 
 | )DOC") | 
 |     .Input( | 
 |         0, | 
 |         "DATA", | 
 |         "uint8 tensor obtained with " | 
 |         "operator FloatToRowwiseQuantized8Bits") | 
 |     .Input( | 
 |         1, | 
 |         "WEIGHT", | 
 |         "Scalar multipliers for the input slices. Must " | 
 |         "be a vector with the length matching the length of DATA") | 
 |     .Input( | 
 |         2, | 
 |         "INDICES", | 
 |         "Integer vector containing indices of the first " | 
 |         "dimension of DATA for the slices that are being aggregated") | 
 |     .Input( | 
 |         3, | 
 |         "LENGTHS", | 
 |         "Vector with the same sum of elements as the first dimension of DATA") | 
 |     .Output(0, "output", "output"); | 
 |  | 
 | REGISTER_CPU_OPERATOR_STR( | 
 |     "SparseLengthsPositionalWeightedSum", | 
 |     CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 1, 0, 1>); | 
 |  | 
 | template <typename Def> | 
 | string FormatDoc() { | 
 |   string doc = Def::doc; | 
 |   c10::ReplaceAll(doc, "{op}", Def::OpDef::name); | 
 |   c10::ReplaceAll(doc, "{op_doc}", Def::OpDef::doc); | 
 |   auto replaced = c10::ReplaceAll(doc, "{extra}", ""); | 
 |   CAFFE_ENFORCE_EQ(replaced, 0); | 
 |   return doc; | 
 | } | 
 |  | 
 | using SparseLengthsSumDef = AbstractSparseLengthsDef< | 
 |     float, | 
 |     int, | 
 |     CPUContext, | 
 |     SumReducerDef, | 
 |     true /*GradientNeedIndices*/>; | 
 | OPERATOR_SCHEMA(SparseLengthsSum) | 
 |     .NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .ValueKeyLengthInputFillers( | 
 |         SparseLengthsSumOp::DATA, | 
 |         SparseLengthsSumOp::INDICES, | 
 |         SparseLengthsSumOp::LENGTHS) | 
 |     .SetDoc(FormatDoc<SparseLengthsSumDef>()) | 
 |     .Output(0, "OUTPUT", "Aggregated tensor") | 
 |     .FillUsing(SparseLengthsSumDef::PopulateSchema) | 
 |     .InheritOnnxSchema(); | 
 | REGISTER_CPU_OPERATOR( | 
 |     SparseLengthsSumGradient, | 
 |     SparseLengthsSumDef::BackwardOp); | 
 | OPERATOR_SCHEMA(SparseLengthsSumGradient) | 
 |     .NumInputs(SparseLengthsSumDef::BackwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .DisallowInputFillers(); | 
 | REGISTER_GRADIENT(SparseLengthsSum, SparseLengthsSumDef::GetGradient) | 
 |  | 
 | REGISTER_CPU_OPERATOR( | 
 |     TTSparseLengthsSum, | 
 |     TTSparseLengthsSumOp<float, CPUContext>); | 
 | REGISTER_CPU_OPERATOR( | 
 |     TTSparseLengthsSumGradient, | 
 |     TTSparseLengthsSumGradientOp<float, CPUContext>); | 
 |  | 
 | OPERATOR_SCHEMA(TTSparseLengthsSum) | 
 |     .NumInputs(5) | 
 |     .NumOutputs(4) | 
 |     .SetDoc(R"DOC( | 
 | This operator introduce a new, parameter efficient embedding layer, termed TT embedding, which | 
 | can be plugged in into any model and trained end-to-end. The benefits of our compressed TT layer | 
 | are twofold. Firstly, instead of storing huge embedding matrix, it stores a sequence of much smaller | 
 | 2-dimensional and 3-dimensional tensors, necessary for reconstructing the required embeddings, | 
 | which allows compressing the model significantly at the cost of a negligible performance drop. | 
 | Secondly, the overall number of parameters can be relatively small (and constant) during the whole | 
 | training stage, which allows to use larger batches or train efficiently in a case of limited resources. | 
 | )DOC") | 
 |     .Arg("factor_i", "vector<int>: factorization of voc size") | 
 |     .Arg("factor_j", "vector<int>: factorization of emb size") | 
 |     .Arg("ranks", "int[] Ranks of cores") | 
 |     .Arg("emb_size", "int: the size of each embedding entry") | 
 |     .Input(0, "core0", "tensor core 0") | 
 |     .Input(1, "core1", "tensor core 1") | 
 |     .Input(2, "core2", "tensor core 2") | 
 |     .Input(3, "index", "index for embedding") | 
 |     .Input(4, "lengths", "segment lengths") | 
 |     .Output(0, "OUTPUT", "Aggregated tensor") | 
 |     .Output( | 
 |         1, | 
 |         "core0_output", | 
 |         "intermediate mm result from core0 for backward path") | 
 |     .Output( | 
 |         2, | 
 |         "core1_output", | 
 |         "intermediate mm result from core1 for backward path") | 
 |     .Output(3, "indices", "the index for each core"); | 
 |  | 
 | using SparseLengthsWeightedSumDef = AbstractSparseLengthsDef< | 
 |     float, | 
 |     int, | 
 |     CPUContext, | 
 |     WeightedSumReducerDef, | 
 |     true /*GradientNeedIndices*/>; | 
 | OPERATOR_SCHEMA(SparseLengthsWeightedSum) | 
 |     .NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .WeightedValueKeyLengthInputFillers( | 
 |         SparseLengthsWeightedSumOp::DATA, | 
 |         SparseLengthsWeightedSumOp::INDICES, | 
 |         SparseLengthsWeightedSumOp::LENGTHS, | 
 |         SparseLengthsWeightedSumOp::WEIGHT) | 
 |     .SetDoc(FormatDoc<SparseLengthsWeightedSumDef>()) | 
 |     .Output(0, "OUTPUT", "Aggregated tensor") | 
 |     .FillUsing(SparseLengthsWeightedSumDef::PopulateSchema) | 
 |     .InheritOnnxSchema(); | 
 | REGISTER_CPU_OPERATOR( | 
 |     SparseLengthsWeightedSumGradient, | 
 |     SparseLengthsWeightedSumDef::BackwardOp); | 
 | OPERATOR_SCHEMA(SparseLengthsWeightedSumGradient) | 
 |     .NumInputs(SparseLengthsWeightedSumDef::BackwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .DisallowInputFillers(); | 
 | REGISTER_GRADIENT( | 
 |     SparseLengthsWeightedSum, | 
 |     SparseLengthsWeightedSumDef::GetGradient) | 
 |  | 
 | using SparseLengthsMeanDef = AbstractSparseLengthsDef< | 
 |     float, | 
 |     int, | 
 |     CPUContext, | 
 |     MeanReducerDef, | 
 |     true /*GradientNeedIndices*/>; | 
 | OPERATOR_SCHEMA(SparseLengthsMean) | 
 |     .NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .ValueKeyLengthInputFillers( | 
 |         SparseLengthsMeanOp::DATA, | 
 |         SparseLengthsMeanOp::INDICES, | 
 |         SparseLengthsMeanOp::LENGTHS) | 
 |     .SetDoc(FormatDoc<SparseLengthsMeanDef>()) | 
 |     .Output(0, "OUTPUT", "Aggregated tensor") | 
 |     .FillUsing(SparseLengthsMeanDef::PopulateSchema); | 
 | REGISTER_CPU_OPERATOR( | 
 |     SparseLengthsMeanGradient, | 
 |     SparseLengthsMeanDef::BackwardOp); | 
 | OPERATOR_SCHEMA(SparseLengthsMeanGradient) | 
 |     .NumInputs(SparseLengthsMeanDef::BackwardOp::kNumInputs) | 
 |     .NumOutputs(1) | 
 |     .DisallowInputFillers(); | 
 | REGISTER_GRADIENT(SparseLengthsMean, SparseLengthsMeanDef::GetGradient) | 
 |  | 
 | // NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-avoid-magic-numbers) | 
 | OPERATOR_SCHEMA(TTSparseLengthsSumGradient).NumInputs(8).NumOutputs(3); | 
 |  | 
 | class GetTTSparseLengthsGradient : public GradientMakerBase { | 
 |   using GradientMakerBase::GradientMakerBase; | 
 |   vector<OperatorDef> GetGradientDefs() override { | 
 |     // set up the input and output | 
 |     return SingleGradientDef( | 
 |         "TTSparseLengthsSumGradient", | 
 |         "", | 
 |         // CORE0, CORE1, CORE2, LENGTHS, CORE0_output, CORE1_output, | 
 |         // indices, dY | 
 |         vector<string>{ | 
 |             I(0), I(1), I(2), I(4), O(1), O(2), O(3), GO(0)}, | 
 |         // dCore0, dCore1, dCore2 | 
 |         vector<string>{GI(0), GI(1), GI(2)}); | 
 |   } | 
 | }; | 
 |  | 
 | REGISTER_GRADIENT(TTSparseLengthsSum, GetTTSparseLengthsGradient) | 
 |  | 
 | } // namespace caffe2 |