|  | #include "caffe2/sgd/rowwise_adagrad_fused.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  | /* Schema: fused SparseLengthsSum gradient + row-wise sparse Adagrad. NumInputs(6,7) makes input 6 (counter) optional; both outputs fall under EnforceOneToOneInplace(). */ | 
|  | OPERATOR_SCHEMA(RowWiseSparseAdagradFusedWithSparseLengthsSumGradient) | 
|  | .NumInputs(6,7) | 
|  | .NumOutputs(2) | 
|  | .EnforceOneToOneInplace() | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Fused operator of | 
|  | SparseLengthsIndicesInGradientSumGradient (gradient of SparseLengthsSum) + | 
|  | RowWiseSparseAdagrad. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. Additional input (lengths) is for fused | 
|  | SparseLengthsSumGradient operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input( | 
|  | 2, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(3, "grad", "Gradient computed") | 
|  | .Input(4, "lr", "learning rate") | 
|  | .Input( | 
|  | 5, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 6, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Arg( | 
|  | "round_option", | 
|  | "rounding option: 0 for nearest rounding, 1 for stochastic rounding") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration. Template arguments are <float, float, int, rowwise_adagrad_update_inlined, is_mean=false>; the op class template is declared outside this file. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradient, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/false>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradient, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/false>); | 
|  |  | 
|  | // Registered under the Approx name to match the GPU Approx op; on CPU the Approx and exact | 
|  | // RowWiseSparseAdagradFusedWithSparseLengthsSumGradient ops register the same operator class. | 
|  | OPERATOR_SCHEMA(RowWiseSparseAdagradFusedWithSparseLengthsSumGradientApprox) | 
|  | .NumInputs(6,7) | 
|  | .NumOutputs(2) | 
|  | .EnforceOneToOneInplace() | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Fused operator of | 
|  | SparseLengthsIndicesInGradientSumGradient (gradient of SparseLengthsSum) + | 
|  | RowWiseSparseAdagrad. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. Additional input (lengths) is for fused | 
|  | SparseLengthsSumGradient operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input( | 
|  | 2, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(3, "grad", "Gradient computed") | 
|  | .Input(4, "lr", "learning rate") | 
|  | .Input( | 
|  | 5, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 6, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Arg( | 
|  | "round_option", | 
|  | "rounding option: 0 for nearest rounding, 1 for stochastic rounding") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration: same SumGradientOp class and template arguments (is_mean=false) as the non-Approx operator name. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientApprox, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/false>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientApprox, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/false>); | 
|  | /* Schema: fused SparseLengthsMean gradient + row-wise sparse Adagrad. NumInputs(6,7) makes input 6 (counter) optional. Args are kept identical to the MeanGradientApprox schema since both names register the same operator class. */ | 
|  | OPERATOR_SCHEMA(RowWiseSparseAdagradFusedWithSparseLengthsMeanGradient) | 
|  | .NumInputs(6,7) | 
|  | .NumOutputs(2) | 
|  | .EnforceOneToOneInplace() | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Fused operator of | 
|  | SparseLengthsIndicesInGradientMeanGradient (gradient of SparseLengthsMean) + | 
|  | RowWiseSparseAdagrad. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. Additional input (lengths) is for fused | 
|  | SparseLengthsMeanGradient operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input( | 
|  | 2, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(3, "grad", "Gradient computed") | 
|  | .Input(4, "lr", "learning rate") | 
|  | .Input( | 
|  | 5, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 6, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Arg( | 
|  | "round_option", | 
|  | "rounding option: 0 for nearest rounding, 1 for stochastic rounding") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration. Reuses the SumGradientOp class template with is_mean=true; the remaining template arguments match the Sum registrations. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsMeanGradient, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/true>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsMeanGradient, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/true>); | 
|  |  | 
|  | // Registered under the Approx name to match the GPU Approx op; on CPU the Approx and exact | 
|  | // RowWiseSparseAdagradFusedWithSparseLengthsMeanGradient ops register the same operator class. | 
|  | OPERATOR_SCHEMA(RowWiseSparseAdagradFusedWithSparseLengthsMeanGradientApprox) | 
|  | .NumInputs(6,7) | 
|  | .NumOutputs(2) | 
|  | .EnforceOneToOneInplace() | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Fused operator of | 
|  | SparseLengthsIndicesInGradientMeanGradient (gradient of SparseLengthsMean) + | 
|  | RowWiseSparseAdagrad. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. Additional input (lengths) is for fused | 
|  | SparseLengthsMeanGradient operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input( | 
|  | 2, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(3, "grad", "Gradient computed") | 
|  | .Input(4, "lr", "learning rate") | 
|  | .Input( | 
|  | 5, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 6, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Arg( | 
|  | "round_option", | 
|  | "rounding option: 0 for nearest rounding, 1 for stochastic rounding") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration: same SumGradientOp class and template arguments (is_mean=true) as the non-Approx Mean operator name. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsMeanGradientApprox, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/true>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsMeanGradientApprox, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined, | 
|  | /*is_mean=*/true>); | 
|  | /* Schema: fused SparseLengthsWeightedSum gradient + row-wise sparse Adagrad. NumInputs(7,8) makes input 7 (counter) optional; EnforceInplace pairs only param (0) and moment (1), while aux_grad is output 2. NOTE(review): input 2 is described as being updated, yet no in-place pair or updated output is declared for it here - confirm the wording. */ | 
|  | OPERATOR_SCHEMA(RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradient) | 
|  | .NumInputs(7,8) | 
|  | .NumOutputs(3) | 
|  | .EnforceInplace({{0, 0}, {1, 1}}) | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Fused operator of SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient | 
|  | (gradient of SparseLengthsWeightedSum) + RowWiseSparseAdagrad, where weights are | 
|  | positional weights computed with LengthsRangeFill + Gather pattern. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. | 
|  | There're auxiliary inputs (aux_param) for which gradient is computed and | 
|  | returns (aux_grad). | 
|  | Yet additional input (lengths) is for fused SparseLengthsWeightedSumGradient | 
|  | operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input(2, "aux_param", "Auxiliary parameters to be updated") | 
|  | .Input( | 
|  | 3, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(4, "grad", "Gradient computed") | 
|  | .Input(5, "lr", "learning rate") | 
|  | .Input( | 
|  | 6, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 7, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Output(2, "aux_grad", "Auxiliary gradient") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration. Template arguments are <float, float, int, rowwise_adagrad_update_inlined>; this class template takes no is_mean argument here. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradient, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradient, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined>); | 
|  | /* Schema: Approx variant of the fused SparseLengthsWeightedSum gradient + row-wise sparse Adagrad op. Input/output layout matches the non-Approx schema (7 or 8 inputs, 3 outputs, in-place pairs for param and moment only). NOTE(review): input 2 is described as being updated, yet no in-place pair or updated output is declared for it here - confirm the wording. */ | 
|  | OPERATOR_SCHEMA( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApprox) | 
|  | .NumInputs(7,8) | 
|  | .NumOutputs(3) | 
|  | .EnforceInplace({{0, 0}, {1, 1}}) | 
|  | .SetDoc(R"DOC( | 
|  |  | 
|  | Approximately fused operator of | 
|  | SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient | 
|  | (gradient of SparseLengthsWeightedSum) + RowWiseSparseAdagrad, where weights are | 
|  | positional weights computed with LengthsRangeFill + Gather pattern. | 
|  |  | 
|  | Given inputs (param, moment, indices, grad, lr), runs the row-wise sparse | 
|  | AdaGrad update on (param, grad, moment[indices], lr), and returns (new_param, | 
|  | new_moment) as in the dense case. | 
|  | There's race condition w.r.t. ordering between reading params and writing to | 
|  | param, hence the name Approx. | 
|  | There're auxiliary inputs (aux_param) for which gradient is computed | 
|  | and returns (aux_grad). | 
|  | Yet additional input (lengths) is for fused SparseLengthsWeightedSumGradient | 
|  | operator. | 
|  |  | 
|  | )DOC") | 
|  | .Input(0, "param", "Parameters to be updated") | 
|  | .Input(1, "moment", "Moment history") | 
|  | .Input(2, "aux_param", "Auxiliary parameters to be updated") | 
|  | .Input( | 
|  | 3, | 
|  | "indices", | 
|  | "Integer vector containing indices of the first dimension of param for the slices that are being updated") | 
|  | .Input(4, "grad", "Gradient computed") | 
|  | .Input(5, "lr", "learning rate") | 
|  | .Input( | 
|  | 6, | 
|  | "lengths", | 
|  | "Non negative vector with sum of elements equal to indices length") | 
|  | .Input( | 
|  | 7, | 
|  | "counter", | 
|  | "Optional input when weight_decay is adjusted by frequency ignored " | 
|  | "when counter_halflife == -1") | 
|  | .Output(0, "output_param", "Updated parameters") | 
|  | .Output(1, "output_moment", "Updated moment") | 
|  | .Output(2, "aux_grad", "Auxiliary gradient") | 
|  | .Arg("epsilon", "Default 1e-5"); | 
|  | /* Default CPU registration. Unlike the Sum and Mean Approx names, this one registers a separate class whose name ends in ApproxOp, with template arguments <float, float, int, rowwise_adagrad_update_inlined>. */ | 
|  | REGISTER_CPU_OPERATOR( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApprox, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined>); | 
|  | /* Second registration of the same class and template arguments, this time under the SIMD engine. */ | 
|  | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApprox, | 
|  | SIMD, | 
|  | RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp< | 
|  | float, | 
|  | float, | 
|  | int, | 
|  | rowwise_adagrad_update_inlined>); | 
|  |  | 
|  | } // namespace caffe2 |