| #pragma once |
| |
| #include "caffe2/core/operator.h" |
| #include "caffe2/sgd/math_lp.h" |
| |
| namespace caffe2 { |
| |
| namespace internal { |
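| // Returns mean(a[i]^2) over len elements: the per-row second-moment term
| // used by the row-wise AdaGrad update. Uses an AVX path when available,
| // with a scalar FMA tail for the remainder.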
| inline float compute_square_average_inlined_(const float* a, int len) { |
| float sum = 0.0f; |
| |
| int i = 0; |
| #ifdef __AVX__ |
| constexpr int kSize = 8; |
| __m256 partial_sum = _mm256_setzero_ps(); |
| for (; i + kSize <= len; i += kSize) { |
| __m256 ai = _mm256_loadu_ps(a + i); |
| partial_sum = _mm256_add_ps(partial_sum, _mm256_mul_ps(ai, ai)); |
| } |
| // Reduce sum to 1 value |
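| // _mm256_hadd_ps adds adjacent pairs within each 128-bit lane, so after two
| // hadds each lane holds the sum of its own four elements; adding the low
| // and high lane results below yields the full horizontal sum.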
| __m256 partial_sum_2 = _mm256_hadd_ps(partial_sum, partial_sum); |
| __m256 partial_sum_3 = _mm256_hadd_ps(partial_sum_2, partial_sum_2); |
| sum = _mm_cvtss_f32(_mm256_castps256_ps128(partial_sum_3)) + |
| _mm_cvtss_f32(_mm256_extractf128_ps(partial_sum_3, 1)); |
| #endif |
| |
| for (; i < len; ++i) { |
| sum = std::fma(a[i], a[i], sum); |
| } |
| |
| return sum / len; |
| } |
| |
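| // Returns mean((a[i] + weight_decay * w[i])^2) over len elements, i.e. the
| // squared average of the gradient after folding in L2 weight decay against
| // the current weights w.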
| inline float compute_square_average_with_weight_decay_inlined_( |
| const float* a, |
| const float* w, |
| int len, |
| float weight_decay) { |
| float sum = 0.0f; |
| |
| int i = 0; |
| #ifdef __AVX__ |
| constexpr int kSize = 8; |
| __m256 partial_sum = _mm256_setzero_ps(); |
| __m256 weight_decay_v = _mm256_set1_ps(weight_decay); |
| for (; i + kSize <= len; i += kSize) { |
| __m256 ai = _mm256_loadu_ps(a + i); |
| __m256 wi = _mm256_loadu_ps(w + i); |
| #ifdef __FMA__ |
| ai = _mm256_fmadd_ps(weight_decay_v, wi, ai); |
| #else |
| ai = _mm256_add_ps(_mm256_mul_ps(weight_decay_v, wi), ai); |
| #endif |
| partial_sum = _mm256_add_ps(partial_sum, _mm256_mul_ps(ai, ai)); |
| } |
| // Reduce sum to 1 value |
| __m256 partial_sum_2 = _mm256_hadd_ps(partial_sum, partial_sum); |
| __m256 partial_sum_3 = _mm256_hadd_ps(partial_sum_2, partial_sum_2); |
| sum = _mm_cvtss_f32(_mm256_castps256_ps128(partial_sum_3)) + |
| _mm_cvtss_f32(_mm256_extractf128_ps(partial_sum_3, 1)); |
| #endif |
| |
| for (; i < len; ++i) { |
| float ai = std::fma(weight_decay, w[i], a[i]); |
| sum = std::fma(ai, ai, sum); |
| } |
| |
| return sum / len; |
| } |
| |
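| // Same as above, but the weights are fp16 (at::Half); on the AVX path they
| // are widened to fp32 with _mm256_cvtph_ps (an F16C instruction).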
| inline float compute_square_average_with_weight_decay_inlined_( |
| const float* a, |
| const at::Half* w, |
| int len, |
| float weight_decay) { |
| float sum = 0.0f; |
| |
| int i = 0; |
| #ifdef __AVX__ |
| constexpr int kSize = 8; |
| __m256 partial_sum = _mm256_setzero_ps(); |
| __m256 weight_decay_v = _mm256_set1_ps(weight_decay); |
| for (; i + kSize <= len; i += kSize) { |
| __m256 ai = _mm256_loadu_ps(a + i); |
| __m128i whi = _mm_loadu_si128(reinterpret_cast<const __m128i*>(w + i)); |
| __m256 wi = _mm256_cvtph_ps(whi); |
| #ifdef __FMA__ |
| ai = _mm256_fmadd_ps(weight_decay_v, wi, ai); |
| #else |
| ai = _mm256_add_ps(_mm256_mul_ps(weight_decay_v, wi), ai); |
| #endif |
| partial_sum = _mm256_add_ps(partial_sum, _mm256_mul_ps(ai, ai)); |
| } |
| // Reduce sum to 1 value |
| __m256 partial_sum_2 = _mm256_hadd_ps(partial_sum, partial_sum); |
| __m256 partial_sum_3 = _mm256_hadd_ps(partial_sum_2, partial_sum_2); |
| sum = _mm_cvtss_f32(_mm256_castps256_ps128(partial_sum_3)) + |
| _mm_cvtss_f32(_mm256_extractf128_ps(partial_sum_3, 1)); |
| #endif |
| |
| for (; i < len; ++i) { |
| float ai = std::fma(weight_decay, w[i], a[i]); |
| sum = std::fma(ai, ai, sum); |
| } |
| |
| return sum / len; |
| } |
| |
| } // namespace internal |
| |
| /** |
| * Fused operator of |
| * SparseLengthsIndicesInGradientSumGradient (gradient of SparseLengthsSum) + |
| * RowWiseSparseAdagrad. |
| * |
| * BW saving analysis for numSegments B, L_avg = avg(lengths), block_size D, |
| * assuming T = float and SIndex = int64_t: |
| * Before fusion, SparseLengthsIndicesInGradientSumGradient reads B*D*4 and |
| * writes B*L_avg*D*4. RowWiseSparseAdagrad reads B*2*L_avg*D*4 and writes |
| * B*L_avg*D*4. So, the total memory traffic is B*(1+4*L_avg)*D*4.
| * After fusion, we read B*(1+L_avg)*D*4 and write B*L_avg*D*4 with total |
| * memory traffic B*(1+2*L_avg)*D*4. |
| * Assuming L_avg >> 1, the memory BW saving is about 2x.
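| * For example, with B = 512, L_avg = 32, D = 64 (illustrative numbers, not
| * from a benchmark): before fusion the traffic is 512*(1+4*32)*64*4 bytes
| * ~= 16.9 MB; after fusion it is 512*(1+2*32)*64*4 bytes ~= 8.5 MB,
| * a ~1.98x reduction.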
| * |
| * See https://fb.quip.com/ldG7A55Ur5wM for more details on BW saving analysis |
| * and evaluation results. |
| */ |
| template < |
| typename Tdata, // embedding types |
| typename T, // everything else |
| typename TLengths, |
| typename rowWiseAdagradT, |
| bool is_mean = false> |
| class RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp final |
| : public Operator<CPUContext> { |
| public: |
| RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp( |
| const OperatorDef& operator_def, |
| Workspace* ws) |
| : Operator<CPUContext>(operator_def, ws), |
| epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5)), |
| weight_decay_( |
| this->template GetSingleArgument<float>("weight_decay", 0.f)), |
| counter_halflife_( |
| this->template GetSingleArgument<int64_t>("counter_halflife", -1)) { |
| VLOG(1) << "gradient optimization operator in use: " |
| << " weight_decay_=" << weight_decay_ |
| << " counter_halflife=" << counter_halflife_ |
| << " RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp bcyuan"; |
| const T decay = this->template GetSingleArgument<T>("decay", 1.0); |
| CAFFE_ENFORCE_EQ( |
| decay, 1.0, "Decay is not supported for SparseSimdAdagradOp"); |
| } |
| |
| bool RunOnDevice() override { |
| // Enforce shapes |
| CAFFE_ENFORCE_EQ(Input(PARAM).sizes()[0], Input(MOMENT_1).numel()); |
| CAFFE_ENFORCE_EQ(Input(LR).numel(), 1); |
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).size_from_dim(1), |
| Input(GRAD).size_from_dim(Input(INDICES).dim())); |
| |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } |
| |
| template <typename SIndex> |
| bool DoRunWithType() { |
| const auto* lr = Input(LR).template data<T>(); |
| Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); |
| Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); |
| |
| auto& segmentGradsInput = Input(GRAD); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE_EQ(lengthsInput.dim(), 1, "LENGTHS must be a vector"); |
| auto numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE_GT(segmentGradsInput.dim(), 0); |
| CAFFE_ENFORCE_EQ(numSegments, segmentGradsInput.size(0)); |
| const auto* lengths = lengthsInput.template data<TLengths>(); |
| |
| auto n = Input(INDICES).numel(); |
| auto numParams = Input(PARAM).numel(); |
| |
| const auto* indices = Input(INDICES).template data<SIndex>(); |
| const auto* gradIn = segmentGradsInput.template data<T>(); |
| const auto* paramIn = Input(PARAM).template data<Tdata>(); |
| const auto* momentIn = Input(MOMENT_1).template data<T>(); |
| const auto* count = counter_halflife_ == -1 |
| ? nullptr |
| : Input(COUNTER).template data<double>(); |
| |
| auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<Tdata>(); |
| auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>(); |
| |
| if (numSegments == 0) { |
| return true; |
| } |
| |
| auto block_size = segmentGradsInput.size_from_dim(1); |
| |
| // Enforce:
| // number of rows in PARAM (= numel / block_size) == numel of MOMENT_1
| // (one moment per row)
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).numel() / block_size, |
| Input(MOMENT_1).numel(), |
| "Input Param size: ", |
| Input(PARAM).numel(), |
| " Block size: ", |
| block_size, |
| " Input Moment size: ", |
| Input(MOMENT_1).numel()); |
| |
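| // For the Mean variant (is_mean == true), scale each segment's gradient by
| // 1 / lengths[rangeIndex] so the fused update matches the gradient of
| // SparseLengthsMean rather than SparseLengthsSum.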
| if (is_mean) { |
| grad_buffer_.ResizeLike(Input(GRAD)); |
| } |
| auto* grad_buffer_data =
| is_mean ? grad_buffer_.template mutable_data<T>() : nullptr;
| if (is_mean) { |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| for (const auto tmpIndex : c10::irange(block_size)) { |
| auto offsetI = rangeIndex * block_size; |
| grad_buffer_data[offsetI + tmpIndex] = lengths[rangeIndex] > 0 |
| ? gradIn[offsetI + tmpIndex] / lengths[rangeIndex] |
| : gradIn[offsetI + tmpIndex]; |
| } |
| } |
| } |
| |
| compute<SIndex>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| is_mean ? grad_buffer_data : gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| paramOut, |
| momentOut, |
| epsilon_, |
| lr[0], |
| weight_decay_, |
| counter_halflife_, |
| kernel_); |
| |
| return true; |
| } |
| |
| template <typename SIndex, bool HAS_WEIGHT_DECAY> |
| static void compute( |
| int64_t block_size, |
| const SIndex* indices, |
| int64_t n, |
| const TLengths* lengths, |
| int64_t numSegments, |
| const T* gradIn, |
| const Tdata* paramIn, |
| int64_t numParams, |
| const T* momentIn, |
| const double* count, |
| Tdata* paramOut, |
| T* momentOut, |
| float epsilon, |
| T lr, |
| T weight_decay, |
| T counter_halflife, |
| rowWiseAdagradT& kernel) { |
| int dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| auto offsetI = rangeIndex * block_size; |
| const float* g = gradIn + offsetI; |
| |
| float g_sq_avg = 0; |
| if (block_size > 1 && !HAS_WEIGHT_DECAY) { |
| g_sq_avg = internal::compute_square_average_inlined_(g, block_size); |
| } |
| |
| for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| std::size_t idx = indices[dataIndex]; |
| auto offsetIdx = idx * block_size; |
| |
| // Enforce: |
| // access within range |
| // gradient access within range |
| CAFFE_ENFORCE_GE( |
| numParams, |
| block_size + offsetIdx, |
| "Accessing params out of bound, idx:", |
| idx, |
| " for input dataIndex:", |
| dataIndex, |
| " and block size:", |
| block_size, |
| " max size:", |
| numParams); |
| |
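| // Frequency-adaptive weight decay: rows seen more often (larger count)
| // receive proportionally less decay, scaled by counter_halflife / count.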
| float freq = (counter_halflife > 0 && count[idx] > 0) |
| ? counter_halflife / count[idx] |
| : 1.0; |
| |
| if (block_size == 1) { |
| float gi = std::fma(weight_decay * freq, paramIn[idx], *g); |
| float hi = momentOut[idx] = momentIn[idx] + gi * gi; |
| paramOut[idx] = paramIn[idx] + lr / (std::sqrt(hi) + epsilon) * gi; |
| } else { |
| // prefetching |
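| // Look prefdist_T0 indices ahead and hand the kernel pointers to that
| // row's param/moment so it can software-prefetch them; the lookahead is
| // clamped near the end of the indices array.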
| const int prefdist_T0 = 16; |
| int i_pref = (dataIndex < n - prefdist_T0) ? dataIndex + prefdist_T0 |
| : dataIndex; |
| std::size_t idx_pref = indices[i_pref]; |
| |
| if (HAS_WEIGHT_DECAY) { |
| g_sq_avg = |
| internal::compute_square_average_with_weight_decay_inlined_( |
| g, paramOut + offsetIdx, block_size, weight_decay * freq); |
| } |
| |
| kernel( |
| block_size, |
| |
| paramOut + offsetIdx, |
| ¶mOut[idx_pref * block_size], |
| |
| g, |
| g_sq_avg, |
| |
| momentOut + idx, |
| momentOut + idx_pref, |
| |
| epsilon, |
| lr, |
| HAS_WEIGHT_DECAY ? weight_decay * freq : 0.0f); |
| } |
| } |
| } |
| CAFFE_ENFORCE_EQ(dataIndex, n); |
| } |
| |
| template <typename SIndex> |
| static void compute( |
| int64_t block_size, |
| const SIndex* indices, |
| int64_t n, |
| const TLengths* lengths, |
| int64_t numSegments, |
| const T* gradIn, |
| const Tdata* paramIn, |
| int64_t numParams, |
| const T* momentIn, |
| const double* count, |
| Tdata* paramOut, |
| T* momentOut, |
| float epsilon, |
| T lr, |
| T weight_decay, |
| T counter_halflife, |
| rowWiseAdagradT& kernel) { |
| if (weight_decay == 0.0f) { |
| compute<SIndex, false>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| paramOut, |
| momentOut, |
| epsilon, |
| lr, |
| 0.0f, |
| /*counter_halflife=*/-1, |
| kernel); |
| } else { |
| compute<SIndex, true>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| paramOut, |
| momentOut, |
| epsilon, |
| lr, |
| weight_decay, |
| counter_halflife, |
| kernel); |
| } |
| } |
| |
| protected: |
| T epsilon_; |
| T weight_decay_; |
| T counter_halflife_; |
| rowWiseAdagradT kernel_; |
| Tensor grad_buffer_{CPU}; |
| |
| INPUT_TAGS(PARAM, MOMENT_1, INDICES, GRAD, LR, LENGTHS, COUNTER); |
| OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1); |
| }; |
| |
| template < |
| typename Tdata, // embedding types |
| typename T, // everything else |
| typename TLengths, |
| typename rowWiseAdagradT> |
| class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp final |
| : public Operator<CPUContext> { |
| public: |
| RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientOp( |
| const OperatorDef& operator_def, |
| Workspace* ws) |
| : Operator<CPUContext>(operator_def, ws), |
| epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5)), |
| weight_decay_( |
| this->template GetSingleArgument<float>("weight_decay", 0.f)), |
| counter_halflife_( |
| this->template GetSingleArgument<int64_t>("counter_halflife", -1)) { |
| VLOG(1) << "gradient optimization operator in use: " |
| << " weight_decay_=" << weight_decay_ |
| << " counter_halflife=" << counter_halflife_ |
| << " RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp bcyuan"; |
| } |
| |
| bool RunOnDevice() override { |
| // Enforce shapes |
| CAFFE_ENFORCE_EQ(Input(PARAM).sizes()[0], Input(MOMENT_1).numel()); |
| CAFFE_ENFORCE_EQ(Input(LR).numel(), 1); |
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).size_from_dim(1), |
| Input(GRAD).size_from_dim(Input(INDICES).dim())); |
| |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } |
| |
| template <typename SIndex> |
| bool DoRunWithType() { |
| const auto* lr = Input(LR).template data<T>(); |
| Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); |
| Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); |
| |
| auto& segmentGradsInput = Input(GRAD); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE_EQ(lengthsInput.dim(), 1, "LENGTHS must be a vector"); |
| auto numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE_GT(segmentGradsInput.dim(), 0); |
| CAFFE_ENFORCE_EQ(numSegments, segmentGradsInput.size(0)); |
| const auto* lengths = lengthsInput.template data<TLengths>(); |
| |
| auto n = Input(INDICES).numel(); |
| auto numParams = Input(PARAM).numel(); |
| |
| const auto* indices = Input(INDICES).template data<SIndex>(); |
| const auto* gradIn = segmentGradsInput.template data<T>(); |
| const auto* paramIn = Input(PARAM).template data<Tdata>(); |
| const auto* momentIn = Input(MOMENT_1).template data<T>(); |
| const auto* auxParamIn = Input(AUX_PARAM).template data<T>(); |
| const auto* count = counter_halflife_ == -1 |
| ? nullptr |
| : Input(COUNTER).template data<double>(); |
| |
| auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<Tdata>(); |
| auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>(); |
| Output(AUX_GRAD)->Resize(n); |
| auto* auxGrad = Output(AUX_GRAD)->template mutable_data<T>(); |
| |
| CAFFE_ENFORCE_EQ( |
| paramIn, paramOut, "RowWiseSparseAdagrad must use inplace param"); |
| CAFFE_ENFORCE_EQ( |
| momentIn, momentOut, "RowWiseSparseAdagrad must use inplace momentum"); |
| |
| if (numSegments == 0) { |
| return true; |
| } |
| |
| auto block_size = segmentGradsInput.size_from_dim(1); |
| |
| // Enforce:
| // number of rows in PARAM (= numel / block_size) == numel of MOMENT_1
| // (one moment per row)
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).numel() / block_size, |
| Input(MOMENT_1).numel(), |
| "Input Param size: ", |
| Input(PARAM).numel(), |
| " Block size: ", |
| block_size, |
| " Input Moment size: ", |
| Input(MOMENT_1).numel()); |
| |
| compute<SIndex>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| auxParamIn, |
| paramOut, |
| momentOut, |
| auxGrad, |
| epsilon_, |
| lr[0], |
| weight_decay_, |
| counter_halflife_, |
| kernel_, |
| &context_); |
| |
| return true; |
| } |
| |
| template <typename SIndex, bool HAS_WEIGHT_DECAY> |
| static void compute( |
| int64_t block_size, |
| const SIndex* indices, |
| int64_t n, |
| const TLengths* lengths, |
| int64_t numSegments, |
| const T* gradIn, |
| const Tdata* paramIn, |
| int64_t numParams, |
| const T* momentIn, |
| const double* count, |
| const T* auxParamIn, |
| Tdata* paramOut, |
| T* momentOut, |
| T* auxGrad, |
| float epsilon, |
| T lr, |
| T weight_decay, |
| T counter_halflife, |
| rowWiseAdagradT& kernel, |
| CPUContext* context) { |
| // Cannot fuse this loop with the loop below because paramIn is updated |
| // by the second loop. Specifically, there could be dataIndex1 != dataIndex2 |
| // s.t. indices[dataIndex1] == indices[dataIndex2], and fusing these two |
| // loops would violate dependencies w.r.t. |
| // paramIn[indices[dataIndex1]:block_size]. The approximate version
| // (RowWiseSparseSimdAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp) |
| // ignores this dependency and fuses these two loops. |
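| // For example, with indices = {7, 3, 7}, a single fused loop would update
| // row 7 at dataIndex 0 before the dot product at dataIndex 2 reads it.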
| std::vector<T> temp_grad(block_size); |
| int dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| std::size_t idx = indices[dataIndex]; |
| auto offsetI = rangeIndex * block_size; |
| auto offsetIdx = idx * block_size; |
| |
| // Enforce: |
| // access within range |
| // gradient access within range |
| CAFFE_ENFORCE_GE( |
| numParams, |
| block_size + offsetIdx, |
| "Accessing params out of bound, idx:", |
| idx, |
| " for input dataIndex:", |
| dataIndex, |
| " and block size:", |
| block_size, |
| " max size:", |
| numParams); |
| |
| // temp_aux_grad[dataIndex] = gradIn[offsetI] dot paramIn[offsetIdx] |
| internal::dot<T, Tdata, T>( |
| block_size, |
| gradIn + offsetI, |
| paramIn + offsetIdx, |
| auxGrad + dataIndex, |
| context); |
| } |
| } |
| CAFFE_ENFORCE_EQ(dataIndex, n); |
| |
| dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| auto offsetI = rangeIndex * block_size; |
| const float* g = gradIn + offsetI; |
| |
| float g_sq_avg = 0;
| if (block_size > 1 && !HAS_WEIGHT_DECAY) { |
| g_sq_avg = internal::compute_square_average_inlined_(g, block_size); |
| } |
| |
| for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| auto idx = indices[dataIndex]; |
| auto offsetIdx = idx * block_size; |
| auto localOffset = dataIndex - start; |
| |
| for (const auto i : c10::irange(block_size)) { |
| temp_grad[i] = auxParamIn[localOffset] * g[i]; |
| } |
| |
| float freq = (counter_halflife > 0 && count[idx] > 0) |
| ? counter_halflife / count[idx] |
| : 1.0; |
| |
| if (block_size == 1) { |
| float gi = std::fma(weight_decay * freq, paramIn[idx], temp_grad[0]); |
| float hi = momentOut[idx] = momentIn[idx] + gi * gi; |
| paramOut[idx] = paramIn[idx] + lr / (std::sqrt(hi) + epsilon) * gi; |
| } else { |
| // prefetching |
| const int prefdist_T0 = 16; |
| int i_pref = (dataIndex < n - prefdist_T0) ? dataIndex + prefdist_T0 |
| : dataIndex; |
| std::size_t idx_pref = indices[i_pref]; |
| |
| if (HAS_WEIGHT_DECAY) { |
| g_sq_avg = |
| internal::compute_square_average_with_weight_decay_inlined_( |
| temp_grad.data(), |
| paramOut + offsetIdx, |
| block_size, |
| weight_decay * freq); |
| } |
| |
| kernel( |
| block_size, |
| |
| paramOut + offsetIdx, |
| ¶mOut[idx_pref * block_size], |
| |
| temp_grad.data(), |
| g_sq_avg * |
| (HAS_WEIGHT_DECAY |
| ? 1 |
| : auxParamIn[localOffset] * auxParamIn[localOffset]), |
| |
| momentOut + idx, |
| momentOut + idx_pref, |
| |
| epsilon, |
| lr, |
| HAS_WEIGHT_DECAY ? weight_decay * freq : 0.0f); |
| } |
| } |
| } |
| } |
| |
| template <typename SIndex> |
| static void compute( |
| int64_t block_size, |
| const SIndex* indices, |
| int64_t n, |
| const TLengths* lengths, |
| int64_t numSegments, |
| const T* gradIn, |
| const Tdata* paramIn, |
| int64_t numParams, |
| const T* momentIn, |
| const double* count, |
| const T* auxParamIn, |
| Tdata* paramOut, |
| T* momentOut, |
| T* auxGrad, |
| float epsilon, |
| T lr, |
| T weight_decay, |
| T counter_halflife, |
| rowWiseAdagradT& kernel, |
| CPUContext* context) { |
| if (weight_decay == 0.0f) { |
| compute<SIndex, /*HAS_WEIGHT_DECAY=*/false>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| auxParamIn, |
| paramOut, |
| momentOut, |
| auxGrad, |
| epsilon, |
| lr, |
| 0.0f, |
| /*counter_halflife=*/-1, |
| kernel, |
| context); |
| } else { |
| compute<SIndex, /*HAS_WEIGHT_DECAY=*/true>( |
| block_size, |
| indices, |
| n, |
| lengths, |
| numSegments, |
| gradIn, |
| paramIn, |
| numParams, |
| momentIn, |
| count, |
| auxParamIn, |
| paramOut, |
| momentOut, |
| auxGrad, |
| epsilon, |
| lr, |
| weight_decay, |
| counter_halflife, |
| kernel, |
| context); |
| } |
| } |
| |
| protected: |
| T epsilon_; |
| T weight_decay_; |
| T counter_halflife_; |
| rowWiseAdagradT kernel_; |
| |
| INPUT_TAGS(PARAM, MOMENT_1, AUX_PARAM, INDICES, GRAD, LR, LENGTHS, COUNTER); |
| OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, AUX_GRAD); |
| }; |
| |
| template < |
| typename Tdata, // embedding types |
| typename T, // everything else |
| typename TLengths, |
| typename rowWiseAdagradT> |
| class RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp |
| final : public Operator<CPUContext> { |
| public: |
| RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApproxOp( |
| const OperatorDef& operator_def, |
| Workspace* ws) |
| : Operator<CPUContext>(operator_def, ws), |
| epsilon_(this->template GetSingleArgument<float>("epsilon", 1e-5)), |
| weight_decay_( |
| this->template GetSingleArgument<float>("weight_decay", 0.f)), |
| counter_halflife_( |
| this->template GetSingleArgument<int64_t>("counter_halflife", -1)) { |
| VLOG(1) << "gradient optimization operator in use: " |
| << " weight_decay_=" << weight_decay_ |
| << " counter_halflife=" << counter_halflife_ |
| << " RowWiseSparseAdagradFusedWithSparseLengthsSumGradientOp bcyuan"; |
| const T decay = this->template GetSingleArgument<T>("decay", 1.0); |
| CAFFE_ENFORCE_EQ( |
| decay, 1.0, "Decay is not supported for SparseSimdAdagradOp"); |
| } |
| |
| bool RunOnDevice() override { |
| // Enforce shapes |
| CAFFE_ENFORCE_EQ(Input(PARAM).sizes()[0], Input(MOMENT_1).numel()); |
| CAFFE_ENFORCE_EQ(Input(LR).numel(), 1); |
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).size_from_dim(1), |
| Input(GRAD).size_from_dim(Input(INDICES).dim())); |
| |
| return DispatchHelper<TensorTypes<int32_t, int64_t>>::call( |
| this, Input(INDICES)); |
| } |
| |
| template <typename SIndex> |
| bool DoRunWithType() { |
| if (weight_decay_ == 0.0f) { |
| return DoRunWithType<SIndex, false>(); |
| } else { |
| return DoRunWithType<SIndex, true>(); |
| } |
| } |
| |
| template <typename SIndex, bool HAS_WEIGHT_DECAY> |
| bool DoRunWithType() { |
| const auto* lr = Input(LR).template data<T>(); |
| Output(OUTPUT_PARAM)->ResizeLike(Input(PARAM)); |
| Output(OUTPUT_MOMENT_1)->ResizeLike(Input(MOMENT_1)); |
| |
| auto& segmentGradsInput = Input(GRAD); |
| auto& lengthsInput = Input(LENGTHS); |
| |
| CAFFE_ENFORCE_EQ(lengthsInput.dim(), 1, "LENGTHS must be a vector"); |
| auto numSegments = lengthsInput.size(0); |
| CAFFE_ENFORCE_GT(segmentGradsInput.dim(), 0); |
| CAFFE_ENFORCE_EQ(numSegments, segmentGradsInput.size(0)); |
| const auto* lengths = lengthsInput.template data<TLengths>(); |
| |
| auto n = Input(INDICES).numel(); |
| |
| const auto* indices = Input(INDICES).template data<SIndex>(); |
| const auto* gradIn = segmentGradsInput.template data<T>(); |
| const auto* paramIn = Input(PARAM).template data<Tdata>(); |
| const auto* momentIn = Input(MOMENT_1).template data<T>(); |
| const auto* count = counter_halflife_ == -1 |
| ? nullptr |
| : Input(COUNTER).template data<double>(); |
| const auto* auxParamIn = Input(AUX_PARAM).template data<T>(); |
| |
| auto* paramOut = Output(OUTPUT_PARAM)->template mutable_data<Tdata>(); |
| auto* momentOut = Output(OUTPUT_MOMENT_1)->template mutable_data<T>(); |
| Output(AUX_GRAD)->Resize(n); |
| auto* auxGrad = Output(AUX_GRAD)->template mutable_data<T>(); |
| |
| CAFFE_ENFORCE_EQ( |
| paramIn, paramOut, "RowWiseSparseAdagrad must use inplace param"); |
| CAFFE_ENFORCE_EQ( |
| momentIn, momentOut, "RowWiseSparseAdagrad must use inplace momentum"); |
| |
| if (numSegments == 0) { |
| return true; |
| } |
| |
| auto block_size = segmentGradsInput.size_from_dim(1); |
| |
| // Enforce:
| // number of rows in PARAM (= numel / block_size) == numel of MOMENT_1
| // (one moment per row)
| CAFFE_ENFORCE_EQ( |
| Input(PARAM).numel() / block_size, |
| Input(MOMENT_1).numel(), |
| "Input Param size: ", |
| Input(PARAM).numel(), |
| " Block size: ", |
| block_size, |
| " Input Moment size: ", |
| Input(MOMENT_1).numel()); |
| |
| std::vector<T> temp_grad(block_size); |
| int dataIndex = 0; |
| for (const auto rangeIndex : c10::irange(numSegments)) { |
| auto offsetI = rangeIndex * block_size; |
| const float* g = gradIn + offsetI; |
| |
| float g_sq_avg = 0;
| if (block_size > 1 && !HAS_WEIGHT_DECAY) { |
| g_sq_avg = internal::compute_square_average_inlined_(g, block_size); |
| } |
| |
| for (auto start = dataIndex; dataIndex < start + lengths[rangeIndex]; |
| ++dataIndex) { |
| std::size_t idx = indices[dataIndex]; |
| auto offsetIdx = idx * block_size; |
| auto localOffset = dataIndex - start; |
| |
| // Enforce: |
| // access within range |
| // gradient access within range |
| CAFFE_ENFORCE_GE( |
| Input(PARAM).numel(), |
| block_size + offsetIdx, |
| this->debug_def().input(PARAM), |
| ", out of bound, idx:", |
| idx, |
| " for input dataIndex:", |
| dataIndex, |
| " and block size:", |
| block_size, |
| " max size:", |
| Input(PARAM).numel()); |
| |
| int i = 0; |
| float acc = 0.0f; |
| |
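| // Fused pass over the row: accumulate acc = dot(g, paramIn row) for the
| // aux-param gradient while also writing temp_grad[i] =
| // auxParamIn[localOffset] * g[i] for the main parameter update.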
| #ifdef __AVX__ |
| constexpr int VLEN = 8; |
| __m256 acc_v = _mm256_setzero_ps(); |
| __m256 scalar_v = _mm256_set1_ps(auxParamIn[localOffset]); |
| |
| if (std::is_same<Tdata, float>::value) { |
| for (; i < block_size / VLEN * VLEN; i += VLEN) { |
| __m256 a_v = _mm256_loadu_ps(g + i); |
| __m256 b_v = _mm256_loadu_ps( |
| reinterpret_cast<const float*>(paramIn + offsetIdx + i)); |
| __m256 c_v = _mm256_mul_ps(a_v, b_v); |
| acc_v = _mm256_add_ps(acc_v, c_v); |
| _mm256_storeu_ps(&temp_grad[i], _mm256_mul_ps(a_v, scalar_v)); |
| } |
| } else if (std::is_same<Tdata, at::Half>::value) { |
| for (; i < block_size / VLEN * VLEN; i += VLEN) { |
| __m256 a_v = _mm256_loadu_ps(g + i); |
| __m256 b_v = _mm256_cvtph_ps( |
| _mm_loadu_si128(reinterpret_cast<const __m128i*>(paramIn + offsetIdx + i)));
| __m256 c_v = _mm256_mul_ps(a_v, b_v); |
| acc_v = _mm256_add_ps(acc_v, c_v); |
| _mm256_storeu_ps(&temp_grad[i], _mm256_mul_ps(a_v, scalar_v)); |
| } |
| } else { |
| CAFFE_THROW("Unsupported type for Embedding"); |
| } |
| |
| alignas(64) float temp[VLEN]; |
| _mm256_store_ps(temp, acc_v); |
| for (const auto j : c10::irange(VLEN)) { |
| acc += temp[j]; |
| } |
| #endif |
| |
| for (; i < block_size; ++i) { |
| float a = g[i]; |
| acc += a * paramIn[offsetIdx + i]; |
| temp_grad[i] = a * auxParamIn[localOffset]; |
| } |
| auxGrad[dataIndex] = acc; |
| |
| float freq = (counter_halflife_ > 0 && count[idx] > 0) |
| ? counter_halflife_ / count[idx] |
| : 1.0; |
| |
| if (block_size == 1) { |
| float gi = std::fma(weight_decay_ * freq, paramIn[idx], temp_grad[0]); |
| float hi = momentOut[idx] = momentIn[idx] + gi * gi; |
| paramOut[idx] = |
| paramIn[idx] + lr[0] / (std::sqrt(hi) + epsilon_) * gi; |
| } else { |
| // prefetching |
| const int prefdist_T0 = 16; |
| int i_pref = (dataIndex < n - prefdist_T0) ? dataIndex + prefdist_T0 |
| : dataIndex; |
| std::size_t idx_pref = indices[i_pref]; |
| |
| if (HAS_WEIGHT_DECAY) { |
| g_sq_avg = |
| internal::compute_square_average_with_weight_decay_inlined_( |
| temp_grad.data(), |
| paramOut + offsetIdx, |
| block_size, |
| weight_decay_ * freq); |
| } |
| |
| kernel_( |
| block_size, |
| |
| paramOut + offsetIdx, |
| ¶mOut[idx_pref * block_size], |
| |
| temp_grad.data(), |
| g_sq_avg * |
| (HAS_WEIGHT_DECAY |
| ? 1 |
| : auxParamIn[localOffset] * auxParamIn[localOffset]), |
| |
| momentOut + idx, |
| momentOut + idx_pref, |
| |
| epsilon_, |
| lr[0], |
| HAS_WEIGHT_DECAY ? weight_decay_ * freq : 0.0f); |
| } |
| } |
| } |
| CAFFE_ENFORCE_EQ(dataIndex, n); |
| |
| return true; |
| } |
| |
| protected: |
| T epsilon_; |
| T weight_decay_; |
| T counter_halflife_; |
| rowWiseAdagradT kernel_; |
| |
| INPUT_TAGS(PARAM, MOMENT_1, AUX_PARAM, INDICES, GRAD, LR, LENGTHS, COUNTER); |
| OUTPUT_TAGS(OUTPUT_PARAM, OUTPUT_MOMENT_1, AUX_GRAD); |
| }; |
| |
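| // Row-wise AdaGrad update with a single accumulated moment per row:
| //   h += g_sq_avg;   // g_sq_avg = mean of squared (weight-decayed) grads
| //   w[i] += lr / (sqrt(h) + epsilon) * (g[i] + weight_decay * w[i]);
| // w_n/h_n point at the next row to be touched and are only prefetched.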
| struct rowwise_adagrad_update_inlined { |
| void operator()( |
| int N, |
| float* w, |
| float* w_n, // prefetch ptr |
| const float* g, |
| float g_sq_avg, |
| float* h, |
| float* h_n, // prefetch ptr |
| float epsilon, |
| float lr, |
| float weight_decay) { |
| #ifdef __AVX__ |
| constexpr int kSize = 8; |
| _mm_prefetch(reinterpret_cast<const char*>(h_n), _MM_HINT_T0); |
| #endif |
| float hi = *h = *h + g_sq_avg; |
| float float_step = lr / (std::sqrt(hi) + epsilon); |
| |
| int i = 0; |
| |
| #ifdef __AVX__ |
| __m256 step = _mm256_set1_ps(float_step); |
| __m256 weight_decay_v = _mm256_set1_ps(weight_decay); |
| |
| for (i = 0; i + kSize <= N; i += kSize) { |
| _mm_prefetch(reinterpret_cast<const char*>(&w_n[i]), _MM_HINT_T0); |
| |
| __m256 gi = _mm256_loadu_ps(g + i); |
| __m256 wi = _mm256_loadu_ps(w + i); |
| if (weight_decay != 0.0f) { |
| #ifdef __FMA__ |
| gi = _mm256_fmadd_ps(weight_decay_v, wi, gi); |
| #else |
| gi = _mm256_add_ps(_mm256_mul_ps(weight_decay_v, wi), gi); |
| #endif |
| } |
| |
| _mm256_storeu_ps(w + i, _mm256_add_ps(wi, _mm256_mul_ps(gi, step))); |
| } |
| #endif |
| |
| for (; i < N; ++i) { |
| float gi = |
| weight_decay != 0.0f ? std::fma(weight_decay, w[i], g[i]) : g[i]; |
| w[i] = w[i] + gi * float_step; |
| } |
| } |
| }; |
| |
| } // namespace caffe2 |