#ifndef CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_
#define CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "c10/util/irange.h"

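// The operators in this file convert between per-feature ("single-feature")
// tensors and a merged multi-feature representation. As inferred from the
// implementations below, the merged representation is:
//   - scalar features: lengths (int32, one entry per example),
//                       keys (int64 feature ids), values (T)
//   - list features:    lengths, keys, values_lengths (int32), values_values (T)
//   - map features:     lengths, keys, values_lengths, values_keys (K),
//                       values_values (V)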
namespace caffe2 {

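// MergeDenseFeatureTensorsOp
// Converts a dense feature matrix into the merged scalar-feature format.
// Inputs (as read from DoRunWithType below):
//   0: dense values, shape [numExamples, numFeatures]
//   1: presence mask (bool) of the same shape; absent entries are skipped
// The "feature_ids" argument supplies one int64 key per dense column.
// Outputs: 0: lengths (int32 per example), 1: keys (int64), 2: values (T).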
template <class Context>
class MergeDenseFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeDenseFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeDenseFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    auto& dense_data = Input(0);
    int numExamples = dense_data.size(0);
    int numFeatures = dense_data.size(1);

    const bool* inPresenceData = Input(1).template data<bool>();
    int totalNumFeatures = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatures)) {
        if (inPresenceData[exampleIndex * numFeatures + inputIndex]) {
          ++totalNumFeatures;
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    T* outValuesData = outValues->template mutable_data<T>();
    const T* inData =
      Input(0).template data<T>();

    int keysOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      auto offset = exampleIndex * numFeatures;
      for (const auto inputIndex : c10::irange(numFeatures)) {
        if (inPresenceData[offset]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesData[keysOffset] = inData[offset];
          ++keysOffset;
        }
        offset++;
      }
    }
    return true;
  }

 private:
  std::vector<int64_t> featureIDs_;
};

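// MergeSingleScalarFeatureTensorsOp
// Merges several single-feature scalar tensors into one multi-feature
// (lengths, keys, values) triple. Inputs come in pairs per feature:
//   2*i:     values (T), one entry per example
//   2*i + 1: presence mask (bool), one entry per example
// The "feature_ids" argument supplies the int64 key for each feature.
// Illustrative example (values chosen here, not taken from the source):
// with feature_ids = {11, 12}, two examples, and
//   in1_values = {1.5, 2.5}, in1_presence = {true, false},
//   in2_values = {3.5, 4.5}, in2_presence = {true, true},
// the loops below produce
//   out_lengths = {2, 1}, out_keys = {11, 12, 12},
//   out_values  = {1.5, 3.5, 4.5}.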
template <class Context>
class MergeSingleScalarFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleScalarFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeSingleScalarFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(0));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          ++totalNumFeatures;
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    T* outValuesData = outValues->template mutable_data<T>();

    int keysOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const T* inData =
            Input(kNumTensorsPerInput * inputIndex).template data<T>();
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesData[keysOffset] = inData[exampleIndex];
          ++keysOffset;
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 2;
  int numInputs_;
  std::vector<int64_t> featureIDs_;
};

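// MergeSingleScalarFeatureTensorsGradientOp
// Scatters the merged values gradient back into one tensor per feature.
// Inputs: one presence mask (bool) per feature, followed by values_grad as
// the last input. For examples where a feature is absent, the gradient is
// filled with the default value T().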
template <class Context>
class MergeSingleScalarFeatureTensorsGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleScalarFeatureTensorsGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numFeatureInputs_ = InputSize() - 1; // Everything other than values_grad
  }
  virtual ~MergeSingleScalarFeatureTensorsGradientOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(InputSize() - 1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
      Output(inputIndex)->ResizeLike(Input(inputIndex));
    }

    const T* inValuesGradData = Input(InputSize() - 1).template data<T>();

    T default_value = T();
    int valuesOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
        const bool* inPresenceData = Input(inputIndex).template data<bool>();
        T* outFeatureData = Output(inputIndex)->template mutable_data<T>();
        if (inPresenceData[exampleIndex]) {
          outFeatureData[exampleIndex] = inValuesGradData[valuesOffset];
          ++valuesOffset;
        } else {
          outFeatureData[exampleIndex] = default_value;
        }
      }
    }
    return true;
  }

 private:
  int numFeatureInputs_;
};

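// MergeSingleListFeatureTensorsOp
// Merges single-feature list tensors into the multi-feature list format.
// Inputs come in triples per feature (as indexed in DoRunWithType below):
//   3*i: lengths (int32), 3*i + 1: values (T), 3*i + 2: presence (bool)
// Outputs: lengths, keys (from "feature_ids"), values_lengths, values_values.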
template <class Context>
class MergeSingleListFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleListFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    inValuesOffset_.resize(numInputs_);
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeSingleListFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 2).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          ++totalNumFeatures;
          totalNumValues += inLengthsData[exampleIndex];
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesValues = Output(3, {totalNumValues}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    T* outValuesValuesData = outValuesValues->template mutable_data<T>();

    int keysOffset = 0;
    int valuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 1);
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 2).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
          context_.CopyItemsSameDevice(
              inValues.dtype(),
              inLengthsData[exampleIndex],
              &inValues.template data<T>()[inValuesOffset_[inputIndex]],
              &outValuesValuesData[valuesOffset]);
          valuesOffset += inLengthsData[exampleIndex];
          inValuesOffset_[inputIndex] += inLengthsData[exampleIndex];
          ++keysOffset;
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 3;
  int numInputs_;
  std::vector<int> inValuesOffset_;
  std::vector<int64_t> featureIDs_;
};

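// MergeSingleListOrMapFeatureTensorsGradientOp
// Shared gradient for the single-feature list and map merges: splits the
// merged values gradient (the last input) back into one tensor per feature.
// Inputs come in pairs per feature: 2*i: lengths (int32),
// 2*i + 1: presence (bool), followed by the merged values gradient.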
template <class Context>
class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleListOrMapFeatureTensorsGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput;
  }
  virtual ~MergeSingleListOrMapFeatureTensorsGradientOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(InputSize() - 1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    std::vector<int> outValuesOffset(numFeatureInputs_);
    for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
      int inputNumValues = 0;
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          inputNumValues += inLengthsData[exampleIndex];
        }
      }
      Output(inputIndex)->Resize(inputNumValues);
    }

    const auto& inValuesValuesGrad = Input(InputSize() - 1);
    const T* inValuesValuesGradData = inValuesValuesGrad.template data<T>();

    int inValuesValuesOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
          context_.CopyItemsSameDevice(
              inValuesValuesGrad.dtype(),
              inLengthsData[exampleIndex],
              &inValuesValuesGradData[inValuesValuesOffset],
              &outFeatureValues[outValuesOffset[inputIndex]]);
          outValuesOffset[inputIndex] += inLengthsData[exampleIndex];
          inValuesValuesOffset += inLengthsData[exampleIndex];
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 2;
  int numFeatureInputs_;
};

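// MergeSingleMapFeatureTensorsOp
// Merges single-feature map tensors into the multi-feature map format.
// Inputs come in groups of four per feature:
//   4*i: lengths (int32), 4*i + 1: keys (K), 4*i + 2: values (V),
//   4*i + 3: presence (bool)
// Outputs: lengths, keys (from "feature_ids"), values_lengths, values_keys,
// values_values. The key type K and value type V are dispatched
// independently via DoRunWithType/DoRunWithType2.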
template <class Context>
class MergeSingleMapFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleMapFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    inValuesOffset_.resize(numInputs_);
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeSingleMapFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(1));
  }

  template <typename K>
  bool DoRunWithType() {
    return DispatchHelper<
        TensorTypes2<bool, int32_t, int64_t, float, double, std::string>,
        K>::call(this, Input(2));
  }

  template <typename K, typename V>
  bool DoRunWithType2() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 3).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          ++totalNumFeatures;
          totalNumValues += inLengthsData[exampleIndex];
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesKeys = Output(3, {totalNumValues}, at::dtype<K>());
    auto* outValuesValues = Output(4, {totalNumValues}, at::dtype<V>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    K* outValuesKeysData = outValuesKeys->template mutable_data<K>();
    V* outValuesValuesData = outValuesValues->template mutable_data<V>();

    int keysOffset = 0;
    int valuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const auto& inKeys = Input(kNumTensorsPerInput * inputIndex + 1);
        const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 2);
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 3).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
          context_.CopyItemsSameDevice(
              inKeys.dtype(),
              inLengthsData[exampleIndex],
              &inKeys.template data<K>()[inValuesOffset_[inputIndex]],
              &outValuesKeysData[valuesOffset]);
          context_.CopyItemsSameDevice(
              inValues.dtype(),
              inLengthsData[exampleIndex],
              &inValues.template data<V>()[inValuesOffset_[inputIndex]],
              &outValuesValuesData[valuesOffset]);
          valuesOffset += inLengthsData[exampleIndex];
          inValuesOffset_[inputIndex] += inLengthsData[exampleIndex];
          ++keysOffset;
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 4;
  int numInputs_;
  std::vector<int> inValuesOffset_;
  std::vector<int64_t> featureIDs_;
};

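// MergeMultiScalarFeatureTensorsOp
// Concatenates several already-merged scalar-feature groups per example.
// Inputs come in triples per group:
//   3*i: lengths (int32), 3*i + 1: keys (int64), 3*i + 2: values (T)
// Outputs: lengths, keys, values, with each example's entries from all
// groups placed back to back. Illustrative example (values chosen here, not
// taken from the source): merging lengths_1 = {1, 1}, keys_1 = {11, 12} with
// lengths_2 = {1, 0}, keys_2 = {20} gives out_lengths = {2, 1} and
// out_keys = {11, 20, 12}.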
template <class Context>
class MergeMultiScalarFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiScalarFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    inKeysOffset_.resize(numInputs_);
  }
  virtual ~MergeMultiScalarFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(2));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel();
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    T* outValuesData = outValues->template mutable_data<T>();

    int outKeysOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inKeysOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        auto inputKeysBlobIdx = kNumTensorsPerInput * inputIndex + 1;
        const int64_t* inKeysData =
            Input(inputKeysBlobIdx).template data<int64_t>();
        const T* inValuesData =
            Input(kNumTensorsPerInput * inputIndex + 2).template data<T>();
        outLengthsData[exampleIndex] += inLengthsData[exampleIndex];
        for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex];
             ++featureIndex) {
          CAFFE_ENFORCE_LT(outKeysOffset, totalNumFeatures);
          CAFFE_ENFORCE_LT(
              inKeysOffset_[inputIndex], Input(inputKeysBlobIdx).numel());
          outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
          outValuesData[outKeysOffset] =
              inValuesData[inKeysOffset_[inputIndex]];
          ++outKeysOffset;
          ++inKeysOffset_[inputIndex];
        }
      }
    }

    return true;
  }

 private:
  const int kNumTensorsPerInput = 3;
  int numInputs_;
  std::vector<int> inKeysOffset_;
};

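// MergeMultiScalarFeatureTensorsGradientOp
// Splits the merged values gradient (the last input) back into one values
// gradient per input group, using each group's lengths tensor (one input per
// group) to determine how many entries each example contributed.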
template <class Context>
class MergeMultiScalarFeatureTensorsGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiScalarFeatureTensorsGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput;
  }
  virtual ~MergeMultiScalarFeatureTensorsGradientOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(InputSize() - 1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    std::vector<int> outValuesOffset(numFeatureInputs_);
    for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
      int inputNumValues = 0;
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        inputNumValues += inLengthsData[exampleIndex];
      }
      Output(inputIndex)->Resize(inputNumValues);
    }

    const auto& inValuesGrad = Input(InputSize() - 1);
    const T* inValuesGradData = inValuesGrad.template data<T>();

    int inValuesOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        if (inLengthsData[exampleIndex] > 0) {
          T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
          context_.CopyItemsSameDevice(
              inValuesGrad.dtype(),
              inLengthsData[exampleIndex],
              &inValuesGradData[inValuesOffset],
              &outFeatureValues[outValuesOffset[inputIndex]]);
          outValuesOffset[inputIndex] += inLengthsData[exampleIndex];
          inValuesOffset += inLengthsData[exampleIndex];
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 1;
  int numFeatureInputs_;
};

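// MergeMultiListFeatureTensorsOp
// Concatenates already-merged list-feature groups per example.
// Inputs come in groups of four:
//   4*i: lengths (int32), 4*i + 1: keys (int64),
//   4*i + 2: values_lengths (int32), 4*i + 3: values_values (T)
// Outputs mirror the same four-tensor layout with all groups interleaved
// example by example.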
template <class Context>
class MergeMultiListFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiListFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    inKeysOffset_.resize(numInputs_);
    inValuesValuesOffset_.resize(numInputs_);
  }
  virtual ~MergeMultiListFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(3));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel();
      totalNumValues += Input(kNumTensorsPerInput * inputIndex + 3).numel();
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesValues = Output(3, {totalNumValues}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    T* outValuesValuesData = outValuesValues->template mutable_data<T>();

    int outKeysOffset = 0;
    int outValuesValuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inKeysOffset_[inputIndex] = 0;
      inValuesValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1)
                                        .template data<int64_t>();
        const int32_t* inValuesLengthsData =
            Input(kNumTensorsPerInput * inputIndex + 2)
                .template data<int32_t>();
        const auto& inValuesValues =
            Input(kNumTensorsPerInput * inputIndex + 3);
        outLengthsData[exampleIndex] += inLengthsData[exampleIndex];
        for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex];
             ++featureIndex) {
          outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
          outValuesLengthsData[outKeysOffset] =
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          context_.CopyItemsSameDevice(
              inValuesValues.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesValues
                   .template data<T>()[inValuesValuesOffset_[inputIndex]],
              &outValuesValuesData[outValuesValuesOffset]);
          outValuesValuesOffset +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          inValuesValuesOffset_[inputIndex] +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          ++outKeysOffset;
          ++inKeysOffset_[inputIndex];
        }
      }
    }

    return true;
  }

 private:
  const int kNumTensorsPerInput = 4;
  int numInputs_;
  std::vector<int> inKeysOffset_;
  std::vector<int> inValuesValuesOffset_;
};

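// MergeMultiMapFeatureTensorsOp
// Concatenates already-merged map-feature groups per example.
// Inputs come in groups of five:
//   5*i: lengths (int32), 5*i + 1: keys (int64),
//   5*i + 2: values_lengths (int32), 5*i + 3: values_keys (K),
//   5*i + 4: values_values (V)
// Outputs mirror the same five-tensor layout; K and V are dispatched
// independently via DoRunWithType/DoRunWithType2.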
template <class Context>
class MergeMultiMapFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiMapFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    inKeysOffset_.resize(numInputs_);
    inValuesValuesOffset_.resize(numInputs_);
  }
  virtual ~MergeMultiMapFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(3));
  }

  template <typename K>
  bool DoRunWithType() {
    return DispatchHelper<
        TensorTypes2<bool, int32_t, int64_t, float, double, std::string>,
        K>::call(this, Input(4));
  }

  template <typename K, typename V>
  bool DoRunWithType2() {
    int numExamples = Input(0).numel();
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel();
      totalNumValues += Input(kNumTensorsPerInput * inputIndex + 4).numel();
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesKeys = Output(3, {totalNumValues}, at::dtype<K>());
    auto* outValuesValues = Output(4, {totalNumValues}, at::dtype<V>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    K* outValuesKeysData = outValuesKeys->template mutable_data<K>();
    V* outValuesValuesData = outValuesValues->template mutable_data<V>();

    int outKeysOffset = 0;
    int outValuesValuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inKeysOffset_[inputIndex] = 0;
      inValuesValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1)
                                        .template data<int64_t>();
        const int32_t* inValuesLengthsData =
            Input(kNumTensorsPerInput * inputIndex + 2)
                .template data<int32_t>();
        const auto& inValuesKeys = Input(kNumTensorsPerInput * inputIndex + 3);
        const auto& inValuesValues =
            Input(kNumTensorsPerInput * inputIndex + 4);
        outLengthsData[exampleIndex] += inLengthsData[exampleIndex];
        for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex];
             ++featureIndex) {
          outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
          outValuesLengthsData[outKeysOffset] =
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          context_.CopyItemsSameDevice(
              inValuesKeys.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesKeys
                   .template data<K>()[inValuesValuesOffset_[inputIndex]],
              &outValuesKeysData[outValuesValuesOffset]);
          context_.CopyItemsSameDevice(
              inValuesValues.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesValues
                   .template data<V>()[inValuesValuesOffset_[inputIndex]],
              &outValuesValuesData[outValuesValuesOffset]);
          outValuesValuesOffset +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          inValuesValuesOffset_[inputIndex] +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          ++outKeysOffset;
          ++inKeysOffset_[inputIndex];
        }
      }
    }

    return true;
  }

 private:
  const int kNumTensorsPerInput = 5;
  int numInputs_;
  std::vector<int> inKeysOffset_;
  std::vector<int> inValuesValuesOffset_;
};

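// MergeMultiListOrMapFeatureTensorsGradientOp
// Shared gradient for the multi-feature list and map merges: splits the
// merged values gradient (the last input) back into one tensor per input
// group. Inputs come in pairs per group: 2*i: lengths (int32),
// 2*i + 1: values_lengths (int32), followed by the merged values gradient.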
template <class Context>
class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiListOrMapFeatureTensorsGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput;
  }
  virtual ~MergeMultiListOrMapFeatureTensorsGradientOp() noexcept {}

  bool RunOnDevice() override {
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(InputSize() - 1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    std::vector<int> outValuesLengthOffset(numFeatureInputs_);
    std::vector<int> outValuesValuesOffset(numFeatureInputs_);
    for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
      int inputNumValues = 0;
      auto& inValuesLength = Input(kNumTensorsPerInput * inputIndex + 1);
      const int32_t* inValuesLengthsData =
          inValuesLength.template data<int32_t>();
      for (const auto valuesIndex : c10::irange(inValuesLength.numel())) {
        inputNumValues += inValuesLengthsData[valuesIndex];
      }
      Output(inputIndex)->Resize(inputNumValues);
    }

    const auto& inValuesValuesGrad = Input(InputSize() - 1);
    const T* inValuesValuesGradData = inValuesValuesGrad.template data<T>();

    int inValuesValuesOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const int32_t* inValuesLengthsData =
            Input(kNumTensorsPerInput * inputIndex + 1)
                .template data<int32_t>();
        int valuesLengthCopy = 0;
        for (int valuesLengthIndex = 0;
             valuesLengthIndex < inLengthsData[exampleIndex];
             ++valuesLengthIndex) {
          valuesLengthCopy += inValuesLengthsData
              [outValuesLengthOffset[inputIndex] + valuesLengthIndex];
        }
        if (valuesLengthCopy > 0) {
          T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
          context_.CopyItemsSameDevice(
              inValuesValuesGrad.dtype(),
              valuesLengthCopy,
              &inValuesValuesGradData[inValuesValuesOffset],
              &outFeatureValues[outValuesValuesOffset[inputIndex]]);
        }
        outValuesLengthOffset[inputIndex] += inLengthsData[exampleIndex];
        outValuesValuesOffset[inputIndex] += valuesLengthCopy;
        inValuesValuesOffset += valuesLengthCopy;
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 2;
  int numFeatureInputs_;
};

} // namespace caffe2

#endif // CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_