|  | #ifndef CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_ | 
|  | #define CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_ | 
|  |  | 
|  | #include "caffe2/core/context.h" | 
|  | #include "caffe2/core/operator.h" | 
|  | #include "c10/util/irange.h" | 
|  |  | 
|  | namespace caffe2 { | 
|  |  | 
|  | template <class Context> | 
|  | class MergeDenseFeatureTensorsOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeDenseFeatureTensorsOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids"); | 
|  | } | 
|  | virtual ~MergeDenseFeatureTensorsOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(0)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | auto& dense_data = Input(0); | 
|  | int numExamples = dense_data.size(0); | 
|  | int numFeatures = dense_data.size(1); | 
|  |  | 
|  | const bool* inPresenceData = Input(1).template data<bool>(); | 
|  | int totalNumFeatures = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | for (const auto inputIndex : c10::irange(numFeatures)) { | 
|  | if (inPresenceData[exampleIndex * numFeatures + inputIndex]) { | 
|  | ++totalNumFeatures; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>()); | 
|  | auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>()); | 
|  | auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>()); | 
|  |  | 
|  | int32_t* outLengthsData = outLengths->template mutable_data<int32_t>(); | 
|  | int64_t* outKeysData = outKeys->template mutable_data<int64_t>(); | 
|  | T* outValuesData = outValues->template mutable_data<T>(); | 
|  | const T* inData = | 
|  | Input(0).template data<T>(); | 
|  |  | 
|  | int keysOffset = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | outLengthsData[exampleIndex] = 0; | 
|  | auto offset = exampleIndex * numFeatures; | 
|  | for (const auto inputIndex : c10::irange(numFeatures)) { | 
|  | if (inPresenceData[offset]) { | 
|  | ++outLengthsData[exampleIndex]; | 
|  | outKeysData[keysOffset] = featureIDs_[inputIndex]; | 
|  | outValuesData[keysOffset] = inData[offset]; | 
|  | ++keysOffset; | 
|  | } | 
|  | offset++; | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | std::vector<int64_t> featureIDs_; | 
|  | }; | 
|  |  | 
|  | template <class Context> | 
|  | class MergeSingleScalarFeatureTensorsOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeSingleScalarFeatureTensorsOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | numInputs_ = InputSize() / kNumTensorsPerInput; | 
|  | featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids"); | 
|  | } | 
|  | virtual ~MergeSingleScalarFeatureTensorsOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(0)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | int numExamples = Input(0).numel(); | 
|  | int totalNumFeatures = 0; | 
|  | for (const auto inputIndex : c10::irange(numInputs_)) { | 
|  | const bool* inPresenceData = | 
|  | Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>(); | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | if (inPresenceData[exampleIndex]) { | 
|  | ++totalNumFeatures; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>()); | 
|  | auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>()); | 
|  | auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>()); | 
|  |  | 
|  | int32_t* outLengthsData = outLengths->template mutable_data<int32_t>(); | 
|  | int64_t* outKeysData = outKeys->template mutable_data<int64_t>(); | 
|  | T* outValuesData = outValues->template mutable_data<T>(); | 
|  |  | 
|  | int keysOffset = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | outLengthsData[exampleIndex] = 0; | 
|  | for (const auto inputIndex : c10::irange(numInputs_)) { | 
|  | const T* inData = | 
|  | Input(kNumTensorsPerInput * inputIndex).template data<T>(); | 
|  | const bool* inPresenceData = | 
|  | Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>(); | 
|  | if (inPresenceData[exampleIndex]) { | 
|  | ++outLengthsData[exampleIndex]; | 
|  | outKeysData[keysOffset] = featureIDs_[inputIndex]; | 
|  | outValuesData[keysOffset] = inData[exampleIndex]; | 
|  | ++keysOffset; | 
|  | } | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | const int kNumTensorsPerInput = 2; | 
|  | int numInputs_; | 
|  | std::vector<int64_t> featureIDs_; | 
|  | }; | 
|  |  | 
|  | template <class Context> | 
|  | class MergeSingleScalarFeatureTensorsGradientOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeSingleScalarFeatureTensorsGradientOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | numFeatureInputs_ = InputSize() - 1; // Everything other than values_grad | 
|  | } | 
|  | virtual ~MergeSingleScalarFeatureTensorsGradientOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(InputSize() - 1)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | int numExamples = Input(0).numel(); | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | Output(inputIndex)->ResizeLike(Input(inputIndex)); | 
|  | } | 
|  |  | 
|  | const T* inValuesGradData = Input(InputSize() - 1).template data<T>(); | 
|  |  | 
|  | T default_value = T(); | 
|  | int valuesOffset = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | const bool* inPresenceData = Input(inputIndex).template data<bool>(); | 
|  | T* outFeatureData = Output(inputIndex)->template mutable_data<T>(); | 
|  | if (inPresenceData[exampleIndex]) { | 
|  | outFeatureData[exampleIndex] = inValuesGradData[valuesOffset]; | 
|  | ++valuesOffset; | 
|  | } else { | 
|  | outFeatureData[exampleIndex] = default_value; | 
|  | } | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | int numFeatureInputs_; | 
|  | }; | 
|  |  | 
// Merges multiple single-list-feature tensors into one multi-list-feature
// tensor in the (lengths, keys, values_lengths, values) representation.
//
// Each input feature is described by kNumTensorsPerInput (= 3) tensors:
//   [i*3 + 0]: int32 in_lengths  — per-example list length.
//   [i*3 + 1]: in_values         — concatenated list values.
//   [i*3 + 2]: bool in_presence  — per-example presence flag.
// Outputs:
//   0: int32 lengths        — number of present features per example.
//   1: int64 keys           — feature ids (from the "feature_ids" arg).
//   2: int32 values_lengths — list length per emitted feature.
//   3: values               — concatenated list values.
template <class Context>
class MergeSingleListFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleListFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    // Per-input read cursor into each values tensor; reset on every run.
    inValuesOffset_.resize(numInputs_);
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeSingleListFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    // Dispatch on the dtype of the first values tensor.
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    // First pass: count present features and their total value count so
    // the outputs can be allocated exactly once.
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 2).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          ++totalNumFeatures;
          totalNumValues += inLengthsData[exampleIndex];
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesValues = Output(3, {totalNumValues}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    T* outValuesValuesData = outValuesValues->template mutable_data<T>();

    // Second pass: emit features example-major, inputs in order within
    // each example. inValuesOffset_ tracks how far into each input's
    // values tensor we have consumed so far.
    int keysOffset = 0;
    int valuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 1);
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 2).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
          // Copy this example's whole list in one call; dtype-aware copy
          // also handles non-trivially-copyable types (e.g. std::string).
          context_.CopyItemsSameDevice(
              inValues.dtype(),
              inLengthsData[exampleIndex],
              &inValues.template data<T>()[inValuesOffset_[inputIndex]],
              &outValuesValuesData[valuesOffset]);
          valuesOffset += inLengthsData[exampleIndex];
          inValuesOffset_[inputIndex] += inLengthsData[exampleIndex];
          ++keysOffset;
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 3;
  int numInputs_;
  std::vector<int> inValuesOffset_;
  std::vector<int64_t> featureIDs_;
};
|  |  | 
// Gradient of MergeSingleListFeatureTensorsOp / the single-map variant:
// splits the merged values gradient back into one values gradient per
// input feature.
//
// Inputs, per original feature i (kNumTensorsPerInput = 2):
//   [i*2 + 0]: int32 in_lengths — values per example for that feature.
//   [i*2 + 1]: bool in_presence — per-example presence flag.
// Last input: values_grad — gradient w.r.t. the merged values output.
// Outputs:
//   [0 .. n-1]: per-feature values gradients.
template <class Context>
class MergeSingleListOrMapFeatureTensorsGradientOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleListOrMapFeatureTensorsGradientOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput;
  }
  virtual ~MergeSingleListOrMapFeatureTensorsGradientOp() noexcept {}

  bool RunOnDevice() override {
    // Dispatch on the dtype of the incoming values gradient.
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(InputSize() - 1));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    // Per-input write cursor into the corresponding output tensor.
    std::vector<int> outValuesOffset(numFeatureInputs_);
    // Size each output to the number of values its feature contributed
    // (only examples where the feature was present count).
    for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
      int inputNumValues = 0;
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          inputNumValues += inLengthsData[exampleIndex];
        }
      }
      Output(inputIndex)->Resize(inputNumValues);
    }

    const auto& inValuesValuesGrad = Input(InputSize() - 1);
    const T* inValuesValuesGradData = inValuesValuesGrad.template data<T>();

    // Walk the packed gradient in the same example-major, input-minor
    // order the forward op used to emit values, block-copying each
    // present feature's segment into its own output.
    int inValuesValuesOffset = 0;
    for (const auto exampleIndex : c10::irange(numExamples)) {
      for (const auto inputIndex : c10::irange(numFeatureInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 1).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          T* outFeatureValues = Output(inputIndex)->template mutable_data<T>();
          context_.CopyItemsSameDevice(
              inValuesValuesGrad.dtype(),
              inLengthsData[exampleIndex],
              &inValuesValuesGradData[inValuesValuesOffset],
              &outFeatureValues[outValuesOffset[inputIndex]]);
          outValuesOffset[inputIndex] += inLengthsData[exampleIndex];
          inValuesValuesOffset += inLengthsData[exampleIndex];
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 2;
  int numFeatureInputs_;
};
|  |  | 
// Merges multiple single-map-feature tensors into one multi-map-feature
// tensor in the (lengths, keys, values_lengths, values_keys,
// values_values) representation.
//
// Each input feature is described by kNumTensorsPerInput (= 4) tensors:
//   [i*4 + 0]: int32 in_lengths — per-example map size.
//   [i*4 + 1]: in_keys          — concatenated map keys (type K).
//   [i*4 + 2]: in_values        — concatenated map values (type V).
//   [i*4 + 3]: bool in_presence — per-example presence flag.
// Outputs:
//   0: int32 lengths        — number of present features per example.
//   1: int64 keys           — feature ids (from the "feature_ids" arg).
//   2: int32 values_lengths — map size per emitted feature.
//   3: values_keys          — concatenated map keys.
//   4: values_values        — concatenated map values.
template <class Context>
class MergeSingleMapFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeSingleMapFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    // Shared read cursor into each input's keys AND values tensors (they
    // advance in lockstep); reset on every run.
    inValuesOffset_.resize(numInputs_);
    featureIDs_ = this->template GetRepeatedArgument<int64_t>("feature_ids");
  }
  virtual ~MergeSingleMapFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    // First-level dispatch: K = dtype of the first keys tensor.
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(1));
  }

  template <typename K>
  bool DoRunWithType() {
    // Second-level dispatch: V = dtype of the first values tensor.
    return DispatchHelper<
        TensorTypes2<bool, int32_t, int64_t, float, double, std::string>,
        K>::call(this, Input(2));
  }

  template <typename K, typename V>
  bool DoRunWithType2() {
    int numExamples = Input(0).numel();
    // First pass: count present features and their total entry count so
    // the outputs can be allocated exactly once.
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      const int32_t* inLengthsData =
          Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
      const bool* inPresenceData =
          Input(kNumTensorsPerInput * inputIndex + 3).template data<bool>();
      for (const auto exampleIndex : c10::irange(numExamples)) {
        if (inPresenceData[exampleIndex]) {
          ++totalNumFeatures;
          totalNumValues += inLengthsData[exampleIndex];
        }
      }
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesKeys = Output(3, {totalNumValues}, at::dtype<K>());
    auto* outValuesValues = Output(4, {totalNumValues}, at::dtype<V>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    K* outValuesKeysData = outValuesKeys->template mutable_data<K>();
    V* outValuesValuesData = outValuesValues->template mutable_data<V>();

    // Second pass: emit features example-major, inputs in order within
    // each example; the map's keys and values are copied as two parallel
    // blocks sharing the same offsets.
    int keysOffset = 0;
    int valuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const auto& inKeys = Input(kNumTensorsPerInput * inputIndex + 1);
        const auto& inValues = Input(kNumTensorsPerInput * inputIndex + 2);
        const bool* inPresenceData =
            Input(kNumTensorsPerInput * inputIndex + 3).template data<bool>();
        if (inPresenceData[exampleIndex]) {
          ++outLengthsData[exampleIndex];
          outKeysData[keysOffset] = featureIDs_[inputIndex];
          outValuesLengthsData[keysOffset] = inLengthsData[exampleIndex];
          context_.CopyItemsSameDevice(
              inKeys.dtype(),
              inLengthsData[exampleIndex],
              &inKeys.template data<K>()[inValuesOffset_[inputIndex]],
              &outValuesKeysData[valuesOffset]);
          context_.CopyItemsSameDevice(
              inValues.dtype(),
              inLengthsData[exampleIndex],
              &inValues.template data<V>()[inValuesOffset_[inputIndex]],
              &outValuesValuesData[valuesOffset]);
          valuesOffset += inLengthsData[exampleIndex];
          inValuesOffset_[inputIndex] += inLengthsData[exampleIndex];
          ++keysOffset;
        }
      }
    }
    return true;
  }

 private:
  const int kNumTensorsPerInput = 4;
  int numInputs_;
  std::vector<int> inValuesOffset_;
  std::vector<int64_t> featureIDs_;
};
|  |  | 
|  | template <class Context> | 
|  | class MergeMultiScalarFeatureTensorsOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeMultiScalarFeatureTensorsOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | numInputs_ = InputSize() / kNumTensorsPerInput; | 
|  | inKeysOffset_.resize(numInputs_); | 
|  | } | 
|  | virtual ~MergeMultiScalarFeatureTensorsOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(2)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | int numExamples = Input(0).numel(); | 
|  | int totalNumFeatures = 0; | 
|  | for (const auto inputIndex : c10::irange(numInputs_)) { | 
|  | totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel(); | 
|  | } | 
|  |  | 
|  | auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>()); | 
|  | auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>()); | 
|  | auto* outValues = Output(2, {totalNumFeatures}, at::dtype<T>()); | 
|  |  | 
|  | int32_t* outLengthsData = outLengths->template mutable_data<int32_t>(); | 
|  | int64_t* outKeysData = outKeys->template mutable_data<int64_t>(); | 
|  | T* outValuesData = outValues->template mutable_data<T>(); | 
|  |  | 
|  | int outKeysOffset = 0; | 
|  | for (const auto inputIndex : c10::irange(numInputs_)) { | 
|  | inKeysOffset_[inputIndex] = 0; | 
|  | } | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | outLengthsData[exampleIndex] = 0; | 
|  | for (const auto inputIndex : c10::irange(numInputs_)) { | 
|  | const int32_t* inLengthsData = | 
|  | Input(kNumTensorsPerInput * inputIndex).template data<int32_t>(); | 
|  | auto inputKeysBlobIdx = kNumTensorsPerInput * inputIndex + 1; | 
|  | const int64_t* inKeysData = | 
|  | Input(inputKeysBlobIdx).template data<int64_t>(); | 
|  | const T* inValuesData = | 
|  | Input(kNumTensorsPerInput * inputIndex + 2).template data<T>(); | 
|  | outLengthsData[exampleIndex] += inLengthsData[exampleIndex]; | 
|  | for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex]; | 
|  | ++featureIndex) { | 
|  | CAFFE_ENFORCE_LT(outKeysOffset, totalNumFeatures); | 
|  | CAFFE_ENFORCE_LT( | 
|  | inKeysOffset_[inputIndex], Input(inputKeysBlobIdx).numel()); | 
|  | outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]]; | 
|  | outValuesData[outKeysOffset] = | 
|  | inValuesData[inKeysOffset_[inputIndex]]; | 
|  | ++outKeysOffset; | 
|  | ++inKeysOffset_[inputIndex]; | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | const int kNumTensorsPerInput = 3; | 
|  | int numInputs_; | 
|  | std::vector<int> inKeysOffset_; | 
|  | }; | 
|  |  | 
|  | template <class Context> | 
|  | class MergeMultiScalarFeatureTensorsGradientOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeMultiScalarFeatureTensorsGradientOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput; | 
|  | } | 
|  | virtual ~MergeMultiScalarFeatureTensorsGradientOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(InputSize() - 1)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | int numExamples = Input(0).numel(); | 
|  | std::vector<int> outValuesOffset(numFeatureInputs_); | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | int inputNumValues = 0; | 
|  | const int32_t* inLengthsData = | 
|  | Input(kNumTensorsPerInput * inputIndex).template data<int32_t>(); | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | inputNumValues += inLengthsData[exampleIndex]; | 
|  | } | 
|  | Output(inputIndex)->Resize(inputNumValues); | 
|  | } | 
|  |  | 
|  | const auto& inValuesGrad = Input(InputSize() - 1); | 
|  | const T* inValuesGradData = inValuesGrad.template data<T>(); | 
|  |  | 
|  | int inValuesOffset = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | const int32_t* inLengthsData = | 
|  | Input(kNumTensorsPerInput * inputIndex).template data<int32_t>(); | 
|  | if (inLengthsData[exampleIndex] > 0) { | 
|  | T* outFeatureValues = Output(inputIndex)->template mutable_data<T>(); | 
|  | context_.CopyItemsSameDevice( | 
|  | inValuesGrad.dtype(), | 
|  | inLengthsData[exampleIndex], | 
|  | &inValuesGradData[inValuesOffset], | 
|  | &outFeatureValues[outValuesOffset[inputIndex]]); | 
|  | outValuesOffset[inputIndex] += inLengthsData[exampleIndex]; | 
|  | inValuesOffset += inLengthsData[exampleIndex]; | 
|  | } | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | int kNumTensorsPerInput = 1; | 
|  | int numFeatureInputs_; | 
|  | }; | 
|  |  | 
// Merges several multi-list-feature groups, each already in the
// (lengths, keys, values_lengths, values_values) representation, into a
// single such group, interleaving the inputs per example.
//
// Each input group i is described by kNumTensorsPerInput (= 4) tensors:
//   [i*4 + 0]: int32 in_lengths        — features per example.
//   [i*4 + 1]: int64 in_keys           — concatenated feature ids.
//   [i*4 + 2]: int32 in_values_lengths — list length per feature.
//   [i*4 + 3]: in_values_values        — concatenated list values.
template <class Context>
class MergeMultiListFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiListFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    // Per-group read cursors: one into keys/values_lengths, one into the
    // flat values; both reset on every run.
    inKeysOffset_.resize(numInputs_);
    inValuesValuesOffset_.resize(numInputs_);
  }
  virtual ~MergeMultiListFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    // Dispatch on the dtype of the first values tensor.
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(3));
  }

  template <typename T>
  bool DoRunWithType() {
    int numExamples = Input(0).numel();
    // The merged output sizes are the sums of all groups' keys and
    // values tensors respectively.
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel();
      totalNumValues += Input(kNumTensorsPerInput * inputIndex + 3).numel();
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesValues = Output(3, {totalNumValues}, at::dtype<T>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    T* outValuesValuesData = outValuesValues->template mutable_data<T>();

    // Emit example-major, groups in order within each example; each
    // feature copies its whole inner list in one block.
    int outKeysOffset = 0;
    int outValuesValuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inKeysOffset_[inputIndex] = 0;
      inValuesValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1)
                                        .template data<int64_t>();
        const int32_t* inValuesLengthsData =
            Input(kNumTensorsPerInput * inputIndex + 2)
                .template data<int32_t>();
        const auto& inValuesValues =
            Input(kNumTensorsPerInput * inputIndex + 3);
        outLengthsData[exampleIndex] += inLengthsData[exampleIndex];
        for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex];
             ++featureIndex) {
          outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
          outValuesLengthsData[outKeysOffset] =
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          context_.CopyItemsSameDevice(
              inValuesValues.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesValues
                   .template data<T>()[inValuesValuesOffset_[inputIndex]],
              &outValuesValuesData[outValuesValuesOffset]);
          outValuesValuesOffset +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          inValuesValuesOffset_[inputIndex] +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          ++outKeysOffset;
          // Advance the key cursor last: the values-lengths reads above
          // are indexed by the pre-increment position.
          ++inKeysOffset_[inputIndex];
        }
      }
    }

    return true;
  }

 private:
  const int kNumTensorsPerInput = 4;
  int numInputs_;
  std::vector<int> inKeysOffset_;
  std::vector<int> inValuesValuesOffset_;
};
|  |  | 
// Merges several multi-map-feature groups, each already in the
// (lengths, keys, values_lengths, values_keys, values_values)
// representation, into a single such group, interleaving per example.
//
// Each input group i is described by kNumTensorsPerInput (= 5) tensors:
//   [i*5 + 0]: int32 in_lengths        — features per example.
//   [i*5 + 1]: int64 in_keys           — concatenated feature ids.
//   [i*5 + 2]: int32 in_values_lengths — map size per feature.
//   [i*5 + 3]: in_values_keys          — concatenated map keys (type K).
//   [i*5 + 4]: in_values_values        — concatenated map values (type V).
template <class Context>
class MergeMultiMapFeatureTensorsOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;

  template <class... Args>
  explicit MergeMultiMapFeatureTensorsOp(Args&&... args)
      : Operator<Context>(std::forward<Args>(args)...) {
    numInputs_ = InputSize() / kNumTensorsPerInput;
    // Per-group read cursors: one into keys/values_lengths, one shared
    // by the flat map keys and values (they advance in lockstep).
    inKeysOffset_.resize(numInputs_);
    inValuesValuesOffset_.resize(numInputs_);
  }
  virtual ~MergeMultiMapFeatureTensorsOp() noexcept {}

  bool RunOnDevice() override {
    // First-level dispatch: K = dtype of the first map-keys tensor.
    return DispatchHelper<
        TensorTypes<bool, int32_t, int64_t, float, double, std::string>>::
        call(this, Input(3));
  }

  template <typename K>
  bool DoRunWithType() {
    // Second-level dispatch: V = dtype of the first map-values tensor.
    return DispatchHelper<
        TensorTypes2<bool, int32_t, int64_t, float, double, std::string>,
        K>::call(this, Input(4));
  }

  template <typename K, typename V>
  bool DoRunWithType2() {
    int numExamples = Input(0).numel();
    // The merged output sizes are the sums of all groups' keys and
    // map-values tensors respectively.
    int totalNumFeatures = 0;
    int totalNumValues = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      totalNumFeatures += Input(kNumTensorsPerInput * inputIndex + 1).numel();
      totalNumValues += Input(kNumTensorsPerInput * inputIndex + 4).numel();
    }

    auto* outLengths = Output(0, {numExamples}, at::dtype<int32_t>());
    auto* outKeys = Output(1, {totalNumFeatures}, at::dtype<int64_t>());
    auto* outValuesLengths =
        Output(2, {totalNumFeatures}, at::dtype<int32_t>());
    auto* outValuesKeys = Output(3, {totalNumValues}, at::dtype<K>());
    auto* outValuesValues = Output(4, {totalNumValues}, at::dtype<V>());

    int32_t* outLengthsData = outLengths->template mutable_data<int32_t>();
    int64_t* outKeysData = outKeys->template mutable_data<int64_t>();
    int32_t* outValuesLengthsData =
        outValuesLengths->template mutable_data<int32_t>();
    K* outValuesKeysData = outValuesKeys->template mutable_data<K>();
    V* outValuesValuesData = outValuesValues->template mutable_data<V>();

    // Emit example-major, groups in order within each example; each
    // feature's map keys and values are copied as two parallel blocks
    // sharing the same offsets.
    int outKeysOffset = 0;
    int outValuesValuesOffset = 0;
    for (const auto inputIndex : c10::irange(numInputs_)) {
      inKeysOffset_[inputIndex] = 0;
      inValuesValuesOffset_[inputIndex] = 0;
    }
    for (const auto exampleIndex : c10::irange(numExamples)) {
      outLengthsData[exampleIndex] = 0;
      for (const auto inputIndex : c10::irange(numInputs_)) {
        const int32_t* inLengthsData =
            Input(kNumTensorsPerInput * inputIndex).template data<int32_t>();
        const int64_t* inKeysData = Input(kNumTensorsPerInput * inputIndex + 1)
                                        .template data<int64_t>();
        const int32_t* inValuesLengthsData =
            Input(kNumTensorsPerInput * inputIndex + 2)
                .template data<int32_t>();
        const auto& inValuesKeys = Input(kNumTensorsPerInput * inputIndex + 3);
        const auto& inValuesValues =
            Input(kNumTensorsPerInput * inputIndex + 4);
        outLengthsData[exampleIndex] += inLengthsData[exampleIndex];
        for (int featureIndex = 0; featureIndex < inLengthsData[exampleIndex];
             ++featureIndex) {
          outKeysData[outKeysOffset] = inKeysData[inKeysOffset_[inputIndex]];
          outValuesLengthsData[outKeysOffset] =
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          context_.CopyItemsSameDevice(
              inValuesKeys.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesKeys
                   .template data<K>()[inValuesValuesOffset_[inputIndex]],
              &outValuesKeysData[outValuesValuesOffset]);
          context_.CopyItemsSameDevice(
              inValuesValues.dtype(),
              inValuesLengthsData[inKeysOffset_[inputIndex]],
              &inValuesValues
                   .template data<V>()[inValuesValuesOffset_[inputIndex]],
              &outValuesValuesData[outValuesValuesOffset]);
          outValuesValuesOffset +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          inValuesValuesOffset_[inputIndex] +=
              inValuesLengthsData[inKeysOffset_[inputIndex]];
          ++outKeysOffset;
          // Advance the key cursor last: the values-lengths reads above
          // are indexed by the pre-increment position.
          ++inKeysOffset_[inputIndex];
        }
      }
    }

    return true;
  }

 private:
  const int kNumTensorsPerInput = 5;
  int numInputs_;
  std::vector<int> inKeysOffset_;
  std::vector<int> inValuesValuesOffset_;
};
|  |  | 
|  | template <class Context> | 
|  | class MergeMultiListOrMapFeatureTensorsGradientOp : public Operator<Context> { | 
|  | public: | 
|  | USE_OPERATOR_CONTEXT_FUNCTIONS; | 
|  |  | 
|  | template <class... Args> | 
|  | explicit MergeMultiListOrMapFeatureTensorsGradientOp(Args&&... args) | 
|  | : Operator<Context>(std::forward<Args>(args)...) { | 
|  | numFeatureInputs_ = (InputSize() - 1) / kNumTensorsPerInput; | 
|  | } | 
|  | virtual ~MergeMultiListOrMapFeatureTensorsGradientOp() noexcept {} | 
|  |  | 
|  | bool RunOnDevice() override { | 
|  | return DispatchHelper< | 
|  | TensorTypes<bool, int32_t, int64_t, float, double, std::string>>:: | 
|  | call(this, Input(InputSize() - 1)); | 
|  | } | 
|  |  | 
|  | template <typename T> | 
|  | bool DoRunWithType() { | 
|  | int numExamples = Input(0).numel(); | 
|  | std::vector<int> outValuesLengthOffset(numFeatureInputs_); | 
|  | std::vector<int> outValuesValuesOffset(numFeatureInputs_); | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | int inputNumValues = 0; | 
|  | auto& inValuesLength = Input(kNumTensorsPerInput * inputIndex + 1); | 
|  | const int32_t* inValuesLengthsData = | 
|  | inValuesLength.template data<int32_t>(); | 
|  | for (const auto valuesIndex : c10::irange(inValuesLength.numel())) { | 
|  | inputNumValues += inValuesLengthsData[valuesIndex]; | 
|  | } | 
|  | Output(inputIndex)->Resize(inputNumValues); | 
|  | } | 
|  |  | 
|  | const auto& inValuesValuesGrad = Input(InputSize() - 1); | 
|  | const T* inValuesValuesGradData = inValuesValuesGrad.template data<T>(); | 
|  |  | 
|  | int inValuesValuesOffset = 0; | 
|  | for (const auto exampleIndex : c10::irange(numExamples)) { | 
|  | for (const auto inputIndex : c10::irange(numFeatureInputs_)) { | 
|  | const int32_t* inLengthsData = | 
|  | Input(kNumTensorsPerInput * inputIndex).template data<int32_t>(); | 
|  | const int32_t* inValuesLengthsData = | 
|  | Input(kNumTensorsPerInput * inputIndex + 1) | 
|  | .template data<int32_t>(); | 
|  | int valuesLengthCopy = 0; | 
|  | for (int valuesLengthIndex = 0; | 
|  | valuesLengthIndex < inLengthsData[exampleIndex]; | 
|  | ++valuesLengthIndex) { | 
|  | valuesLengthCopy += inValuesLengthsData | 
|  | [outValuesLengthOffset[inputIndex] + valuesLengthIndex]; | 
|  | } | 
|  | if (valuesLengthCopy > 0) { | 
|  | T* outFeatureValues = Output(inputIndex)->template mutable_data<T>(); | 
|  | context_.CopyItemsSameDevice( | 
|  | inValuesValuesGrad.dtype(), | 
|  | valuesLengthCopy, | 
|  | &inValuesValuesGradData[inValuesValuesOffset], | 
|  | &outFeatureValues[outValuesValuesOffset[inputIndex]]); | 
|  | } | 
|  | outValuesLengthOffset[inputIndex] += inLengthsData[exampleIndex]; | 
|  | outValuesValuesOffset[inputIndex] += valuesLengthCopy; | 
|  | inValuesValuesOffset += valuesLengthCopy; | 
|  | } | 
|  | } | 
|  | return true; | 
|  | } | 
|  |  | 
|  | private: | 
|  | int kNumTensorsPerInput = 2; | 
|  | int numFeatureInputs_; | 
|  | }; | 
|  |  | 
|  | } // namespace caffe2 | 
|  |  | 
|  | #endif // CAFFE2_OPERATORS_FEATURE_MAPS_OPS_H_ |