| #include "conv_dnnlowp_op.h" | 
 |  | 
 | // #define DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 | #include <chrono> | 
 | #endif | 
 |  | 
 | #ifdef _OPENMP | 
 | #include <omp.h> | 
 | #endif | 
 |  | 
 | #include "caffe2/core/tensor_int8.h" | 
 | #include "caffe2/operators/conv_op_impl.h" | 
 | #include "caffe2/utils/cpuid.h" | 
 |  | 
 | #include <fbgemm/src/RefImplementations.h> | 
 |  | 
 | #include "dnnlowp_op.h" | 
 | #include "dnnlowp_partition.h" | 
 | #include "fbgemm_pack_op.h" | 
 | #include "im2col_dnnlowp.h" | 
 | #include "mmio.h" | 
 |  | 
 | C10_DEFINE_bool( | 
 |     caffe2_dnnlowp_shared_int32_buffer, | 
 |     false, | 
 |     "Share intermediate int32 buffer across DNNLOWP Conv ops"); | 
 |  | 
 | C10_DEFINE_bool( | 
 |     caffe2_dnnlowp_dump_tensors, | 
 |     false, | 
 |     "Dump quantized input and weight tensors used in Conv and FC operators " | 
 |     "during the first iteration"); | 
 |  | 
 | C10_DECLARE_bool(caffe2_dnnlowp_force_slow_path); | 
 |  | 
 | namespace caffe2 { | 
 |  | 
 | using namespace std; | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | ConvDNNLowPOp<T, ReluFused>::ConvDNNLowPOp( | 
 |     const OperatorDef& operator_def, | 
 |     Workspace* ws) | 
 |     : BaseType(operator_def, ws), | 
 |       column_offsets_(make_shared<vector<int32_t>>()), | 
 |       b_quantized_(make_shared<vector<int32_t>>()) { | 
 |   in_qparams_.resize(1); | 
 |  | 
  // Create the shared buffer mutex in the constructor
  // to avoid a race condition in DAGNet.
 |   if (FLAGS_caffe2_force_shared_col_buffer || shared_buffer_) { | 
 |     createSharedBuffer<CPUContext>(ws_); | 
 |   } | 
 |  | 
 |   if (FLAGS_caffe2_dnnlowp_shared_int32_buffer) { | 
 |     this->CreateSharedInt32Buffer_(); | 
 |   } | 
 |  | 
 |   quantize_groupwise_ = | 
 |       this->template GetSingleArgument<bool>("quantize_groupwise", false); | 
 | } | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | ConvDNNLowPOp<T, ReluFused>::~ConvDNNLowPOp() {} | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | dnnlowp::TensorQuantizationParams& | 
 | ConvDNNLowPOp<T, ReluFused>::FilterQuantizationParams(int group_id) { | 
 |   return filter_qparams_[quantize_groupwise_ ? group_id : 0]; | 
 | } | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | dnnlowp::RequantizationParams& | 
 | ConvDNNLowPOp<T, ReluFused>::RequantizationParams(int group_id) { | 
 |   return requantization_params_[quantize_groupwise_ ? group_id : 0]; | 
 | } | 
 |  | 
 | // FIXME : code duplication with | 
 | // ConvDNNLowPPackWeightOp::TakeDepthWise3x3FastPath_ | 
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3FastPath_() { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   return this->order_ == StorageOrder::NHWC && is_same<T, uint8_t>::value && | 
 |       !Acc16() && group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 && | 
 |       this->kernel_.size() == 2 && kernel_h() == 3 && kernel_w() == 3 && | 
 |       stride_h() == stride_w() && (stride_h() == 1 || stride_h() == 2) && | 
 |       dilation_h() == 1 && dilation_w() == 1 && pad_t() == 1 && pad_b() == 1 && | 
 |       pad_l() == 1 && pad_r() == 1 && GetCpuId().avx2(); | 
 | } | 
 |  | 
 | // FIXME : code duplication with | 
 | // ConvDNNLowPPackWeightOp::TakeDepthWise3x3x3FastPath_ | 
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::TakeDepthWise3x3x3FastPath_() { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   bool ret = this->order_ == StorageOrder::NHWC && is_same<T, uint8_t>::value && | 
 |       !Acc16() && group_ == X.dim32(X.dim() - 1) && group_ % 8 == 0 && | 
 |       this->kernel_.size() == 3 && this->kernel_[0] == 3 && | 
 |       this->kernel_[1] == 3 && this->kernel_[2] == 3 && | 
 |       (this->stride_[0] == 1 || this->stride_[0] == 2) && | 
 |       (this->stride_[1] == 1 || this->stride_[1] == 2) && | 
 |       (this->stride_[2] == 1 || this->stride_[2] == 2) && | 
 |       this->dilation_[0] == 1 && this->dilation_[1] == 1 && | 
 |       this->dilation_[2] == 1 && | 
 |       accumulate( | 
 |           this->pads_.begin(), this->pads_.end(), 1, multiplies<int>()) == 1 && | 
 |       GetCpuId().avx2(); | 
 |   return ret; | 
 | } | 
 |  | 
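// Build the fbgemm conv descriptor (2D, NHWC) consumed by the grouped-conv
// fast path; GetConv3DParam_ below is the 3D analogue. Both assume NHWC
// layout and a 2D or 3D kernel, enforced by the CAFFE_ENFORCEs inside.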
 | template <typename T, bool ReluFused> | 
 | fbgemm::conv_param_t<> ConvDNNLowPOp<T, ReluFused>::GetConvParam_() { | 
 |   CAFFE_ENFORCE_EQ(this->kernel_.size(), 2); | 
 |   CAFFE_ENFORCE_EQ(this->order_, StorageOrder::NHWC); | 
 |  | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int N = X.dim32(0), C = X.dim32(X.dim() - 1); | 
 |   const int M = filter.dim32(0); | 
 |  | 
 |   return fbgemm::conv_param_t<>( | 
 |       N, | 
 |       C, | 
 |       M, | 
 |       {X.dim32(1), X.dim32(2)}, | 
 |       group_, | 
 |       {this->kernel_[0], this->kernel_[1]}, | 
 |       {this->stride_[0], this->stride_[1]}, | 
 |       {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]}); | 
 | } | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | fbgemm::conv_param_t<3> ConvDNNLowPOp<T, ReluFused>::GetConv3DParam_() { | 
 |   CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |   CAFFE_ENFORCE_EQ(this->order_, StorageOrder::NHWC); | 
 |  | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int N = X.dim32(0), C = X.dim32(X.dim() - 1); | 
 |   const int M = filter.dim32(0); | 
 |  | 
 |   return fbgemm::conv_param_t<3>( | 
 |       N, | 
 |       C, | 
 |       M, | 
 |       {X.dim32(1), X.dim32(2), X.dim32(3)}, | 
 |       group_, | 
 |       {this->kernel_[0], this->kernel_[1], this->kernel_[2]}, | 
 |       {this->stride_[0], this->stride_[1], this->stride_[2]}, | 
 |       {this->pads_[0], | 
 |        this->pads_[1], | 
 |        this->pads_[2], | 
 |        this->pads_[3], | 
 |        this->pads_[4], | 
 |        this->pads_[5]}); | 
 | } | 
 |  | 
 | // FIXME : code duplication with | 
 | // ConvDNNLowPPackWeightOp::TakeGConvFastPath_ | 
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::TakeGConvFastPath_() { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   if (this->order_ != StorageOrder::NHWC || !is_same<T, uint8_t>::value || | 
 |       !X.template IsType<T>() || | 
 |       (this->kernel_.size() != 2 && this->kernel_.size() != 3)) { | 
 |     return false; | 
 |   } | 
 |  | 
 |   if (this->kernel_.size() == 2) { | 
 |     return fbgemm::fbgemmOptimizedGConv(GetConvParam_()); | 
 |   } else { | 
 |     CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |     return fbgemm::fbgemmOptimizedGConv(GetConv3DParam_()); | 
 |   } | 
 | } | 
 |  | 
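// Reduction dimension of the implicit GEMM: (C / group_) * prod(kernel dims),
// i.e., the number of input elements each output element reduces over.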
 | template <typename T, bool ReluFused> | 
 | int ConvDNNLowPOp<T, ReluFused>::KernelDim_() { | 
 |   int kernel_dim; | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   const auto& filter = InputTensorCPU_(FILTER); | 
 |  | 
 |   int C; | 
 |   int filter_offset; | 
 |   if (ConvPoolOpBase<CPUContext>::order_ == StorageOrder::NCHW) { | 
 |     C = X.dim32(1); | 
 |     filter_offset = 2; | 
 |   } else { | 
 |     C = X.dim32(X.dim() - 1); | 
 |     filter_offset = 1; | 
 |   } | 
 |  | 
 |   int kernel_dims_size = 1; | 
 |   for (int i = 0; i < this->kernel_.size(); ++i) { | 
 |     CAFFE_ENFORCE_EQ(filter.dim32(i + filter_offset), kernel_[i]); | 
 |     kernel_dims_size *= kernel_[i]; | 
 |   } | 
 |   kernel_dim = C / group_ * kernel_dims_size; | 
 |  | 
 |   return kernel_dim; | 
 | } | 
 |  | 
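// True when the convolution degenerates into a plain GEMM: all kernel sizes,
// strides, and dilations are 1 and there is no padding.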
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::IsConvGEMM_() const { | 
 |   return accumulate( | 
 |              this->kernel_.begin(), | 
 |              this->kernel_.end(), | 
 |              1, | 
 |              multiplies<int>()) == 1 && | 
 |       accumulate( | 
 |           this->stride_.begin(), this->stride_.end(), 1, multiplies<int>()) == | 
 |       1 && | 
 |       accumulate( | 
 |           this->dilation_.begin(), | 
 |           this->dilation_.end(), | 
 |           1, | 
 |           multiplies<int>()) == 1 && | 
 |       accumulate(this->pads_.begin(), this->pads_.end(), 0) == 0; | 
 | } | 
 |  | 
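// In NHWC, an explicit im2col can be skipped when a specialized fbgemm kernel
// consumes the activations directly (depthwise or grouped fast paths), when
// the packed-GEMM path can fuse im2col (no dilation), or when the conv is
// already a plain GEMM.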
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::NoIm2ColNHWC_() { | 
 |   if (TakeDepthWise3x3FastPath_() || TakeDepthWise3x3x3FastPath_() || | 
 |       TakeGConvFastPath_()) { | 
 |     return true; | 
 |   } | 
 |  | 
 |   if (Wq_packed_ && | 
 |       accumulate( | 
 |           this->dilation_.begin(), | 
 |           this->dilation_.end(), | 
 |           1, | 
 |           multiplies<int>()) == 1) { | 
 |     // im2col fusion | 
 |     return true; | 
 |   } | 
 |  | 
 |   return IsConvGEMM_(); | 
 | } | 
 |  | 
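// Precompute per-output-channel offsets of the quantized weights (via
// ComputeColumnOffsets), used to correct for the activation zero point; they
// are stored in row_offsets_ for NCHW and in column_offsets_ for NHWC. For
// NHWC with a pre-quantized input they are folded into the bias instead, so
// this function returns early.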
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::PreComputeRowColumnOffsets_() { | 
 |   if (this->order_ == StorageOrder::NHWC && | 
 |       this->template InputIsType<int8::Int8TensorCPU>(INPUT)) { | 
    // The input is pre-quantized (no dynamic quantization), so we leave
    // column_offsets_ empty and fold it into the bias in QuantizeBias_.
 |     return; | 
 |   } | 
 |  | 
 |   const auto& filter = InputTensorCPU_(FILTER); | 
 |   int kernel_dim = KernelDim_(); | 
 |   int M = filter.dim32(0); | 
 |  | 
 |   // Pre-compute row_offset / column_offset | 
 |   vector<int>& offsets = | 
 |       this->order_ == StorageOrder::NCHW ? row_offsets_ : *column_offsets_; | 
 |  | 
 |   if (offsets.empty()) { | 
 |     if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |       const auto& packed_filter = | 
 |           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |       column_offsets_ = packed_filter.column_offsets; | 
 |     } else { | 
 |       ComputeColumnOffsets<T_signed>( | 
 |           kernel_dim, M, W_quantized_.data(), filter_qparams_, offsets); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
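// Quantize the bias with scale = input_scale * filter_scale (per group when
// quantizing group-wise) and zero point 0, re-quantizing whenever the input
// quantization parameters change. When column_offsets_ is empty (NHWC with a
// pre-quantized input), the column-offset correction is folded into the
// quantized bias here as well.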
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::QuantizeBias_() { | 
 |   using namespace dnnlowp; | 
 |  | 
 |   const auto& filter = InputTensorCPU_(FILTER); | 
 |   int M = filter.dim32(0); | 
 |  | 
 |   bool has_packed_bias = | 
 |       this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER) && | 
 |       this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER).bias.get(); | 
 |   bool has_bias = InputSize() == 3 || has_packed_bias; | 
 |  | 
 |   // Quantize bias | 
 |   if (has_bias && | 
 |       (!b_quantized_data_ || | 
 |        in_qparams_[INPUT].scale != in_qparams_scale_old_ || | 
 |        in_qparams_[INPUT].zero_point != in_qparams_zero_point_old_)) { | 
 |     if (has_packed_bias) { | 
 |       const auto& packed_filter = | 
 |           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |       b_quantized_data_ = packed_filter.bias->data(); | 
 |     } else { | 
 |       const auto& bias = InputTensorCPU_(BIAS); | 
 |       if (this->template InputIsType<int8::Int8TensorCPU>(BIAS)) { | 
 |         TensorQuantizationParams bias_qparams; | 
 |         bias_qparams.scale = | 
 |             this->template Input<int8::Int8TensorCPU>(BIAS).scale; | 
 |         bias_qparams.zero_point = | 
 |             this->template Input<int8::Int8TensorCPU>(BIAS).zero_point; | 
 |         if (InputTensorCPU_(INPUT).dim32(0) > 0) { | 
 |           CAFFE_ENFORCE_LE( | 
 |               std::abs( | 
 |                   bias_qparams.scale - | 
 |                   in_qparams_[INPUT].scale * FilterQuantizationParams(0).scale), | 
 |               1e-4); | 
 |         } | 
 |         CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0); | 
 |         b_quantized_data_ = bias.template data<int32_t>(); | 
 |       } else { | 
 |         const float* b_data = bias.template data<float>(); | 
 |         b_quantized_->resize(bias.numel()); | 
 |         for (int g = 0; g < filter_qparams_.size(); ++g) { | 
 |           int i_begin = g * (M / filter_qparams_.size()); | 
 |           int i_end = i_begin + (M / filter_qparams_.size()); | 
 |           for (int i = i_begin; i < i_end; ++i) { | 
 |             (*b_quantized_)[i] = fbgemm::Quantize<int32_t>( | 
 |                 b_data[i], | 
 |                 0, | 
 |                 in_qparams_[INPUT].scale * FilterQuantizationParams(g).scale, | 
 |                 32, | 
 |                 true /* signed */); | 
 |           } | 
 |         } | 
 |         b_quantized_data_ = b_quantized_->data(); | 
 |       } | 
 |     } | 
 |     in_qparams_scale_old_ = in_qparams_[INPUT].scale; | 
 |     in_qparams_zero_point_old_ = in_qparams_[INPUT].zero_point; | 
 |  | 
 |     CAFFE_ENFORCE(b_quantized_data_); | 
 |  | 
    // If column_offsets_ is empty even though we need column offsets (the
    // input has a nonzero zero point, i.e., asymmetric quantization), the
    // offsets are meant to be folded into the bias here.
 |     if (this->order_ == StorageOrder::NHWC && in_qparams_[INPUT].zero_point && | 
 |         column_offsets_->empty()) { | 
 |       if (b_quantized_->empty()) { | 
        // When b_quantized_data_ points to a pre-packed bias or an
        // Int8TensorCPU, we can't modify it in place, so copy it into the
        // internal b_quantized_ vector first.
 |         b_quantized_->assign(b_quantized_data_, b_quantized_data_ + M); | 
 |         b_quantized_data_ = b_quantized_->data(); | 
 |       } | 
 |       vector<int32_t>* column_offset_ptr; | 
 |       vector<int32_t> column_offset_temp; | 
 |       if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |         const auto& packed_filter = | 
 |             this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |         column_offset_ptr = packed_filter.column_offsets.get(); | 
 |       } else { | 
 |         vector<TensorQuantizationParams> temp_qparams; | 
 |         temp_qparams.push_back(in_qparams_[1]); | 
 |         column_offset_temp.resize(M); | 
 |         ComputeColumnOffsets<T_signed>( | 
 |             KernelDim_(), | 
 |             M, | 
 |             W_quantized_.data(), | 
 |             filter_qparams_, | 
 |             column_offset_temp); | 
 |         column_offset_ptr = &column_offset_temp; | 
 |       } | 
 |       for (int i = 0; i < M; ++i) { | 
 |         (*b_quantized_)[i] -= | 
 |             in_qparams_[0].zero_point * (*column_offset_ptr)[i]; | 
 |       } | 
 |     } | 
 |   } | 
 |  | 
 |   if (!has_bias && this->order_ == StorageOrder::NHWC && | 
 |       in_qparams_[INPUT].zero_point && column_offsets_->empty() && | 
 |       !b_quantized_data_) { | 
    // No bias input, so create one filled with the column-offset correction.
 |     b_quantized_->resize(M, 0); | 
 |     b_quantized_data_ = b_quantized_->data(); | 
 |  | 
 |     vector<int32_t>* column_offset_ptr; | 
 |     vector<int32_t> column_offset_temp; | 
 |     if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |       const auto& packed_filter = | 
 |           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |       column_offset_ptr = packed_filter.column_offsets.get(); | 
 |     } else { | 
 |       vector<TensorQuantizationParams> temp_qparams; | 
 |       temp_qparams.push_back(in_qparams_[1]); | 
 |       column_offset_temp.resize(M); | 
 |       ComputeColumnOffsets<T_signed>( | 
 |           KernelDim_(), | 
 |           M, | 
 |           W_quantized_.data(), | 
 |           filter_qparams_, | 
 |           column_offset_temp); | 
 |       column_offset_ptr = &column_offset_temp; | 
 |     } | 
 |     for (int i = 0; i < M; ++i) { | 
 |       (*b_quantized_)[i] -= in_qparams_[0].zero_point * (*column_offset_ptr)[i]; | 
 |     } | 
 |   } | 
 | } | 
 |  | 
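// Quantize and pack the weight if not done already, choosing among the
// depthwise 3x3 / 3x3x3, grouped-conv, and general packed-GEMM fbgemm
// layouts. A pre-packed Int8ConvDNNLowPPackedWeightBlob is reused as-is;
// otherwise the filter is quantized here, per group when quantize_groupwise_
// is set.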
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::QuantizeWeight_() { | 
 |   using namespace dnnlowp; | 
 |  | 
 |   // Quantize W if not done already | 
 |   int kernel_dim = KernelDim_(); | 
 |   const auto& filter = InputTensorCPU_(FILTER); | 
 |   int M = filter.dim32(0); | 
 |  | 
 |   bool packW = ConvPoolOpBase<CPUContext>::order_ == StorageOrder::NHWC && | 
 |       !Acc16() && is_same<T, uint8_t>::value && GetCpuId().avx2() && | 
 |       !FLAGS_caffe2_dnnlowp_force_slow_path; | 
 |  | 
 |   bool depthwise_3x3_fast_path = false, depthwise_3x3x3_fast_path = false, | 
 |        gconv_fast_path = false; | 
 |   if (TakeDepthWise3x3FastPath_()) { | 
 |     depthwise_3x3_fast_path = true; | 
 |     packW = false; | 
 |   } else if (TakeDepthWise3x3x3FastPath_()) { | 
 |     depthwise_3x3x3_fast_path = true; | 
 |     packW = false; | 
 |   } else if (TakeGConvFastPath_()) { | 
 |     gconv_fast_path = true; | 
 |     packW = false; | 
 |   } | 
 |  | 
 |   if ((depthwise_3x3_fast_path && !Wq_depthwise_packed_) || | 
 |       (depthwise_3x3x3_fast_path && !Wq_depthwise_packed_) || | 
 |       (gconv_fast_path && !Wq_gconv_packed_ && !Wq_gconv3d_packed_) || | 
 |       (packW && !Wq_packed_) || (!packW && W_quantized_.empty())) { | 
 |     if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |       CAFFE_ENFORCE_EQ( | 
 |           ConvPoolOpBase<CPUContext>::order_, | 
 |           StorageOrder::NHWC, | 
 |           "Pre-packed weight only works with NHWC layout"); | 
 |  | 
 |       const auto& packed_filter = | 
 |           this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |       filter_qparams_ = packed_filter.qparams; | 
 |     } else { | 
 |       filter_qparams_.resize(quantize_groupwise_ ? group_ : 1); | 
 |       QuantizeWeight<T>( | 
 |           InputBlob(FILTER), | 
 |           kernel_dim, | 
 |           M, | 
 |           filter_qparams_, | 
 |           W_quantized_, | 
 |           qfactory_.get()); | 
 |  | 
 |       if (this->template InputIsType<int8::Int8TensorCPU>(FILTER) && | 
 |           quantize_groupwise_) { | 
 |         static int log_occurences = 0; | 
 |         if (log_occurences < 32) { | 
 |           ++log_occurences; | 
 |           LOG(WARNING) << "Cannot do group-wise quantization for " | 
 |                           "pre-quantized weight " | 
 |                        << this->debug_def().input(FILTER); | 
 |         } | 
 |       } | 
 |     } | 
 |  | 
 |     filter_zero_points_.resize(filter_qparams_.size()); | 
 |     requantization_params_.resize(filter_qparams_.size()); | 
 |     requantization_multipliers_.resize(filter_qparams_.size()); | 
 |     for (int i = 0; i < filter_qparams_.size(); ++i) { | 
 |       filter_zero_points_[i] = filter_qparams_[i].zero_point; | 
 |     } | 
 |  | 
 |     if (depthwise_3x3_fast_path) { | 
 |       if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |         const auto& packed_filter = | 
 |             this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |         Wq_depthwise_packed_ = packed_filter.W_depthwise; | 
 |       } else { | 
 |         Wq_depthwise_packed_.reset(new fbgemm::PackedDepthWiseConvMatrix( | 
 |             group_, | 
 |             3 * 3, | 
 |             reinterpret_cast<const int8_t*>(W_quantized_.data()))); | 
 |       } | 
 |     } else if (depthwise_3x3x3_fast_path) { | 
 |       if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |         const auto& packed_filter = | 
 |             this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |         Wq_depthwise_packed_ = packed_filter.W_depthwise; | 
 |       } else { | 
 |         Wq_depthwise_packed_.reset(new fbgemm::PackedDepthWiseConvMatrix( | 
 |             group_, | 
 |             3 * 3 * 3, | 
 |             reinterpret_cast<const int8_t*>(W_quantized_.data()))); | 
 |       } | 
 |     } else if (gconv_fast_path) { | 
 |       if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |         const auto& packed_filter = | 
 |             this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |         if (packed_filter.W_gconv) { | 
 |           CAFFE_ENFORCE_EQ(this->kernel_.size(), 2); | 
 |           Wq_gconv_packed_ = packed_filter.W_gconv; | 
 |         } else { | 
 |           CAFFE_ENFORCE(packed_filter.W_gconv3d); | 
 |           CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |           Wq_gconv3d_packed_ = packed_filter.W_gconv3d; | 
 |         } | 
 |       } else { | 
 |         if (this->kernel_.size() == 2) { | 
 |           fbgemm::conv_param_t<> conv_p(GetConvParam_()); | 
 |           Wq_gconv_packed_.reset(new fbgemm::PackWeightMatrixForGConv<int8_t>( | 
 |               fbgemm::matrix_op_t::Transpose, | 
 |               conv_p, | 
 |               reinterpret_cast<const int8_t*>(W_quantized_.data()))); | 
 |         } else { | 
 |           CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |           fbgemm::conv_param_t<3> conv_p(GetConv3DParam_()); | 
 |           Wq_gconv3d_packed_.reset( | 
 |               new fbgemm::PackWeightMatrixForGConv<int8_t, int32_t, 3>( | 
 |                   fbgemm::matrix_op_t::Transpose, | 
 |                   conv_p, | 
 |                   reinterpret_cast<const int8_t*>(W_quantized_.data()))); | 
 |         } | 
 |       } | 
 |     } else if (packW) { | 
 |       if (this->template InputIsType<Int8ConvDNNLowPPackedWeightBlob>(FILTER)) { | 
 |         const auto& packed_filter = | 
 |             this->template Input<Int8ConvDNNLowPPackedWeightBlob>(FILTER); | 
 |         Wq_packed_ = packed_filter.W; | 
 |       } else { | 
 |         // fast path using fbgemm | 
 |         Wq_packed_.reset(new fbgemm::PackBMatrix<int8_t>( | 
 |             fbgemm::matrix_op_t::Transpose, | 
 |             group_ * kernel_dim, | 
 |             M / group_, | 
 |             reinterpret_cast<const int8_t*>(W_quantized_.data()), | 
 |             kernel_dim, // ld | 
 |             nullptr, // pmat | 
 |             group_)); | 
 |       } | 
 |     } else { | 
 |       string reason; | 
 |       if (ConvPoolOpBase<CPUContext>::order_ != StorageOrder::NHWC) { | 
 |         reason = "fbgemm only supports NHWC layout"; | 
 |       } else if (!is_same<T, uint8_t>::value) { | 
 |         reason = "fbgemm only supports 8-bit integers"; | 
 |       } else if (!GetCpuId().avx2()) { | 
 |         reason = "fbgemm only supports AVX2+"; | 
 |       } else if (Acc16()) { | 
 |         reason = ""; | 
 |       } else if (FLAGS_caffe2_dnnlowp_force_slow_path) { | 
 |         reason = "slow path enforced"; | 
 |       } else { | 
 |         assert(false); | 
 |       } | 
 |       if (!reason.empty()) { | 
 |         static int log_occurences = 0; | 
 |         if (log_occurences < 32) { | 
 |           ++log_occurences; | 
 |           LOG(WARNING) << "Conv with weight " << this->debug_def().input(FILTER) | 
 |                        << " falls back to slow path because " << reason; | 
 |         } | 
 |       } | 
 |     } | 
 |   } | 
 | } | 
 |  | 
/**
 * Parse operator arguments, quantize the weight and bias, and choose the
 * output quantization and requantization parameters.
 *
 * @return false if something goes wrong
 */
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::GetQuantizationParameters_() { | 
 |   using namespace dnnlowp; | 
 |  | 
 |   if (!this->arguments_parsed_) { | 
 |     bool dequantize_output; | 
 |     ParseDNNLowPOperatorArguments( | 
 |         this, &dequantize_output, &measure_quantization_error_, &followed_by_); | 
 |     CAFFE_ENFORCE_EQ( | 
 |         dequantize_output, | 
 |         false, | 
 |         "Conv DNNLOWP operators don't support dequantize_output"); | 
 |  | 
 |     if (ReluFused) { | 
      // The Relu is fused rather than actually following this op, but we set
      // followed_by_ so that the quantization error is measured correctly in
      // this->MeasureQuantizationError_().
 |       followed_by_ = "Relu"; | 
 |       AdjustOutputTensorQuantizationParamsWithFollowedBy(this, followed_by_); | 
 |     } | 
 |     this->arguments_parsed_ = true; | 
 |   } | 
 |  | 
 |   // Choose quantization for X | 
 |   in_qparams_[INPUT] = | 
 |       GetInputTensorQuantizationParamsOf(this, INPUT, qfactory_.get()); | 
 |  | 
 |   QuantizeWeight_(); | 
 |   PreComputeRowColumnOffsets_(); | 
 |   QuantizeBias_(); | 
 |  | 
 |   if (Wq_packed_ && !FLAGS_caffe2_dnnlowp_dump_tensors) { | 
 |     // From here, W_quantized_ is not used anymore when we have Wq_packed_ | 
 |     vector<T_signed>().swap(W_quantized_); | 
 |   } | 
 |  | 
 |   bool fp32_executed = false; | 
 |   if (HasStaticQuantization(this)) { | 
 |     out_qparams_ = GetStaticQuantizationParamsOf(this, 0); | 
 |   } else { | 
 |     // If quantization parameters are not chosen beforehand, run reference | 
 |     // Conv op in fp32 to choose quantization for Y. | 
 |     Fp32Op_()->DequantizeInput(); | 
 |     Fp32Op_()->Get()->RunOnDevice(); | 
 |     out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get()); | 
 |     fp32_executed = true; | 
 |   } | 
 |  | 
 |   for (int g = 0; g < filter_qparams_.size(); ++g) { | 
 |     float real_multiplier = in_qparams_[INPUT].scale * | 
 |         FilterQuantizationParams(g).scale / out_qparams_.scale; | 
 |     requantization_params_[g] = qfactory_->ChooseRequantizationMultiplier( | 
 |         real_multiplier, out_qparams_); | 
 |     requantization_multipliers_[g] = requantization_params_[g].real_multiplier; | 
 |   } | 
 |  | 
 |   if (measure_quantization_error_ && Fp32Op_() && !fp32_executed) { | 
    // To measure quantization error, run the reference fp32 implementation.
 |     Fp32Op_()->DequantizeInput(); | 
 |     Fp32Op_()->Get()->RunOnDevice(); | 
 |   } | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
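// Epilogue of the NCHW reference path for one (image, group) pair: Y_int32
// holds the raw accumulation sum_k w_q[k] * x_q[k]; the per-output-channel
// weight-sum term (row_offsets_), the per-output-position activation-sum term
// (column_offsets, computed below), and the quantized bias are applied to
// account for the input and filter zero points, then the result is
// requantized to out_qparams_ (clamped at 0 first when ReluFused).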
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNCHW_( | 
 |     const T* col_buffer_data, | 
 |     int32_t* Y_int32, | 
 |     T* Y_data, | 
 |     size_t i_offset, | 
 |     int group_id) { | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int M = filter.dim32(0); | 
 |   int kernel_dim = KernelDim_(); | 
 |  | 
 |   Tensor* Y = OutputTensorCPU_(0); | 
 |   const int Y_HxW = this->GetDimsSize(*Y); | 
 |  | 
  // See batch_matmul_dnnlowp_op.cc for why we compute column_offsets,
  // row_offset, and const_offset in this way.
 |   int tid = dnnlowp_get_thread_num(); | 
 |   int32_t* column_offsets = column_offsets_->data() + tid * Y_HxW; | 
 |  | 
 |   const dnnlowp::TensorQuantizationParams& filter_qparams = | 
 |       FilterQuantizationParams(group_id); | 
 |   for (int j = 0; j < Y_HxW; ++j) { | 
 |     int sum = 0; | 
 |     for (int k = 0; k < kernel_dim; ++k) { | 
 |       sum += col_buffer_data[k * Y_HxW + j]; | 
 |     } | 
 |     column_offsets[j] = sum * filter_qparams.zero_point; | 
 |   } | 
 |  | 
 |   for (int i = 0; i < M / group_; ++i) { | 
 |     int32_t row_offset = row_offsets_[i_offset + i]; | 
 |     row_offset *= -in_qparams_[INPUT].zero_point; | 
 |     if (b_quantized_data_) { | 
 |       row_offset += b_quantized_data_[i_offset + i]; | 
 |     } | 
 |     for (int j = 0; j < Y_HxW; ++j) { | 
 |       int32_t raw = Y_int32[i * Y_HxW + j] + row_offset - column_offsets[j]; | 
 |       if (ReluFused) { | 
 |         raw = std::max(0, raw); | 
 |       } | 
 |       Y_data[i * Y_HxW + j] = | 
 |           fbgemm::Requantize<T>(raw, RequantizationParams(group_id)); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
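// Slow NCHW path: per-image im2col into a per-thread col buffer, a reference
// int32 GEMM against W_quantized_, then RunOnDeviceEpilogueNCHW_ per group.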
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNCHW() { | 
 |   VLOG(2) << "Running DNNLOWP Conv"; | 
 |  | 
 |   using namespace dnnlowp; | 
 |  | 
 |   // Get quantization parameters | 
 |   if (!GetQuantizationParameters_()) { | 
 |     return false; | 
 |   } | 
 |  | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int N = X.dim32(0), C = X.dim32(1); | 
 |   CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); | 
 |   const int M = filter.dim32(0); | 
 |   CAFFE_ENFORCE_EQ( | 
 |       C, | 
 |       filter.dim32(1) * group_, | 
 |       "Convolution op: input channels does not match: # of input channels ", | 
 |       C, | 
 |       " is not equal to kernel channels * group:", | 
 |       filter.dim32(1), | 
 |       "*", | 
 |       group_); | 
 |   CAFFE_ENFORCE_EQ( | 
 |       M % group_, | 
 |       0, | 
 |       "The number of output channels is not divisible by group."); | 
 |  | 
 |   auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); | 
 |   Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); | 
 |  | 
 |   const vector<int> input_dims = GetDims(X); | 
 |   const vector<int> output_dims = GetDims(*Y); | 
 |   const int X_HxW = this->GetDimsSize(X); | 
 |   const int Y_HxW = this->GetDimsSize(*Y); | 
 |  | 
 |   // The dimension of each kernel | 
 |   const int kernel_dim = KernelDim_(); | 
 |  | 
 |   vector<int> img_shape; | 
 |   img_shape.assign(X.sizes().begin() + 1, X.sizes().end()); | 
 |  | 
 |   vector<int> buffer_shape; | 
 |   buffer_shape.push_back(kernel_dim); | 
 |   buffer_shape.insert( | 
 |       buffer_shape.end(), output_dims.begin(), output_dims.end()); | 
 |   buffer_shape.insert(buffer_shape.begin(), dnnlowp_get_max_threads()); | 
 |  | 
 |   if (this->kernel_.size() != 2) { | 
 |     SetDeviceTensor(img_shape, &img_shape_device_); | 
 |     SetDeviceTensor(buffer_shape, &col_buffer_shape_device_); | 
 |   } | 
 |  | 
 |   const int col_buffer_size = kernel_dim * Y_HxW; | 
 |  | 
  // The offset corresponding to a single input image.
 |   const int input_offset = C / group_ * X_HxW; | 
 |  | 
  // The col buffer is also stored in CHW order: kernel_dim first, then the
  // output height and width.
 |   const T* Xdata = X.template data<T>(); | 
 |  | 
  // We must not call mutable_data inside the OpenMP region below.
 |   T* Y_data_T = Y->template mutable_data<T>(); | 
 |   column_offsets_->resize(Y_HxW * dnnlowp_get_max_threads()); | 
 |  | 
 |   auto f = [&](Tensor* col_buffer, vector<int32_t>* Y_int32) { | 
 |     col_buffer->Resize(buffer_shape); | 
 |     vector<int> buffer_shape_per_thread( | 
 |         buffer_shape.begin() + 1, buffer_shape.end()); | 
 |     T* col_buffer_data = col_buffer->template mutable_data<T>(); | 
 |  | 
 |     Y_int32->resize(M * Y_HxW * dnnlowp_get_max_threads()); | 
 |  | 
 |     // Im2Col, followed by gemm. | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel for | 
 | #endif | 
 |     for (int image_id = 0; image_id < N; ++image_id) { | 
 |       int tid = dnnlowp_get_thread_num(); | 
 |       for (int group_id = 0; group_id < group_; ++group_id) { | 
 |         if (this->kernel_.size() == 2) { | 
 |           math::Im2ColNCHW<T>( | 
 |               C / group_, | 
 |               input_dims[0], | 
 |               input_dims[1], | 
 |               kernel_h(), | 
 |               kernel_w(), | 
 |               dilation_h(), | 
 |               dilation_w(), | 
 |               pad_t(), | 
 |               pad_l(), | 
 |               pad_b(), | 
 |               pad_r(), | 
 |               stride_h(), | 
 |               stride_w(), | 
 |               Xdata + (group_ * image_id + group_id) * input_offset, | 
 |               col_buffer_data + tid * col_buffer_size, | 
 |               &context_, | 
 |               in_qparams_[INPUT].zero_point); | 
 |         } else { | 
 |           math::Im2ColNdNCHW<T>( | 
 |               this->kernel_.size(), | 
 |               C * X_HxW, | 
 |               col_buffer_size, | 
 |               img_shape.data(), | 
 |               buffer_shape_per_thread.data(), | 
 |               this->kernel_.data(), | 
 |               this->stride_.data(), | 
 |               this->dilation_.data(), | 
 |               this->pads_.data(), | 
 |               Xdata + (group_ * image_id + group_id) * input_offset, | 
 |               col_buffer_data + tid * col_buffer_size, | 
 |               &context_, | 
 |               in_qparams_[INPUT].zero_point); | 
 |         } | 
 |  | 
        // Reference int32 GEMM over this thread's (already quantized) slice
        // of the col buffer.
 |         T* col_buffer_private = col_buffer_data + tid * col_buffer_size; | 
 |  | 
 |         int32_t* Y_int32_temp = | 
 |             Y_int32->data() + ((M / group_) * group_id + M * tid) * Y_HxW; | 
 |         T_signed* W_quantized_group = | 
 |             W_quantized_.data() + (M / group_) * group_id * kernel_dim; | 
 |  | 
 |         for (int i = 0; i < M / group_; ++i) { | 
 |           for (int j = 0; j < Y_HxW; ++j) { | 
 |             int32_t sum = 0; | 
 |             for (int k = 0; k < kernel_dim; ++k) { | 
 |               int w = W_quantized_group[i * kernel_dim + k]; | 
 |               int x = col_buffer_private[k * Y_HxW + j]; | 
 |               sum += w * x; | 
 |             } | 
 |             Y_int32_temp[i * Y_HxW + j] = sum; | 
 |           } // j | 
 |         } // i | 
 |  | 
 |         RunOnDeviceEpilogueNCHW_( | 
 |             col_buffer_private, | 
 |             Y_int32_temp, | 
 |             Y_data_T + (M * image_id + M / group_ * group_id) * Y_HxW, | 
 |             M / group_ * group_id, | 
 |             group_id); | 
 |       } // for each group | 
 |     } // for each image_id | 
 |  | 
 |     PropagateOutputTensorQuantizationParams(this, 0, out_qparams_); | 
 |     MeasureQuantizationError_(); | 
 |   }; // f | 
 |  | 
 |   this->RunWithSharedBuffer_(&col_buffer_, &Y_int32_, f); | 
 |  | 
 |   return true; | 
 | } // RunOnDeviceWithOrderNCHW | 
 |  | 
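// Epilogue of the NHWC paths that produce raw int32 accumulations. Without
// static output quantization, the int32 min/max of the corrected accumulators
// is used to choose out_qparams_ on the fly (group-wise filter quantization
// is not supported in that case; a warning is logged). Requantization uses
// fbgemm's row_offsets_u8acc32_ref / requantize_u8acc32_ref reference helpers
// when T is uint8_t and AVX2 is available, and a scalar loop otherwise.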
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::RunOnDeviceEpilogueNHWC_( | 
 |     const T* col_buffer_data, | 
 |     int32_t* Y_int32) { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   Tensor* Y = OutputTensorCPU_(0); | 
 |   const int N = X.dim32(0); | 
 |   const int M = filter.dim32(0); | 
 |   int kernel_dim = KernelDim_(); | 
 |   const int Y_HxW = this->GetDimsSize(*Y); | 
 |  | 
  // Adjust with the bias and zero points, then requantize.
  // See batch_matmul_dnnlowp_op.cc for why we compute column_offsets,
  // row_offset, and const_offset in this way.
 |   int32_t A_zero_point = in_qparams_[INPUT].zero_point; | 
 |  | 
 |   if (!dnnlowp::HasStaticQuantization(this)) { | 
 |     if (quantize_groupwise_) { | 
 |       static int log_occurences = 0; | 
 |       if (log_occurences < 32) { | 
 |         ++log_occurences; | 
 |         LOG(WARNING) << "Cannot do group-wise quantization without " | 
 |                         "static quantization of activations for " | 
 |                      << this->debug_def().output(0); | 
 |       } | 
 |     } | 
 |  | 
 |     int32_t Y_min = numeric_limits<int32_t>::max(); | 
 |     int32_t Y_max = numeric_limits<int32_t>::min(); | 
 |  | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel for reduction(min : Y_min), reduction(max : Y_max) | 
 | #endif | 
 |     for (int i = 0; i < N * Y_HxW; ++i) { | 
 |       for (int group_id = 0; group_id < group_; ++group_id) { | 
 |         int32_t row_offset = 0; | 
 |         for (int k = 0; k < kernel_dim; ++k) { | 
 |           row_offset += | 
 |               col_buffer_data[(i * group_ + group_id) * kernel_dim + k]; | 
 |         } | 
 |         row_offset *= FilterQuantizationParams(0).zero_point; | 
 |  | 
 |         for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_); | 
 |              ++j) { | 
 |           int32_t raw = Y_int32[i * M + j] - row_offset; | 
 |           if (!column_offsets_->empty()) { | 
 |             raw -= A_zero_point * (*column_offsets_)[j]; | 
 |           } | 
 |           if (b_quantized_data_) { | 
 |             raw += b_quantized_data_[j]; | 
 |           } | 
 |           Y_min = std::min(Y_min, raw); | 
 |           Y_max = std::max(Y_max, raw); | 
 |         } | 
 |       } // for each group | 
 |     } // for each row i | 
 |  | 
 |     if (ReluFused) { | 
 |       Y_min = std::max(0, Y_min); | 
 |       Y_max = std::max(0, Y_max); | 
 |     } | 
 |  | 
 |     float Y_scale = | 
 |         in_qparams_[INPUT].scale * FilterQuantizationParams(0).scale; | 
 |     out_qparams_ = | 
 |         qfactory_->ChooseQuantizationParams(Y_scale * Y_min, Y_scale * Y_max); | 
 |  | 
 |     float real_multiplier = Y_scale / out_qparams_.scale; | 
 |     requantization_params_[0] = qfactory_->ChooseRequantizationMultiplier( | 
 |         real_multiplier, out_qparams_); | 
 |     requantization_multipliers_[0] = requantization_params_[0].real_multiplier; | 
 |   } | 
 |  | 
 |   int32_t C_zero_point = out_qparams_.zero_point; | 
 |  | 
 |   T* Ydata = Y->template mutable_data<T>(); | 
 |  | 
 |   using namespace fbgemm; | 
 |   if (is_same<T, uint8_t>::value && GetCpuId().avx2()) { | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel for | 
 | #endif | 
 |     for (int i = 0; i < N * Y_HxW; ++i) { | 
 |       for (int group_id = 0; group_id < group_; ++group_id) { | 
 |         int32_t row_offset; | 
 |         row_offsets_u8acc32_ref( | 
 |             1, | 
 |             kernel_dim, | 
 |             group_ * kernel_dim, | 
 |             reinterpret_cast<const uint8_t*>( | 
 |                 col_buffer_data + (i * group_ + group_id) * kernel_dim), | 
 |             &row_offset); | 
 |  | 
 |         int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point; | 
 |         float C_multiplier = RequantizationParams(group_id).real_multiplier; | 
 |  | 
 |         requantize_u8acc32_ref( | 
 |             1, | 
 |             M / group_, | 
 |             M, | 
 |             Y_int32 + i * M + group_id * (M / group_), | 
 |             reinterpret_cast<uint8_t*>(Ydata + i * M + group_id * (M / group_)), | 
 |             &C_multiplier, | 
 |             C_zero_point, | 
 |             column_offsets_->empty() ? 0 : A_zero_point, | 
 |             &B_zero_point, | 
 |             &row_offset, | 
 |             column_offsets_->empty() | 
 |                 ? nullptr | 
 |                 : column_offsets_->data() + group_id * (M / group_), | 
 |             b_quantized_data_ ? b_quantized_data_ + group_id * (M / group_) | 
 |                               : nullptr, | 
 |             M / group_, | 
 |             ReluFused); | 
 |       } // for each group | 
 |     } // for each row i | 
 |   } else { | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel for | 
 | #endif | 
 |     for (int i = 0; i < N * Y_HxW; ++i) { | 
 |       for (int group_id = 0; group_id < group_; ++group_id) { | 
 |         int32_t B_zero_point = FilterQuantizationParams(group_id).zero_point; | 
 |         int32_t row_offset = 0; | 
 |         for (int k = 0; k < kernel_dim; ++k) { | 
 |           row_offset += | 
 |               col_buffer_data[(i * group_ + group_id) * kernel_dim + k]; | 
 |         } | 
 |         row_offset *= B_zero_point; | 
 |  | 
 |         for (int j = group_id * (M / group_); j < (group_id + 1) * (M / group_); | 
 |              ++j) { | 
 |           int32_t raw = Y_int32[i * M + j] - row_offset; | 
 |           if (!column_offsets_->empty()) { | 
 |             raw -= A_zero_point * (*column_offsets_)[j]; | 
 |           } | 
 |           if (b_quantized_data_) { | 
 |             raw += b_quantized_data_[j]; | 
 |           } | 
 |  | 
 |           Ydata[i * M + j] = | 
 |               fbgemm::Requantize<T>(raw, RequantizationParams(group_id)); | 
 |           if (ReluFused) { // static if | 
 |             Ydata[i * M + j] = | 
 |                 std::max<int32_t>(C_zero_point, Ydata[i * M + j]); | 
 |           } | 
 |         } | 
 |       } // for each group | 
 |     } // for each row i | 
 |   } // !__AVX2__ | 
 |  | 
 |   dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_); | 
 | } | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::PartitionGroupedNHWCConv_( | 
 |     int* group_begin, | 
 |     int* group_end, | 
 |     int* i_begin, | 
 |     int* i_end, | 
 |     int num_groups, | 
 |     int m, | 
 |     int nthreads, | 
 |     int thread_id) { | 
  // Make sure the per-thread row count (i_end - i_begin) is a multiple of 32
  // because cblas_gemm_compute_u8s8s32_acc16 performs best when M is a
  // multiple of 32.
 |   Get1DPartitionOf2D( | 
 |       num_groups, | 
 |       m, | 
 |       nthreads, | 
 |       thread_id, | 
 |       group_begin, | 
 |       group_end, | 
 |       i_begin, | 
 |       i_end, | 
 |       32); | 
 | } | 
 |  | 
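// NHWC im2col into col_buffer, shaped [N, output spatial dims, group_ *
// kernel_dim] so each output position holds a contiguous kernel_dim slice per
// group. Padding is filled with the input zero point (the quantized
// representation of 0). 1D/2D go through Im2ColNHWC, 3D through Im2Col3DNHWC.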
 | template <typename T, bool ReluFused> | 
 | const T* ConvDNNLowPOp<T, ReluFused>::Im2ColNHWC_(Tensor* col_buffer) { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   Tensor* Y = OutputTensorCPU_(0); | 
 |   int ndim = X.dim(); | 
 |   const int N = X.dim32(0), C = X.dim32(ndim - 1); | 
 |  | 
 |   const int kernel_dim = KernelDim_(); | 
  // The offset corresponding to a single input image.
 |   const int X_HxW = this->GetDimsSize(X); | 
 |   const int input_offset = X_HxW * C; | 
 |   const int Y_HxW = this->GetDimsSize(*Y); | 
 |  | 
 |   const T* Xdata = X.template data<T>(); | 
 |  | 
 |   vector<int> buffer_shape(ndim); | 
 |   for (auto i = 0; i < ndim - 1; ++i) { | 
 |     buffer_shape[i] = Y->dim32(i); | 
 |   } | 
 |   buffer_shape[ndim - 1] = kernel_dim * group_; | 
 |  | 
 |   col_buffer->Resize(buffer_shape); | 
 |  | 
 |   T* col_buffer_data = col_buffer->template mutable_data<T>(); | 
 |  | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel for if (N > 1) | 
 | #endif | 
 |   for (int image_id = 0; image_id < N; ++image_id) { | 
 |     if (this->kernel_.size() <= 2) { | 
 |       math::Im2ColNHWC<T>( | 
 |           C, | 
 |           X.dim32(1), | 
 |           this->kernel_.size() == 2 ? X.dim32(2) : 1, | 
 |           kernel_h(), | 
 |           this->kernel_.size() == 2 ? kernel_w() : 1, | 
 |           dilation_h(), | 
 |           this->kernel_.size() == 2 ? dilation_w() : 1, | 
 |           pad_t(), | 
 |           this->kernel_.size() == 2 ? pad_l() : 0, | 
 |           this->kernel_.size() == 2 ? pad_b() : pad_l(), | 
 |           this->kernel_.size() == 2 ? pad_r() : 0, | 
 |           stride_h(), | 
 |           this->kernel_.size() == 2 ? stride_w() : 1, | 
 |           Xdata + image_id * input_offset, | 
 |           col_buffer_data + image_id * group_ * kernel_dim * Y_HxW, | 
 |           &context_, | 
 |           group_, | 
 |           in_qparams_[INPUT].zero_point); | 
 |     } else { | 
 |       math::Im2Col3DNHWC<T>( | 
 |           C, | 
 |           X.dim32(1), // num_frames | 
 |           X.dim32(2), // H | 
 |           X.dim32(3), // W | 
 |           this->kernel_[0], | 
 |           this->kernel_[1], | 
 |           this->kernel_[2], | 
 |           this->dilation_[0], | 
 |           this->dilation_[1], | 
 |           this->dilation_[2], | 
 |           this->pads_[0], | 
 |           this->pads_[1], | 
 |           this->pads_[2], | 
 |           this->pads_[3], | 
 |           this->pads_[4], | 
 |           this->pads_[5], | 
 |           this->stride_[0], | 
 |           this->stride_[1], | 
 |           this->stride_[2], | 
 |           Xdata + image_id * input_offset, | 
 |           col_buffer_data + image_id * group_ * kernel_dim * Y_HxW, | 
 |           &context_, | 
 |           group_, | 
 |           in_qparams_[INPUT].zero_point); | 
 |     } | 
 |   } | 
 |  | 
 |   return col_buffer->template data<T>(); | 
 | } | 
 |  | 
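// Reference int32 GEMM over the NHWC col buffer for one group and a range of
// output rows (the slow fallback when fbgemm's packed paths don't apply).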
 | template <typename T, typename T_signed> | 
 | static void conv_nhwc_ref_( | 
 |     int group_id, | 
 |     int num_groups, | 
 |     int i_begin, | 
 |     int i_end, | 
 |     int M, | 
 |     int kernel_dim, | 
 |     const T* col_buffer, | 
 |     const T_signed* W, | 
 |     int32_t* Y) { | 
 |   for (int i = i_begin; i < i_end; ++i) { | 
 |     for (int j = group_id * (M / num_groups); | 
 |          j < (group_id + 1) * (M / num_groups); | 
 |          ++j) { | 
 |       int32_t sum = 0; | 
 |       for (int k = 0; k < kernel_dim; ++k) { | 
 |         int w = W[j * kernel_dim + k]; | 
 |         int x = col_buffer[(i * num_groups + group_id) * kernel_dim + k]; | 
 |         sum += w * x; | 
 |       } | 
 |       Y[i * M + j] = sum; | 
 |     } | 
 |   } | 
 | } | 
 |  | 
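// Feed a packed activation matrix into fbgemmPacked with a ReQuantizeOutput
// epilogue at the given quantization granularity (tensor- or group-wise),
// producing requantized uint8 output (with fused ReLU when ReluFused). The
// activation zero-point and column-offset corrections are skipped when they
// were folded into the bias.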
 | template <typename T, bool ReluFused> | 
 | template <typename PackAMatrix, fbgemm::QuantizationGranularity Q_GRAN> | 
 | void ConvDNNLowPOp<T, ReluFused>::DispatchFBGEMM_( | 
 |     PackAMatrix& packA, | 
 |     vector<int32_t>* Y_int32, | 
 |     uint8_t* Y_uint8_data) { | 
 |   // This function is called within an OpenMP region | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int M = filter.dim32(0); | 
 |  | 
 |   int nthreads = dnnlowp_get_num_threads(); | 
 |   int tid = dnnlowp_get_thread_num(); | 
 |  | 
 |   using namespace fbgemm; | 
 |   DoNothing<> doNothingObj{}; | 
 |   ReQuantizeOutput<ReluFused, Q_GRAN> outputProcObj( | 
 |       doNothingObj, | 
 |       requantization_multipliers_.data(), | 
 |       out_qparams_.zero_point, | 
 |       // column_offsets_ empty means column_offsets_ are folded into bias | 
 |       column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point, | 
 |       filter_zero_points_.data(), | 
 |       packA.getRowOffsetBuffer(), | 
 |       column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |       b_quantized_data_, | 
 |       M, | 
 |       group_); | 
 |  | 
 |   fbgemmPacked( | 
 |       packA, | 
 |       *Wq_packed_, | 
 |       Y_uint8_data, | 
 |       Y_int32->data(), | 
 |       M, | 
 |       outputProcObj, | 
 |       tid, | 
 |       nthreads); | 
 | } | 
 |  | 
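// Core of the NHWC path: dispatch to the depthwise 3x3x3, depthwise 3x3, or
// grouped-conv fbgemm kernels when applicable; otherwise run the general
// packed GEMM, fusing im2col via PackAWithIm2Col when possible and choosing
// PackAMatrix or PackAWithRowOffset based on the filter zero points.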
 | template <typename T, bool ReluFused> | 
 | void ConvDNNLowPOp<T, ReluFused>::ConvNHWCCore_( | 
 |     const T* col_buffer_data, | 
 |     vector<int32_t>* Y_int32) { | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   Tensor* Y = OutputTensorCPU_(0); | 
 |   const int N = X.dim32(0), C = X.dim32(X.dim() - 1); | 
 |   const int M = filter.dim32(0); | 
 |   const int kernel_dim = KernelDim_(); | 
 |   const int Y_HxW = this->GetDimsSize(*Y); | 
 |   const int X_HxW = this->GetDimsSize(X); | 
 |  | 
 |   if (N == 0) { | 
 |     LOG(WARNING) << "The batch size is 0 in ConvNHWCCore_ function!"; | 
 |   } | 
 |  | 
 |   if (FLAGS_caffe2_dnnlowp_dump_tensors) { | 
 |     // Dump input activation | 
 |     std::string input_name = this->debug_def().input(INPUT); | 
 |     std::string input_filename = input_name; | 
 |     while (input_filename.find('/') != std::string::npos) { | 
 |       input_filename.replace(input_filename.find('/'), 1, "_"); | 
 |     } | 
 |     StoreMatrixInMatrixMarketFormat( | 
 |         N * X_HxW * C / kernel_dim, | 
 |         kernel_dim, | 
 |         col_buffer_data, | 
 |         input_filename); | 
 |  | 
 |     // Dump weight | 
 |     std::string weight_name = this->debug_def().input(FILTER); | 
 |     std::string weight_filename = weight_name; | 
 |     while (weight_filename.find('/') != std::string::npos) { | 
      weight_filename.replace(weight_filename.find('/'), 1, "_");
 |     } | 
 |     StoreMatrixInMatrixMarketFormat( | 
 |         M, kernel_dim, W_quantized_.data(), weight_filename); | 
 |   } | 
 |  | 
 |   using namespace fbgemm; | 
 |  | 
 |   if (TakeDepthWise3x3x3FastPath_()) { | 
 |     const T* Xdata = X.template data<T>(); | 
 |     uint8_t* Y_uint8_data = | 
 |         OutputTensorCPU_(0)->template mutable_data<uint8_t>(); | 
 |  | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel | 
 | #endif | 
 |     { | 
 |       if (quantize_groupwise_) { | 
 |         depthwise_3x3x3_per_channel_quantization_pad_1( | 
 |             N, | 
 |             X.dim32(1), | 
 |             X.dim32(2), | 
 |             X.dim32(3), | 
 |             C, | 
 |             this->stride_[0], | 
 |             this->stride_[1], | 
 |             this->stride_[2], | 
 |             // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |             // need zero_point for padding | 
 |             in_qparams_[INPUT].zero_point, | 
 |             reinterpret_cast<const uint8_t*>(Xdata), | 
 |             filter_zero_points_.data(), | 
 |             *Wq_depthwise_packed_, | 
 |             requantization_multipliers_.data(), | 
 |             out_qparams_.zero_point, | 
 |             Y_uint8_data, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             ReluFused, | 
 |             nullptr, /*act_times_w_scale*/ | 
 |             dnnlowp_get_thread_num(), | 
 |             dnnlowp_get_num_threads()); | 
 |       } else { | 
 |         depthwise_3x3x3_pad_1( | 
 |             N, | 
 |             X.dim32(1), | 
 |             X.dim32(2), | 
 |             X.dim32(3), | 
 |             C, | 
 |             this->stride_[0], | 
 |             this->stride_[1], | 
 |             this->stride_[2], | 
 |             // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |             // need zero_point for padding | 
 |             in_qparams_[INPUT].zero_point, | 
 |             reinterpret_cast<const uint8_t*>(Xdata), | 
 |             FilterQuantizationParams(0).zero_point, | 
 |             *Wq_depthwise_packed_, | 
 |             requantization_params_[0].real_multiplier, | 
 |             out_qparams_.zero_point, | 
 |             Y_uint8_data, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             ReluFused, | 
 |             1.0f, /*act_times_w_scale*/ | 
 |             dnnlowp_get_thread_num(), | 
 |             dnnlowp_get_num_threads()); | 
 |       } | 
 |     } // omp parallel | 
 |  | 
 |     return; | 
 |   } else if (TakeDepthWise3x3FastPath_()) { | 
 |     const int H = X.dim32(1), W = X.dim32(2); | 
 |     const T* Xdata = X.template data<T>(); | 
 |     uint8_t* Y_uint8_data = | 
 |         OutputTensorCPU_(0)->template mutable_data<uint8_t>(); | 
 |  | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel | 
 | #endif | 
 |     { | 
 |       if (quantize_groupwise_) { | 
 |         depthwise_2d_per_channel_quantization_same_pad( | 
 |             N, | 
 |             H, | 
 |             W, | 
 |             C, | 
 |             stride_h(), | 
 |             stride_w(), | 
 |             // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |             // need zero_point for padding | 
 |             in_qparams_[INPUT].zero_point, | 
 |             reinterpret_cast<const uint8_t*>(Xdata), | 
 |             filter_zero_points_.data(), | 
 |             *Wq_depthwise_packed_, | 
 |             requantization_multipliers_.data(), | 
 |             out_qparams_.zero_point, | 
 |             Y_uint8_data, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             ReluFused, | 
 |             nullptr, /*act_times_w_scale*/ | 
 |             dnnlowp_get_thread_num(), | 
 |             dnnlowp_get_num_threads()); | 
 |       } else { | 
 |         depthwise_2d_same_pad( | 
 |             N, | 
 |             H, | 
 |             W, | 
 |             C, | 
 |             stride_h(), | 
 |             stride_w(), | 
 |             // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |             // need zero_point for padding | 
 |             in_qparams_[INPUT].zero_point, | 
 |             reinterpret_cast<const uint8_t*>(Xdata), | 
 |             FilterQuantizationParams(0).zero_point, | 
 |             *Wq_depthwise_packed_, | 
 |             requantization_params_[0].real_multiplier, | 
 |             out_qparams_.zero_point, | 
 |             Y_uint8_data, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             ReluFused, | 
 |             1.0f, /*act_times_w_scale*/ | 
 |             dnnlowp_get_thread_num(), | 
 |             dnnlowp_get_num_threads()); | 
 |       } | 
 |     } // omp parallel | 
 |  | 
 |     return; | 
 |   } else if (TakeGConvFastPath_()) { | 
 |     const T* Xdata = X.template data<T>(); | 
 |     uint8_t* Y_uint8_data = | 
 |         OutputTensorCPU_(0)->template mutable_data<uint8_t>(); | 
 |  | 
 |     int row_offset_size_per_thread; | 
 |     if (this->kernel_.size() == 2) { | 
 |       row_offset_size_per_thread = rowOffsetBufferSizeGConv(GetConvParam_()); | 
 |     } else { | 
 |       CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |       row_offset_size_per_thread = rowOffsetBufferSizeGConv(GetConv3DParam_()); | 
 |     } | 
 |     row_offsets_.resize(dnnlowp_get_max_threads() * row_offset_size_per_thread); | 
 |  | 
 | #ifdef _OPENMP | 
 | // TODO: add parallelization once fbgemmGroupwiseConv supports multi-threading | 
 | // #pragma omp parallel | 
 | #endif | 
 |     { | 
 |       int tid = 0; // dnnlowp_get_thread_num(); | 
 |       int nthreads = 1; // dnnlowp_get_num_threads(); | 
 |  | 
 |       DoNothing<> doNothingObj{}; | 
 |       if (quantize_groupwise_) { | 
 |         ReQuantizeOutput<false, QuantizationGranularity::GROUP> reqObj( | 
 |             doNothingObj, | 
 |             requantization_multipliers_.data(), | 
 |             out_qparams_.zero_point, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point, | 
 |             filter_zero_points_.data(), | 
 |             row_offsets_.data() + tid * row_offset_size_per_thread, | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             M, | 
 |             group_); | 
 |  | 
 |         if (this->kernel_.size() == 2) { | 
 |           fbgemmGroupwiseConv( | 
 |               GetConvParam_(), | 
 |               reinterpret_cast<const uint8_t*>(Xdata), | 
 |               // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |               // need zero_point for padding | 
 |               in_qparams_[INPUT].zero_point, | 
 |               row_offsets_.data() + tid * row_offset_size_per_thread, | 
 |               *Wq_gconv_packed_, | 
 |               Y_uint8_data, | 
 |               Y_int32->data(), | 
 |               reqObj, | 
 |               tid, | 
 |               nthreads); | 
 |         } else { | 
 |           CAFFE_ENFORCE_EQ(this->kernel_.size(), 3); | 
 |           fbgemmGroupwiseConv( | 
 |               GetConv3DParam_(), | 
 |               reinterpret_cast<const uint8_t*>(Xdata), | 
 |               // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |               // need zero_point for padding | 
 |               in_qparams_[INPUT].zero_point, | 
 |               row_offsets_.data() + tid * row_offset_size_per_thread, | 
 |               *Wq_gconv3d_packed_, | 
 |               Y_uint8_data, | 
 |               Y_int32->data(), | 
 |               reqObj, | 
 |               tid, | 
 |               nthreads); | 
 |         } | 
 |       } else { | 
 |         ReQuantizeOutput<false, QuantizationGranularity::TENSOR> reqObj( | 
 |             doNothingObj, | 
 |             requantization_multipliers_.data(), | 
 |             out_qparams_.zero_point, | 
 |             // column_offsets_ empty means column_offsets_ are folded into bias | 
 |             column_offsets_->empty() ? 0 : in_qparams_[INPUT].zero_point, | 
 |             filter_zero_points_.data(), | 
 |             filter_zero_points_[0] | 
 |                 ? row_offsets_.data() + tid * row_offset_size_per_thread | 
 |                 : nullptr, | 
 |             column_offsets_->empty() ? nullptr : column_offsets_->data(), | 
 |             b_quantized_data_, | 
 |             M, | 
 |             group_); | 
 |  | 
 |         if (this->kernel_.size() == 2) { | 
 |           fbgemmGroupwiseConv( | 
 |               GetConvParam_(), | 
 |               reinterpret_cast<const uint8_t*>(Xdata), | 
 |               // Shouldn't pass 0 if column_offsets_ is empty here because we | 
 |               // need zero_point for padding | 
 |               in_qparams_[INPUT].zero_point, | 
 |               filter_zero_points_[0] | 
 |                   ? row_offsets_.data() + tid * row_offset_size_per_thread | 
 |                   : nullptr, | 
 |               *Wq_gconv_packed_, | 
 |               Y_uint8_data, | 
 |               Y_int32->data(), | 
 |               reqObj, | 
 |               tid, | 
 |               nthreads); | 
 |         } else { | 
 |           fbgemmGroupwiseConv( | 
 |               GetConv3DParam_(), | 
 |               reinterpret_cast<const uint8_t*>(Xdata), | 
              // Pass the real input zero_point even when column_offsets_ is
              // empty; it is still needed to fill padding.
 |               in_qparams_[INPUT].zero_point, | 
 |               filter_zero_points_[0] | 
 |                   ? row_offsets_.data() + tid * row_offset_size_per_thread | 
 |                   : nullptr, | 
 |               *Wq_gconv3d_packed_, | 
 |               Y_uint8_data, | 
 |               Y_int32->data(), | 
 |               reqObj, | 
 |               tid, | 
 |               nthreads); | 
 |         } | 
 |       } | 
 |     } // omp parallel | 
 |  | 
 |     return; | 
 |   } | 
 |  | 
  // Normal path for convolutions without a specialized fast path
  // (e.g., not depthwise).
 |   int row_offset_size_per_thread = -1; | 
 |   int x_pack_buf_size_per_thread = -1; | 
 |   bool fuse_im2col = | 
 |       Wq_packed_ && X.template data<T>() == col_buffer_data && !IsConvGEMM_(); | 
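  // Pick the fbgemm packing strategy for the activation matrix:
  //  - PackAWithIm2Col fuses im2col with packing, so the col buffer is never
  //    materialized separately.
  //  - PackAMatrix skips row-offset computation; valid only with per-tensor
  //    quantization and a zero filter zero point, where the row-offset
  //    correction term is not needed.
  //  - PackAWithRowOffset also computes per-row activation sums, required
  //    whenever the filter zero point can be nonzero (e.g., groupwise
  //    quantization).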
 |   if (Wq_packed_) { | 
 |     if (fuse_im2col) { | 
 |       row_offset_size_per_thread = | 
 |           PackAWithIm2Col<uint8_t>::rowOffsetBufferSize(); | 
 |       x_pack_buf_size_per_thread = PackAWithIm2Col<uint8_t>::packedBufferSize(); | 
 |     } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) { | 
 |       row_offset_size_per_thread = 0; | 
 |       x_pack_buf_size_per_thread = PackAMatrix<uint8_t>::packedBufferSize(); | 
 |     } else { | 
 |       row_offset_size_per_thread = | 
 |           PackAWithRowOffset<uint8_t>::rowOffsetBufferSize(); | 
 |       x_pack_buf_size_per_thread = | 
 |           PackAWithRowOffset<uint8_t>::packedBufferSize(); | 
 |     } | 
 |     row_offsets_.resize(dnnlowp_get_max_threads() * row_offset_size_per_thread); | 
 |     X_pack_buf_.resize(dnnlowp_get_max_threads() * x_pack_buf_size_per_thread); | 
 |   } | 
 |  | 
 |   uint8_t* Y_uint8_data = Y->template mutable_data<uint8_t>(); | 
 |  | 
 |   if (Wq_packed_) | 
 | #ifdef _OPENMP | 
 | #pragma omp parallel | 
 | #endif | 
 |   { | 
 |     int tid = dnnlowp_get_thread_num(); | 
 |  | 
    // Fast path using fbgemm.
 |     if (fuse_im2col) { | 
 |       if (this->kernel_.size() <= 2) { | 
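        // 1D or 2D: a 1D convolution is expressed as a 2D conv_param_t with
        // width 1, unit width stride, and zero width padding.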
 |         conv_param_t<> conv_p( | 
 |             N, | 
 |             C, | 
 |             M, | 
 |             {X.dim32(1), this->kernel_.size() == 2 ? X.dim32(2) : 1}, | 
 |             group_, | 
 |             {this->kernel_[0], | 
 |              this->kernel_.size() == 2 ? this->kernel_[1] : 1}, | 
 |             {this->stride_[0], | 
 |              this->kernel_.size() == 2 ? this->stride_[1] : 1}, | 
 |             {this->pads_[0], | 
 |              this->kernel_.size() == 2 ? this->pads_[1] : 0, | 
 |              this->kernel_.size() == 2 ? this->pads_[2] : this->pads_[1], | 
 |              this->kernel_.size() == 2 ? this->pads_[3] : 0}); | 
 |  | 
 |         PackAWithIm2Col<uint8_t> packA( | 
 |             conv_p, | 
 |             reinterpret_cast<const uint8_t*>(col_buffer_data), | 
 |             // buffer for packed matrix | 
 |             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread, | 
            // Pass the real input zero_point even when column_offsets_ is
            // empty; it is still needed to fill padding.
 |             in_qparams_[INPUT].zero_point, | 
 |             row_offsets_.data() + tid * row_offset_size_per_thread); | 
 |  | 
 |         if (quantize_groupwise_) { | 
 |           DispatchFBGEMM_< | 
 |               PackAWithIm2Col<uint8_t>, | 
 |               QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data); | 
 |         } else { | 
 |           DispatchFBGEMM_< | 
 |               PackAWithIm2Col<uint8_t>, | 
 |               QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data); | 
 |         } | 
 |       } else { | 
 |         // 3D | 
 |         conv_param_t<3> conv_p( | 
 |             N, | 
 |             C, | 
 |             M, | 
 |             {X.dim32(1), X.dim32(2), X.dim32(3)}, | 
 |             group_, | 
 |             {this->kernel_[0], this->kernel_[1], this->kernel_[2]}, | 
 |             {this->stride_[0], this->stride_[1], this->stride_[2]}, | 
 |             {this->pads_[0], | 
 |              this->pads_[1], | 
 |              this->pads_[2], | 
 |              this->pads_[3], | 
 |              this->pads_[4], | 
 |              this->pads_[5]}); | 
 |  | 
 |         PackAWithIm2Col<uint8_t, int32_t, 3> packA( | 
 |             conv_p, | 
 |             reinterpret_cast<const uint8_t*>(col_buffer_data), | 
 |             // buffer for packed matrix | 
 |             X_pack_buf_.data() + tid * x_pack_buf_size_per_thread, | 
            // Pass the real input zero_point even when column_offsets_ is
            // empty; it is still needed to fill padding.
 |             in_qparams_[INPUT].zero_point, | 
 |             row_offsets_.data() + tid * row_offset_size_per_thread); | 
 |  | 
 |         if (quantize_groupwise_) { | 
 |           DispatchFBGEMM_< | 
 |               PackAWithIm2Col<uint8_t, int32_t, 3>, | 
 |               QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data); | 
 |         } else { | 
 |           DispatchFBGEMM_< | 
 |               PackAWithIm2Col<uint8_t, int32_t, 3>, | 
 |               QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data); | 
 |         } | 
 |       } // 3D | 
 |     } else if (!quantize_groupwise_ && filter_zero_points_[0] == 0) { | 
 |       // no im2col fusion | 
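      // GEMM view: the activation matrix is (N * Y_HxW) x (group_ *
      // kernel_dim), row-major with leading dimension group_ * kernel_dim.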
 |       PackAMatrix<uint8_t> packA( | 
 |           matrix_op_t::NoTranspose, | 
 |           N * Y_HxW, | 
 |           group_ * kernel_dim, | 
 |           reinterpret_cast<const uint8_t*>(col_buffer_data), | 
 |           group_ * kernel_dim, | 
 |           // buffer for packed matrix | 
 |           X_pack_buf_.data() + tid * x_pack_buf_size_per_thread, | 
 |           group_); | 
 |  | 
 |       DispatchFBGEMM_<PackAMatrix<uint8_t>, QuantizationGranularity::TENSOR>( | 
 |           packA, Y_int32, Y_uint8_data); | 
 |     } else { | 
 |       // no im2col fusion | 
 |       PackAWithRowOffset<uint8_t> packA( | 
 |           matrix_op_t::NoTranspose, | 
 |           N * Y_HxW, | 
 |           group_ * kernel_dim, | 
 |           reinterpret_cast<const uint8_t*>(col_buffer_data), | 
 |           group_ * kernel_dim, | 
 |           // buffer for packed matrix | 
 |           X_pack_buf_.data() + tid * x_pack_buf_size_per_thread, | 
 |           group_, | 
 |           row_offsets_.data() + tid * row_offset_size_per_thread); | 
 |  | 
 |       if (quantize_groupwise_) { | 
 |         DispatchFBGEMM_< | 
 |             PackAWithRowOffset<uint8_t>, | 
 |             QuantizationGranularity::GROUP>(packA, Y_int32, Y_uint8_data); | 
 |       } else { | 
 |         DispatchFBGEMM_< | 
 |             PackAWithRowOffset<uint8_t>, | 
 |             QuantizationGranularity::TENSOR>(packA, Y_int32, Y_uint8_data); | 
 |       } | 
 |     } // no im2col fusion | 
 |   } else { | 
    // Slow path (Wq_packed_ is empty): reference NHWC convolution per group;
    // the raw int32 accumulations are requantized later in
    // RunOnDeviceEpilogueNHWC_.
    for (int group_id = 0; group_id < group_; ++group_id) {
 |       conv_nhwc_ref_( | 
 |           group_id, | 
 |           group_, | 
 |           0, | 
 |           N * Y_HxW, | 
 |           M, | 
 |           kernel_dim, | 
 |           col_buffer_data, | 
 |           W_quantized_.data(), | 
 |           Y_int32->data()); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | template <typename T, bool ReluFused> | 
 | bool ConvDNNLowPOp<T, ReluFused>::RunOnDeviceWithOrderNHWC() { | 
 |   CAFFE_ENFORCE_LE( | 
 |       this->kernel_.size(), | 
 |       3, | 
 |       "Only 1-3d convolutions are supported for NHWC storage type"); | 
 |  | 
 |   using namespace dnnlowp; | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |   chrono::time_point<chrono::system_clock> t_very_begin, t_begin, t_end; | 
 |   /*if (VLOG_IS_ON(3))*/ { | 
 |     t_begin = chrono::system_clock::now(); | 
 |     t_very_begin = t_begin; | 
 |   } | 
 | #endif | 
 |  | 
 |   // Get quantization parameters | 
 |   if (!GetQuantizationParameters_()) { | 
 |     return false; | 
 |   } | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |   /*if (VLOG_IS_ON(3))*/ { | 
 |     t_end = chrono::system_clock::now(); | 
 |     double dt = chrono::duration<double>(t_end - t_begin).count(); | 
 |     LOG(INFO) << "this=" << this << " get_quant_params: " << dt * 1e3 << " ms"; | 
 |   } | 
 | #endif | 
 |  | 
 |   const Tensor& X = InputTensorCPU_(INPUT); | 
 |   auto& filter = InputTensorCPU_(FILTER); | 
 |   const int C = X.dim32(X.dim() - 1); | 
 |   const int G = group_; | 
 |   CAFFE_ENFORCE_EQ(X.dim(), filter.dim()); | 
 |   const int M = filter.dim32(0); | 
 |   CAFFE_ENFORCE_EQ( | 
 |       C, | 
 |       filter.dim32(filter.dim() - 1) * G, | 
 |       "Convolution op: input channels does not match: # of input channels ", | 
 |       C, | 
 |       " is not equal to kernel channels * group: ", | 
 |       filter.dim32(filter.dim() - 1), | 
 |       "*", | 
 |       G); | 
 |   CAFFE_ENFORCE_EQ( | 
 |       M % G, 0, "The number of output channels is not divisible by group."); | 
 |  | 
 |   auto sizes = ConvPoolOpBase<CPUContext>::GetOutputSize(X, filter.dim32(0)); | 
 |   Tensor* Y = OutputTensorCPU_(0, sizes, at::dtype<T>()); | 
 |  | 
  // The col buffer is stored in HWC order as well: the output height and
  // width, with kernel_dim as the innermost dimension.
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |   /*if (VLOG_IS_ON(3)) */ { t_begin = chrono::system_clock::now(); } | 
 | #endif | 
 |  | 
 |   bool no_im2col = NoIm2ColNHWC_(); | 
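  // When no im2col is needed (e.g., the convolution reduces to a GEMM or is
  // handled by a fast path), Xdata is used directly as the col buffer below.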
 |   auto f = [&](Tensor* col_buffer, vector<int32_t>* Y_int32) { | 
 |     if (!TakeDepthWise3x3FastPath_() && !TakeDepthWise3x3x3FastPath_()) { | 
 |       Y_int32->resize(Y->numel()); | 
 |     } | 
 |  | 
 |     // Im2col, followed by gemm. | 
 |     const T* Xdata = X.template data<T>(); | 
 |     const T* col_buffer_data = no_im2col ? Xdata : Im2ColNHWC_(col_buffer); | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |     /*if (VLOG_IS_ON(3)) */ { | 
 |       t_end = chrono::system_clock::now(); | 
 |       double dt = chrono::duration<double>(t_end - t_begin).count(); | 
 |       LOG(INFO) << "this=" << this << " im2col: " << dt * 1e3 << " ms"; | 
 |       t_begin = chrono::system_clock::now(); | 
 |     } | 
 | #endif | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |     /*if (VLOG_IS_ON(3)) */ { | 
 |       t_end = chrono::system_clock::now(); | 
 |       double dt = chrono::duration<double>(t_end - t_begin).count(); | 
 |       LOG(INFO) << "this=" << this << " quantize col_buf: " << dt * 1e3 | 
 |                 << " ms"; | 
 |       t_begin = chrono::system_clock::now(); | 
 |     } | 
 | #endif | 
 |  | 
 |     ConvNHWCCore_(col_buffer_data, Y_int32); | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |     /*if (VLOG_IS_ON(3)) */ { | 
 |       t_end = chrono::system_clock::now(); | 
 |       double dt = chrono::duration<double>(t_end - t_begin).count(); | 
 |       LOG(INFO) << "this=" << this << " GEMM: " << dt * 1e3 << " ms"; | 
 |       t_begin = chrono::system_clock::now(); | 
 |     } | 
 | #endif | 
 |  | 
 |     if (Wq_packed_ || Wq_depthwise_packed_ || Wq_gconv_packed_ || | 
 |         Wq_gconv3d_packed_) { | 
      // In the fbgemm fast paths, requantization has already been fused into
      // the convolution, so only the output quantization parameters need to
      // be propagated.
 |       PropagateOutputTensorQuantizationParams(this, 0, out_qparams_); | 
 |     } else { | 
 |       RunOnDeviceEpilogueNHWC_(col_buffer_data, Y_int32->data()); | 
 |     } | 
 |   }; // f | 
 |  | 
 |   this->RunWithSharedBuffer_(&col_buffer_, &Y_int32_, f); | 
 |  | 
 | #ifdef DNNLOWP_MEASURE_TIME_BREAKDOWN | 
 |   /*if (VLOG_IS_ON(3)) */ { | 
 |     const int N = X.dim32(0); | 
 |     // The dimension of each kernel | 
 |     const int kernel_dim = KernelDim_(); | 
 |     // The output image size is the spatial size of the output. | 
 |     const int Y_HxW = this->GetDimsSize(*Y); | 
 |  | 
 |     t_end = chrono::system_clock::now(); | 
 |     double dt = chrono::duration<double>(t_end - t_begin).count(); | 
 |     LOG(INFO) << "this=" << this << " prologue: " << dt * 1e3 << " ms"; | 
 |     t_begin = chrono::system_clock::now(); | 
 |  | 
 |     t_end = chrono::system_clock::now(); | 
 |     const int M = filter.dim32(0); | 
 |     double ops = 2. * N * Y_HxW * M * kernel_dim; | 
 |     dt = chrono::duration<double>(t_end - t_very_begin).count(); | 
 |     double gops = ops / dt / 1e9; | 
 |     LOG(INFO) << "this=" << this << " " << this->debug_def().type() | 
 |               << " output=" << this->debug_def().output(0) << " " << N * Y_HxW | 
 |               << "x" << M << "x" << kernel_dim << " G=" << group_ | 
 |               << " C/G=" << C / group_ << " K/G=" << M / group_ | 
 |               << " R=" << kernel_h() << " S=" << kernel_w() << " : " << dt * 1e3 | 
 |               << " ms " << gops << " gops"; | 
 |   } | 
 | #endif | 
 |  | 
 |   MeasureQuantizationError_(); | 
 |  | 
 |   return true; | 
 | } | 
 |  | 
 | template class ConvDNNLowPOp<uint8_t, false>; | 
 | template class ConvDNNLowPOp<uint8_t, true>; | 
 |  | 
 | template class ConvDNNLowPOp<uint16_t, false>; | 
 | template class ConvDNNLowPOp<uint16_t, true>; | 
 |  | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, DNNLOWP, ConvDNNLowPOp<uint8_t, false>); | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
 |     ConvRelu, | 
 |     DNNLOWP, | 
 |     ConvDNNLowPOp<uint8_t, true>); | 
 |  | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
 |     Int8Conv, | 
 |     DNNLOWP, | 
 |     ConvDNNLowPOp<uint8_t, false>); | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
 |     Int8ConvRelu, | 
 |     DNNLOWP, | 
 |     ConvDNNLowPOp<uint8_t, true>); | 
 |  | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
 |     Conv, | 
 |     DNNLOWP_16, | 
 |     ConvDNNLowPOp<uint16_t, false>); | 
 | REGISTER_CPU_OPERATOR_WITH_ENGINE( | 
 |     ConvRelu, | 
 |     DNNLOWP_16, | 
 |     ConvDNNLowPOp<uint16_t, true>); | 
 |  | 
 | } // namespace caffe2 |