#include "fbgemm_pack_op.h"

#include "caffe2/core/tensor.h"
#include "caffe2/core/tensor_int8.h"

#include "caffe2_dnnlowp_utils.h"
#include <fbgemm/FbgemmConvert.h>

C10_DECLARE_int32(caffe2_dnnlowp_nbits_in_non_outlier);
C10_DECLARE_double(caffe2_dnnlowp_acc16_density_threshold);
C10_DECLARE_int32(caffe2_dnnlowp_acc16_n_threshold);
C10_DECLARE_int32(caffe2_dnnlowp_acc16_k_threshold);

namespace caffe2 {

using namespace std;
using dnnlowp::TensorQuantizationParams;

// Helper functions

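// Quantizes the weight blob into signed 8-bit (or 16-bit) values stored in
// W_quantized and fills qparams with one set of quantization parameters per
// quantization group (each group covers M / qparams.size() output channels of
// kernel_dim weights each). If the blob is already an Int8TensorCPU, its
// scale and zero point are reused and only shifted into the signed range;
// otherwise the parameters are chosen from the float data by qfactory.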
template <typename T>
void QuantizeWeight(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<typename make_signed<T>::type>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory) {
  using T_signed = typename make_signed<T>::type;

  const auto& filter = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();

  W_quantized.resize(filter.numel());

  int signed_min = -(1 << (qfactory->GetWeightPrecision() - 1));
  if (blob.IsType<int8::Int8TensorCPU>()) {
    qparams[0].scale = blob.Get<int8::Int8TensorCPU>().scale;
    qparams[0].zero_point =
        blob.Get<int8::Int8TensorCPU>().zero_point + signed_min;

    const T* W_data = filter.data<T>();
    for (auto i = 0; i < filter.numel(); ++i) {
      W_quantized[i] = W_data[i] + signed_min;
    }
  } else {
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int g = 0; g < qparams.size(); ++g) {
      size_t offset = g * (M / qparams.size()) * kernel_dim;
      qparams[g] = qfactory->ChooseQuantizationParams(
          filter.data<float>() + offset,
          // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
          (M / qparams.size()) * kernel_dim,
          true /*weight*/);

      // qparams[g] was chosen for an unsigned type. Adjust the zero point
      // because the weights will actually be stored as signed values.
      qparams[g].zero_point += signed_min;

      fbgemm::Quantize<T_signed>(
          filter.data<float>() + offset,
          W_quantized.data() + offset,
          (M / qparams.size()) * kernel_dim,
          qparams[g]);
    }
  }
}

template void QuantizeWeight<uint8_t>(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int8_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);

template void QuantizeWeight<uint16_t>(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<TensorQuantizationParams>& qparams,
    vector<int16_t>& W_quantized,
    dnnlowp::QuantizationFactory* qfactory);

// TODO: reuse col_offsets_with_zero_pt_s8acc32_ref in fbgemm's
// RefImplementations.cc. We can't do that yet because W_quantized is not
// transposed here.
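// Computes, for every column j of the num_rows x num_cols weight matrix W,
// col_offsets[j] = sum_k W[k][j] - zero_point * num_rows. These per-column
// offsets are used at run time to compensate for the activation zero point
// when requantizing the int32 accumulators.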
template <typename T>
void ComputeColumnOffsets(
    int num_rows,
    int num_cols,
    const T* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets) {
  col_offsets.resize(num_cols);
  int num_quant_groups = qparams.size();
  for (int g = 0; g < num_quant_groups; ++g) {
    int j_begin = g * (num_cols / num_quant_groups);
    int j_end = j_begin + (num_cols / num_quant_groups);
    for (int j = j_begin; j < j_end; ++j) {
      int32_t sum = 0;
      for (int k = 0; k < num_rows; ++k) {
        sum += W[j * num_rows + k];
      }
      col_offsets[j] = sum - qparams[g].zero_point * num_rows;
    }
  }
}

template void ComputeColumnOffsets<int8_t>(
    int num_rows,
    int num_cols,
    const int8_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

template void ComputeColumnOffsets<int16_t>(
    int num_rows,
    int num_cols,
    const int16_t* W,
    const vector<TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

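// Counts the quantized weights that don't fit into nbits_in_non_outlier
// signed bits, i.e. that fall outside
// [-2^(nbits_in_non_outlier - 1), 2^(nbits_in_non_outlier - 1)).
// When nbits_in_non_outlier is 0, every weight counts as an outlier.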
int CountOutliers(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<int8_t>& W_quantized) {
  int outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int i = 0; i < (M / groups) * kernel_dim; ++i) {
      int8_t w = W_quantized[group_id * (M / groups) * kernel_dim + i];
      bool is_outlier = nbits_in_non_outlier == 0 ||
          w < -(1 << (nbits_in_non_outlier - 1)) ||
          w >= (1 << (nbits_in_non_outlier - 1));
      if (is_outlier) {
        ++outlier_cnt;
      }
    }
  }
  return outlier_cnt;
}

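// Moves the outlier weights (see CountOutliers) into a newly allocated
// compressed sparse column matrix and zeroes them out in W_quantized, so the
// dense remainder can be packed for 16-bit accumulation while the sparse
// outliers are handled with 32-bit accumulation. The caller takes ownership
// of the returned matrix.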
fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<int8_t>& W_quantized) {
  int outlier_cnt =
      CountOutliers(groups, kernel_dim, M, nbits_in_non_outlier, W_quantized);

  fbgemm::CompressedSparseColumn* Wq_outlier =
      new fbgemm::CompressedSparseColumn(kernel_dim, M);
  Wq_outlier->RowIdx().resize(outlier_cnt);
  Wq_outlier->Values().resize(outlier_cnt);

  outlier_cnt = 0;
  for (int group_id = 0; group_id < groups; ++group_id) {
    for (int j = 0; j < M / groups; ++j) {
      Wq_outlier->ColPtr()[group_id * (M / groups) + j] = outlier_cnt;

      for (int k = 0; k < kernel_dim; ++k) {
        int8_t w = W_quantized[(group_id * (M / groups) + j) * kernel_dim + k];
        bool is_outlier = nbits_in_non_outlier == 0 ||
            w < -(1 << (nbits_in_non_outlier - 1)) ||
            w >= (1 << (nbits_in_non_outlier - 1));
        if (is_outlier) {
          CAFFE_ENFORCE_LE(k, numeric_limits<int16_t>::max());
          Wq_outlier->RowIdx()[outlier_cnt] = k;
          Wq_outlier->Values()[outlier_cnt] = w;
          ++outlier_cnt;

          W_quantized[(group_id * (M / groups) + j) * kernel_dim + k] = 0;
        }
      }
    }
  } // for each group
  CAFFE_ENFORCE_EQ(outlier_cnt, Wq_outlier->RowIdx().size());
  Wq_outlier->ColPtr()[M] = outlier_cnt;

  return Wq_outlier;
}

// FIXME: code duplication with ConvDNNLowPOp::QuantizeBias_
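// Quantizes the bias to int32 with scale in_qparams.scale * filter_scale
// (per quantization group) and zero point 0, which is what the int32
// accumulators of the quantized Conv/FC kernels expect. A pre-quantized
// Int8TensorCPU bias is used as-is after checking that its scale matches.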
void QuantizeConvBias(
    const Blob& blob,
    int M,
    const TensorQuantizationParams& in_qparams,
    const vector<TensorQuantizationParams>& filter_qparams,
    vector<int32_t>& b_quantized,
    bool use_fp16,
    bool round_to_nearest_even) {
  const auto& bias = blob.IsType<int8::Int8TensorCPU>()
      ? blob.Get<int8::Int8TensorCPU>().t
      : blob.Get<TensorCPU>();
  if (blob.IsType<int8::Int8TensorCPU>()) {
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    TensorQuantizationParams bias_qparams;
    bias_qparams.scale = blob.Get<int8::Int8TensorCPU>().scale;
    bias_qparams.zero_point = blob.Get<int8::Int8TensorCPU>().zero_point;
    CAFFE_ENFORCE_LE(
        std::abs(
            bias_qparams.scale - in_qparams.scale * filter_qparams[0].scale),
        1e-4);
    CAFFE_ENFORCE_EQ(bias_qparams.zero_point, 0);
    b_quantized.resize(bias.numel());
    b_quantized.assign(
        bias.data<int32_t>(), bias.data<int32_t>() + bias.numel());
  } else {
    const float* bdata = bias.data<float>();
    vector<float> bdata_local;
    if (use_fp16) {
      bdata_local.resize(bias.numel());
      fbgemm::RoundToFloat16(
          bdata,
          bdata_local.data(),
          bias.numel(),
          false /* FLAGS_caffe2_fbgemm_fake_fp16_clamp */);
      bdata = bdata_local.data();
    }
    b_quantized.resize(bias.numel());
    // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
    for (int g = 0; g < filter_qparams.size(); ++g) {
      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
      int i_begin = g * (M / filter_qparams.size());
      // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
      int i_end = i_begin + (M / filter_qparams.size());
      for (int i = i_begin; i < i_end; ++i) {
        if (round_to_nearest_even) {
          b_quantized[i] = fbgemm::Quantize<int32_t>(
              bdata[i],
              0,
              in_qparams.scale * filter_qparams[g].scale,
              32,
              true /* signed */);
        } else {
          // Round in the float domain, then clamp before the narrowing
          // conversion so out-of-range values saturate instead of overflowing.
          double bq = std::round(
              (1.0f / in_qparams.scale) * (1.0f / filter_qparams[g].scale) *
              bdata[i]);
          bq = std::min<double>(bq, std::numeric_limits<int32_t>::max());
          bq = std::max<double>(bq, std::numeric_limits<int32_t>::min());
          b_quantized[i] = static_cast<int32_t>(bq);
        }
      }
    }
  }
}

// FullyConnectedDNNLowPPackWeightOp

FullyConnectedDNNLowPPackWeightOp::FullyConnectedDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
      axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
      quantize_channelwise_(
          this->GetSingleArgument<bool>("quantize_channelwise", false)),
      save_unpacked_weights_(
          this->GetSingleArgument<bool>("save_unpacked_weights", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
    quantize_channelwise_ = true;
  }
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}

bool FullyConnectedDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(0);
  const auto canonical_axis_w = filter.canonical_axis_index(axis_w_);
  const auto K = filter.size_from_dim(canonical_axis_w);
  const auto N = filter.size_to_dim(canonical_axis_w);
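  // K is the number of input features and N the number of output features:
  // the weight is viewed as an N x K matrix after collapsing the dimensions
  // around axis_w.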

  auto* Y = this->Output<Int8FCDNNLowPPackedWeightBlob>(0);

  // Create a tensor with the same shape as the filter. This doesn't actually
  // allocate memory for the tensor; it is just a convenient way of passing
  // the shape information along.
  Y->original_tensor.ResizeLike(filter);

  Y->qparams.resize(quantize_channelwise_ ? N : 1);

  vector<int8_t> W_quantized;
  QuantizeWeight<uint8_t>(
      InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());

  if (save_unpacked_weights_) {
    ReinitializeTensor(
        &Y->original_tensor, filter.sizes(), at::dtype<int8_t>().device(CPU));
    auto* buffer = Y->original_tensor.template mutable_data<int8_t>();
    CAFFE_ENFORCE_EQ(Y->original_tensor.numel(), W_quantized.size());
    memcpy(buffer, W_quantized.data(), W_quantized.size() * sizeof(int8_t));
  }
  if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
    static int log_occurrences = 0;
    if (log_occurrences < 32) {
      ++log_occurrences;
      LOG(WARNING) << "Cannot do row-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }

  // Pre-compute column offsets.
  // This should happen before ExtractOutlierMatrix because W_quantized is
  // modified in ExtractOutlierMatrix.
  // NOLINTNEXTLINE(modernize-make-shared)
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      K, N, W_quantized.data(), Y->qparams, *Y->column_offsets);

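  // With the DNNLOWP_ACC16 engine, weights that fit into nbits_in_non_outlier
  // bits are packed densely and accumulated in 16 bits, while the remaining
  // outlier weights are extracted into a sparse matrix that is processed
  // with 32-bit accumulation.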
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    if (nbits_in_non_outlier_ < 8) {
      Y->W_outlier.reset(
          ExtractOutlierMatrix(1, K, N, nbits_in_non_outlier_, W_quantized));
      int outlier_cnt = Y->W_outlier->ColPtr()[N];

      LOG(INFO) << "Proportion of outliers for FC layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;
    }

    Y->nbits_in_non_outlier = nbits_in_non_outlier_;
    // NOLINTNEXTLINE(modernize-make-shared)
    Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K,
        nullptr, // pmat
        1)); // group
  } else {
    // NOLINTNEXTLINE(modernize-make-shared)
    Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
        fbgemm::matrix_op_t::Transpose,
        K,
        N,
        W_quantized.data(),
        K,
        nullptr, // pmat
        1)); // group
  }

  // Quantize the bias.
  if (InputSize() >= 2) {
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    // NOLINTNEXTLINE(modernize-make-shared)
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(1), N, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  // Output the quantized bias if a second output is specified. This output
  // is meant to be consumed by an accelerator instead of by CPU ops.
  if (OutputSize() >= 2) {
    CAFFE_ENFORCE(Y->bias, "Bias is not quantized");
    // Channel-wise quantized bias is not supported here because Int8TensorCPU
    // only carries a single scale and zero_point. If we chose to output the
    // bias as an Int8FCDNNLowPPackedWeightBlob with the original layout,
    // everything would still work for the accelerator.
    CAFFE_ENFORCE_EQ(
        1,
        Y->qparams.size(),
        "We don't support outputting channelwise quantized bias yet");
    auto quantized_bias = Y->bias;
    float in_scale = GetSingleArgument<float>("in_scale", 0);
    float bias_scale = in_scale * Y->qparams.front().scale;
    LOG(INFO) << "Bias scale " << bias_scale << ": input scale " << in_scale
              << " weight scale " << Y->qparams.front().scale;
    auto* Bq = this->Output<int8::Int8TensorCPU>(1);
    std::vector<int64_t> shape = {
        static_cast<int64_t>(quantized_bias->size())};
    Bq->t.Resize(shape);
    Bq->scale = bias_scale;
    Bq->zero_point = 0;
    auto* data = Bq->t.template mutable_data<int32_t>();
    context_.template CopySameDevice<int32_t>(
        quantized_bias->size(), quantized_bias->data(), data);
  }

  return true;
}

// ConvDNNLowPPackWeightOp

ConvDNNLowPPackWeightOp::ConvDNNLowPPackWeightOp(
    const OperatorDef& operator_def,
    Workspace* ws)
    : ConvPoolDNNLowPOpBase<uint8_t, ConvFp32Op>(operator_def, ws),
      save_unpacked_weights_(
          this->GetSingleArgument<bool>("save_unpacked_weights", false)),
      quantize_groupwise_(
          this->GetSingleArgument<bool>("quantize_groupwise", false)) {
  if (this->debug_def().engine() == "DNNLOWP_ACC16") {
    nbits_in_non_outlier_ = this->GetSingleArgument<int>(
        "nbits_in_non_outlier", FLAGS_caffe2_dnnlowp_nbits_in_non_outlier);
  }
}

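// Returns true if fbgemm's specialized depthwise 3x3 path can be used:
// acc32 only, depthwise (one input channel per group and as many groups as
// output channels), group count a multiple of 8, 3x3 kernel, equal strides of
// 1 or 2, dilation 1, padding 1 on every side, and AVX2 available.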
bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);
  return this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 2 &&
      kernel_h() == 3 && kernel_w() == 3 && stride_h() == stride_w() &&
      (stride_h() == 1 || stride_h() == 2) && dilation_h() == 1 &&
      dilation_w() == 1 && pad_t() == 1 && pad_b() == 1 && pad_l() == 1 &&
      pad_r() == 1 && GetCpuId().avx2();
}

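// Same as TakeDepthWise3x3FastPath_ but for the 3D depthwise 3x3x3 kernel:
// stride 1 or 2 per dimension, dilation 1, and padding 1 on every side
// (the product of all pads must be 1).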
bool ConvDNNLowPPackWeightOp::TakeDepthWise3x3x3FastPath_() {
  const auto& filter = this->InputTensorCPU_(FILTER);
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);
  bool ret = this->debug_def().engine() != "DNNLOWP_ACC16" && group_ == M &&
      C_per_group == 1 && group_ % 8 == 0 && this->kernel_.size() == 3 &&
      this->kernel_[0] == 3 && this->kernel_[1] == 3 &&
      this->kernel_[2] == 3 &&
      (this->stride_[0] == 1 || this->stride_[0] == 2) &&
      (this->stride_[1] == 1 || this->stride_[1] == 2) &&
      (this->stride_[2] == 1 || this->stride_[2] == 2) &&
      this->dilation_[0] == 1 && this->dilation_[1] == 1 &&
      this->dilation_[2] == 1 &&
      accumulate(
          // NOLINTNEXTLINE(modernize-use-transparent-functors)
          this->pads_.begin(), this->pads_.end(), 1, multiplies<int>()) == 1 &&
      GetCpuId().avx2();
  return ret;
}

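// Builds an fbgemm conv_param_t describing this 2D convolution. Only the
// shape-related fields matter for weight packing; the batch size and the
// input image dimensions are not known at packing time, so dummy values are
// used for them (kernel * stride keeps the derived output dimensions
// positive).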
fbgemm::conv_param_t<> ConvDNNLowPPackWeightOp::GetConvParam_() {
  CAFFE_ENFORCE_EQ(this->kernel_.size(), 2);

  auto& filter = InputTensorCPU_(FILTER);
  const int M = filter.dim32(0), C = filter.dim32(filter.dim() - 1) * group_;

  return fbgemm::conv_param_t<>(
      1, // dummy
      C,
      M,
      {this->kernel_[0] * this->stride_[0],
       this->kernel_[1] * this->stride_[1]}, // dummy
      group_,
      {this->kernel_[0], this->kernel_[1]},
      {this->stride_[0], this->stride_[1]},
      {this->pads_[0], this->pads_[1], this->pads_[2], this->pads_[3]});
}

fbgemm::conv_param_t<3> ConvDNNLowPPackWeightOp::GetConv3DParam_() {
  CAFFE_ENFORCE_EQ(this->kernel_.size(), 3);

  auto& filter = InputTensorCPU_(FILTER);
  const int M = filter.dim32(0), C = filter.dim32(filter.dim() - 1) * group_;

  return fbgemm::conv_param_t<3>(
      1, // dummy
      C,
      M,
      {1,
       this->kernel_[1] * this->stride_[1],
       this->kernel_[2] * this->stride_[2]}, // dummy
      group_,
      {this->kernel_[0], this->kernel_[1], this->kernel_[2]},
      {this->stride_[0], this->stride_[1], this->stride_[2]},
      {this->pads_[0],
       this->pads_[1],
       this->pads_[2],
       this->pads_[3],
       this->pads_[4],
       this->pads_[5]});
}

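// Returns true if fbgemm provides an optimized grouped-convolution kernel
// for these convolution parameters (acc32 only).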
bool ConvDNNLowPPackWeightOp::TakeGConvFastPath_() {
  if (this->debug_def().engine() == "DNNLOWP_ACC16" ||
      (this->kernel_.size() != 2 && this->kernel_.size() != 3)) {
    return false;
  }

  if (this->kernel_.size() == 2) {
    return fbgemm::fbgemmOptimizedGConv(GetConvParam_());
  } else {
    CAFFE_ENFORCE_EQ(this->kernel_.size(), 3);
    return fbgemm::fbgemmOptimizedGConv(GetConv3DParam_());
  }
}

bool ConvDNNLowPPackWeightOp::RunOnDevice() {
  const auto& filter = InputTensorCPU_(FILTER);

  auto* Y = this->Output<Int8ConvDNNLowPPackedWeightBlob>(0);
  // Create a tensor with the same shape as the filter. This doesn't actually
  // allocate memory for the tensor; it is just a convenient way of passing
  // the shape information along.
  Y->original_tensor.ResizeLike(filter);

  // Assume KRSC layout.
  // The number of output channels
  int M = filter.dim32(0);
  // The number of input channels per group
  int C_per_group = filter.dim32(filter.dim() - 1);

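  // kernel_dim is the number of weights per output channel:
  // (input channels per group) * (product of the kernel's spatial dims).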
  int kernel_dims_size = 1;
  for (int i = 0; i < filter.dim() - 2; ++i) {
    kernel_dims_size *= filter.dim32(i + 1);
  }
  int kernel_dim = C_per_group * kernel_dims_size;

  vector<int8_t> W_quantized;
  Y->qparams.resize(quantize_groupwise_ ? group_ : 1);
  QuantizeWeight<uint8_t>(
      InputBlob(FILTER),
      kernel_dim,
      M,
      Y->qparams,
      W_quantized,
      qfactory_.get());
  if (save_unpacked_weights_) {
    ReinitializeTensor(
        &Y->original_tensor, filter.sizes(), at::dtype<int8_t>().device(CPU));
    auto* buffer = Y->original_tensor.template mutable_data<int8_t>();
    CAFFE_ENFORCE_EQ(Y->original_tensor.numel(), W_quantized.size());
    memcpy(buffer, W_quantized.data(), W_quantized.size() * sizeof(int8_t));
  }

  if (this->InputIsType<int8::Int8TensorCPU>(FILTER) && quantize_groupwise_) {
    static int log_occurrences = 0;
    if (log_occurrences < 32) {
      ++log_occurrences;
      LOG(WARNING) << "Cannot do group-wise quantization for "
                      "pre-quantized weight "
                   << this->debug_def().input(0);
    }
  }

  // Pre-compute column offsets.
  // This should happen before ExtractOutlierMatrix because W_quantized is
  // modified in ExtractOutlierMatrix.
  // NOLINTNEXTLINE(modernize-make-shared)
  Y->column_offsets.reset(new vector<int32_t>());
  ComputeColumnOffsets(
      kernel_dim, M, W_quantized.data(), Y->qparams, *Y->column_offsets);

  // Check if we should fall back to 32-bit accumulation.
  // This check is only meaningful when the engine is DNNLOWP_ACC16.
  bool fallback_to_32_bit_accumulation = false;
  if (nbits_in_non_outlier_ == 0) {
    LOG(INFO) << "nbits_in_non_outlier == 0 means everything is an outlier "
                 "so we fall back to acc32";
    fallback_to_32_bit_accumulation = true;
  }
  // On Skylake, acc16 is not faster when N or K is smaller than 128.
  // FIXME: code duplication with conv_dnnlowp_acc16_op.cc
  constexpr int SKYLAKE_ACC16_N_THRESHOLD_MIN = 128,
                SKYLAKE_ACC16_K_THRESHOLD_MIN = 128;
  int acc16_n_threshold = FLAGS_caffe2_dnnlowp_acc16_n_threshold;
  if (caffe2::GetCpuId().avx512f() &&
      acc16_n_threshold < SKYLAKE_ACC16_N_THRESHOLD_MIN) {
    acc16_n_threshold = SKYLAKE_ACC16_N_THRESHOLD_MIN;
  }
  int acc16_k_threshold = FLAGS_caffe2_dnnlowp_acc16_k_threshold;
  if (caffe2::GetCpuId().avx512f() &&
      acc16_k_threshold < SKYLAKE_ACC16_K_THRESHOLD_MIN) {
    acc16_k_threshold = SKYLAKE_ACC16_K_THRESHOLD_MIN;
  }
  if (!fallback_to_32_bit_accumulation && M / group_ < acc16_n_threshold) {
    LOG(INFO) << "N " << M / group_ << " of weight blob "
              << this->debug_def().input(0) << " is smaller than threshold "
              << acc16_n_threshold << ". Falling back to acc32";
    fallback_to_32_bit_accumulation = true;
  }
  if (!fallback_to_32_bit_accumulation && kernel_dim < acc16_k_threshold) {
    LOG(INFO) << "K " << kernel_dim << " of weight blob "
              << this->debug_def().input(0) << " is smaller than threshold "
              << acc16_k_threshold << ". Falling back to acc32";
    fallback_to_32_bit_accumulation = true;
  }

  // Pack the weights for 16-bit accumulation unless we decided to fall back
  // to acc32.
  if (this->debug_def().engine() == "DNNLOWP_ACC16" &&
      !fallback_to_32_bit_accumulation) {
    if (nbits_in_non_outlier_ < 8) {
      int outlier_cnt = CountOutliers(
          group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized);

      LOG(INFO) << "Proportion of outliers for Conv layer with weight blob "
                << this->debug_def().input(0) << " is "
                << static_cast<float>(outlier_cnt) / W_quantized.size();
      LOG(INFO) << "nbits_in_non_outlier " << nbits_in_non_outlier_;

      if (static_cast<float>(outlier_cnt) / W_quantized.size() >
          FLAGS_caffe2_dnnlowp_acc16_density_threshold) {
        LOG(INFO) << "Density of outliers is higher than threshold "
                  << FLAGS_caffe2_dnnlowp_acc16_density_threshold
                  << ". Falling back to acc32";
        fallback_to_32_bit_accumulation = true;
      } else {
        Y->W_outlier.reset(ExtractOutlierMatrix(
            group_, kernel_dim, M, nbits_in_non_outlier_, W_quantized));
      }
    }

    if (!fallback_to_32_bit_accumulation) {
      Y->nbits_in_non_outlier = nbits_in_non_outlier_;
      // NOLINTNEXTLINE(modernize-make-shared)
      Y->W_acc16.reset(new fbgemm::PackBMatrix<int8_t, int16_t>(
          fbgemm::matrix_op_t::Transpose,
          group_ * kernel_dim,
          M / group_,
          W_quantized.data(),
          kernel_dim,
          nullptr, // pmat
          group_));
    }
  }

  if (fallback_to_32_bit_accumulation) {
    Y->W_acc16.reset();
    Y->W_outlier.reset();
  }

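  // For acc32 (either the default engine or after falling back from acc16),
  // pick the most specialized packing that applies: depthwise 3x3 / 3x3x3,
  // optimized grouped convolution, or the generic PackBMatrix.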
  if (this->debug_def().engine() != "DNNLOWP_ACC16" ||
      fallback_to_32_bit_accumulation) {
    // acc32
    if (TakeDepthWise3x3FastPath_()) {
      // NOLINTNEXTLINE(modernize-make-shared)
      Y->W_depthwise.reset(new fbgemm::PackedDepthWiseConvMatrix(
          group_, 3 * 3, W_quantized.data()));
    } else if (TakeDepthWise3x3x3FastPath_()) {
      // NOLINTNEXTLINE(modernize-make-shared)
      Y->W_depthwise.reset(new fbgemm::PackedDepthWiseConvMatrix(
          group_, 3 * 3 * 3, W_quantized.data()));
    } else if (TakeGConvFastPath_()) {
      if (this->kernel_.size() == 2) {
        // NOLINTNEXTLINE(modernize-make-shared)
        Y->W_gconv.reset(new fbgemm::PackWeightMatrixForGConv<int8_t>(
            fbgemm::matrix_op_t::Transpose,
            GetConvParam_(),
            W_quantized.data()));
      } else {
        CAFFE_ENFORCE_EQ(this->kernel_.size(), 3);
        // NOLINTNEXTLINE(modernize-make-shared)
        Y->W_gconv3d.reset(
            new fbgemm::PackWeightMatrixForGConv<int8_t, int32_t, 3>(
                fbgemm::matrix_op_t::Transpose,
                GetConv3DParam_(),
                W_quantized.data()));
      }
    } else {
      // NOLINTNEXTLINE(modernize-make-shared)
      Y->W.reset(new fbgemm::PackBMatrix<int8_t>(
          fbgemm::matrix_op_t::Transpose,
          group_ * kernel_dim,
          M / group_,
          W_quantized.data(),
          kernel_dim,
          nullptr, // pmat
          group_));
    }
  }

  if (InputSize() >= 2) {
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    TensorQuantizationParams in_qparams;
    CAFFE_ENFORCE(HasSingleArgumentOfType<float>("in_scale"));
    in_qparams.scale = GetSingleArgument<float>("in_scale", 0);
    // NOLINTNEXTLINE(modernize-make-shared)
    Y->bias.reset(new vector<int32_t>());
    QuantizeConvBias(InputBlob(BIAS), M, in_qparams, Y->qparams, *Y->bias);
  } else {
    Y->bias = nullptr;
  }

  return true;
}

bool Int8FCDNNLowpPackedWeightBlobShapeFunctions::IsSameMetaType(
    TypeIdentifier id) {
  return id == TypeMeta::Id<Int8FCDNNLowPPackedWeightBlob>();
}

bool Int8ConvDNNLowpPackedWeightBlobShapeFunctions::IsSameMetaType(
    TypeIdentifier id) {
  return id == TypeMeta::Id<Int8ConvDNNLowPPackedWeightBlob>();
}

TypeIdentifier Int8FCDNNLowpPackedWeightBlobShapeFunctions::GetTypeMetaId() {
  return TypeMeta::Id<Int8FCDNNLowPPackedWeightBlob>();
}

TypeIdentifier Int8ConvDNNLowpPackedWeightBlobShapeFunctions::GetTypeMetaId() {
  return TypeMeta::Id<Int8ConvDNNLowPPackedWeightBlob>();
}

TypeMeta Int8FCDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType(
    const void* c) {
  // const Int8FCDNNLowPPackedWeightBlob* int8_tensor =
  //     reinterpret_cast<const Int8FCDNNLowPPackedWeightBlob*>(c);
  // We force the output type to be uint8_t because we know it always is.
  // If that assumption ever changes, this code will need to be updated.
  // return (int8_tensor->original_tensor).dtype();
  return TypeMeta::Make<uint8_t>();
}

TypeMeta Int8ConvDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorType(
    const void* c) {
  // const Int8ConvDNNLowPPackedWeightBlob* int8_tensor =
  //     reinterpret_cast<const Int8ConvDNNLowPPackedWeightBlob*>(c);
  // return (int8_tensor->original_tensor).dtype();
  return TypeMeta::Make<uint8_t>();
}

vector<int64_t>
Int8FCDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorInfo(
    const void* c,
    size_t* capacity,
    DeviceOption* device) {
  const Int8FCDNNLowPPackedWeightBlob* int8_tensor =
      reinterpret_cast<const Int8FCDNNLowPPackedWeightBlob*>(c);
  return GetTensorInfo(&(int8_tensor->original_tensor), capacity, device);
}

vector<int64_t>
Int8ConvDNNLowpPackedWeightBlobShapeFunctions::GetExternalTensorInfo(
    const void* c,
    size_t* capacity,
    DeviceOption* device) {
  const Int8ConvDNNLowPPackedWeightBlob* int8_tensor =
      reinterpret_cast<const Int8ConvDNNLowPPackedWeightBlob*>(c);
  return GetTensorInfo(&(int8_tensor->original_tensor), capacity, device);
}

void Int8FCDNNLowpPackedWeightBlobShapeFunctions::LoadInfoOfBlob(
    const Blob* blob,
    std::vector<float>* scale,
    std::vector<float>* offset,
    uint32_t* axis) {
  scale->clear();
  offset->clear();
  const Int8FCDNNLowPPackedWeightBlob* int8_tensor =
      reinterpret_cast<const Int8FCDNNLowPPackedWeightBlob*>(blob->GetRaw());
  const auto& qparams = int8_tensor->qparams;
  for (const auto& qparam : qparams) {
    scale->emplace_back(qparam.scale);
    offset->emplace_back(static_cast<float>(qparam.zero_point));
  }
  *axis = 1;
}

void Int8ConvDNNLowpPackedWeightBlobShapeFunctions::LoadInfoOfBlob(
    const Blob* blob,
    std::vector<float>* scale,
    std::vector<float>* offset,
    uint32_t* axis) {
  scale->clear();
  offset->clear();
  const Int8ConvDNNLowPPackedWeightBlob* int8_tensor =
      reinterpret_cast<const Int8ConvDNNLowPPackedWeightBlob*>(blob->GetRaw());
  const auto& qparams = int8_tensor->qparams;
  for (const auto& qparam : qparams) {
    scale->emplace_back(qparam.scale);
    offset->emplace_back(static_cast<float>(qparam.zero_point));
  }
  *axis = 1;
}

void Int8FCDNNLowpPackedWeightBlobShapeFunctions::SetupExternalTensorDescriptor(
    const Blob* blob,
    std::vector<std::vector<uint64_t>>* shapes,
    std::vector<std::vector<float>>* all_scales,
    std::vector<std::vector<int32_t>>* all_offsets,
    ExternalTensorDescriptor* desc) {
  const auto& dnntensor = blob->template Get<Int8FCDNNLowPPackedWeightBlob>();
  const Tensor& cpu_tensor = dnntensor.original_tensor;

  if (cpu_tensor.template IsType<uint8_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_UINT8;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<uint8_t>());
  } else if (cpu_tensor.template IsType<int32_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_INT32;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<int32_t>());
  } else if (cpu_tensor.template IsType<int8_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_INT8;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<int8_t>());
  } else {
    CAFFE_THROW(
        "Unsupported Int8FCDNNLowPPackedWeightBlob type in ONNXIFI: ",
        cpu_tensor.dtype().name());
  }

  desc->quantizationParams = dnntensor.qparams.size();
  desc->quantizationAxis = 1;
  std::vector<float> scales;
  std::vector<int32_t> offsets;
  for (const auto v : dnntensor.qparams) {
    scales.push_back(v.scale);
    int32_t cur_offset = v.zero_point;
    offsets.push_back(cur_offset);
  }
  all_scales->push_back(scales);
  all_offsets->push_back(offsets);
  desc->scales = all_scales->back().data();
  desc->biases = all_offsets->back().data();

  // Set up dim and shape
  const auto shape = cpu_tensor.sizes();
  desc->dimensions = shape.size();
  shapes->emplace_back(shape.cbegin(), shape.cend());
  desc->shape = shapes->back().data();

  // not an offline tensor
  desc->isOffline = 0;
}

void Int8ConvDNNLowpPackedWeightBlobShapeFunctions::
    SetupExternalTensorDescriptor(
        const Blob* blob,
        std::vector<std::vector<uint64_t>>* shapes,
        std::vector<std::vector<float>>* all_scales,
        std::vector<std::vector<int32_t>>* all_offsets,
        ExternalTensorDescriptor* desc) {
  const auto& dnntensor =
      blob->template Get<Int8ConvDNNLowPPackedWeightBlob>();
  const Tensor& cpu_tensor = dnntensor.original_tensor;

  if (cpu_tensor.template IsType<uint8_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_UINT8;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<uint8_t>());
  } else if (cpu_tensor.template IsType<int32_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_INT32;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<int32_t>());
  } else if (cpu_tensor.template IsType<int8_t>()) {
    desc->dataType = kONNXIFI_DATATYPE_INT8;
    desc->buffer = reinterpret_cast<uint64_t>(cpu_tensor.data<int8_t>());
  } else {
    CAFFE_THROW(
        "Unsupported Int8ConvDNNLowPPackedWeightBlob type in ONNXIFI: ",
        cpu_tensor.dtype().name());
  }

  desc->quantizationParams = dnntensor.qparams.size();
  desc->quantizationAxis = 1;
  std::vector<float> scales;
  std::vector<int32_t> offsets;
  for (const auto v : dnntensor.qparams) {
    scales.push_back(v.scale);
    int32_t cur_offset = v.zero_point;
    offsets.push_back(cur_offset);
  }
  all_scales->push_back(scales);
  all_offsets->push_back(offsets);
  desc->scales = all_scales->back().data();
  desc->biases = all_offsets->back().data();

  // Set up dim and shape
  const auto shape = cpu_tensor.sizes();
  desc->dimensions = shape.size();
  shapes->emplace_back(shape.cbegin(), shape.cend());
  desc->shape = shapes->back().data();

  // not an offline tensor
  desc->isOffline = 0;
}

// Explicitly register TypeMeta
CAFFE_KNOWN_TYPE(Int8FCDNNLowPPackedWeightBlob);
CAFFE_KNOWN_TYPE(Int8ConvDNNLowPPackedWeightBlob);

// Register DNNLOWP Type in caffe2 core
REGISTER_EXTERNAL_TENSOR_FUNCTIONS(
    (TypeMeta::Id<Int8FCDNNLowPPackedWeightBlob>()),
    Int8FCDNNLowpPackedWeightBlobShapeFunctions);
REGISTER_EXTERNAL_TENSOR_FUNCTIONS(
    (TypeMeta::Id<Int8ConvDNNLowPPackedWeightBlob>()),
    Int8ConvDNNLowpPackedWeightBlobShapeFunctions);

REGISTER_CPU_OPERATOR(Int8FCPackWeight, FullyConnectedDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP,
    FullyConnectedDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ACC16,
    FullyConnectedDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8FCPackWeight,
    DNNLOWP_ROWWISE,
    FullyConnectedDNNLowPPackWeightOp);

OPERATOR_SCHEMA(Int8FCPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1, 2)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape W = in[0];
      out.emplace_back(std::move(W));
      out[0].set_data_type(TensorProto_DataType_INT8);
      if (def.output_size() > 1) {
        TensorShape b = in[1];
        out.emplace_back(std::move(b));
        out[1].set_data_type(TensorProto_DataType_INT32);
      }
      return out;
    })
    .SetDoc(R"DOC(Prepack weight for Int8FC)DOC")
    .Input(0, "W", "Weight tensor")
    .Input(1, "b", "Bias tensor")
    .Output(
        0,
        "W_q",
        "Weight/bias tensor in a packed format "
        "with type Int8FCDNNLowPPackedWeightBlob")
    .Output(1, "B_q", "Bias int32 quantized tensor")
    .Arg("axis_w", "See FC operator")
    .Arg(
        "quantize_channelwise",
        "Default false. Per-output-channel quantization")
    .Arg(
        "save_unpacked_weights",
        "Default false. "
        "Store unpacked quantized weights to W_q.original_tensor")
    .Arg(
        "in_scale",
        "The scale of the input activation tensor. "
        "Only meaningful when bias is provided "
        "(NOTE: this is not the scale of the weight)");

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP,
    ConvDNNLowPPackWeightOp);

REGISTER_CPU_OPERATOR_WITH_ENGINE(
    Int8ConvPackWeight,
    DNNLOWP_ACC16,
    ConvDNNLowPPackWeightOp);

OPERATOR_SCHEMA(Int8ConvPackWeight)
    .NumInputs(1, 2)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      vector<TensorShape> out;
      TensorShape W = in[0];
      out.emplace_back(std::move(W));
      out[0].set_data_type(TensorProto_DataType_INT8);
      if (def.output_size() > 1) {
        TensorShape b = in[1];
        out.emplace_back(std::move(b));
        out[1].set_data_type(TensorProto_DataType_INT32);
      }
      return out;
    })
    .SetDoc(R"DOC(Prepack weight for Int8Conv)DOC")
    .Input(0, "W", "Weight tensor in KRSC layout")
    .Input(1, "b", "Bias tensor")
    .Output(
        0,
        "W_q",
        "Weight/bias tensor in a packed format "
        "with type Int8ConvDNNLowPPackedWeightBlob")
    .Arg("quantize_groupwise", "Default false. Per-group quantization")
    .Arg(
        "save_unpacked_weights",
        "Default false. "
        "Store unpacked quantized weights to W_q.original_tensor")
    .Arg(
        "in_scale",
        "The scale of the input activation tensor. "
        "Only meaningful when bias is provided "
        "(NOTE: this is not the scale of the weight)");

} // namespace caffe2