#pragma once
#ifdef _OPENMP
#include <omp.h>
#endif
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/op_wrapper.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"
#ifdef _OPENMP
C10_DECLARE_int(caffe2_omp_num_threads);
#endif
namespace caffe2 {
/**
* @brief A convenient base class for C2 operators with DNNLOWP engine.
* DNNLOWP ops give flexibility on the type of input/output blobs.
 * For example, some inputs can be the usual fp32 tensors, in which case
 * they will be quantized before the actual computation; otherwise, the
 * inputs should be pre-quantized Int8TensorCPU.
 * One constraint: the weight is pre-quantized if and only if the bias is
 * also pre-quantized.
*
* static quantization vs. dynamic quantization
 * When the Y_scale and Y_zero_point (optional, default = 0) args are set,
* and dequantize_output is false, we do static quantization, meaning
* we're using the same pre-computed scale and zero_point for the output
* activation tensor.
 * Otherwise, we do dynamic quantization by looking at the min/max of the
 * output activation tensor for each batch.
* Y_scale and Y_zero_point arguments are used for static quantization.
 * The scale and zero_point of Int8TensorCPU are used to carry
 * quantization information across operators in both static and dynamic
 * quantization. This means that, when dynamic quantization is used, the
 * scale and zero_point of Int8TensorCPU are valid only for the current
 * batch and will be recomputed for the next batch.
*
* C2 operators with DNNLOWP engine have the following arguments:
* - dequantize_output (default=false): when true, output is dequantized
* as fp32. Useful when we're only quantizing individual operators
* rather than doing end-to-end quantization.
 * - followed_by (default=null): can be relu, sigmoid, or tanh. When
 * specified, the current operator is assumed to be followed only by the
 * given operator, and this fact can be used for more accurate output
 * quantization.
* - measure_quantization_error (default=false): when true, L2 error
* with respect to the baseline C2 operator in fp32 is reported.
 * WARNING: turning on this option slows execution down considerably; it
 * is intended only for debugging accuracy issues.
*
* For the following quantization method related options, please refer
* to caffe2/quantization/server/dnnlowp.cc for more details.
*
* - activation_quantization_precision (default=8)
* - weight_quantization_precision (default=8)
* - requantization_multiplier_precision (default=32)
* - eltwise_quantization_precision (default=16)
* - force_scale_power_of_two (default=0)
* - preserve_activation_sparsity (default=0)
* - preserve_weight_sparsity (default=0)
* - activation_quantization_kind (default=min_max)
* - weight_quantization_kind (default=min_max)
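 *
 * As an illustration only (the blob names and arg values below are
 * hypothetical), a statically quantized FC with the DNNLOWP engine that is
 * known to be followed by Relu could be declared roughly as:
 *
 *   op {
 *     type: "FC"
 *     engine: "DNNLOWP"
 *     input: "X" input: "W" input: "b"
 *     output: "Y"
 *     arg { name: "Y_scale" f: 0.025 }
 *     arg { name: "Y_zero_point" i: 0 }
 *     arg { name: "followed_by" s: "Relu" }
 *   }
 *
 * Omitting Y_scale/Y_zero_point would switch the op to dynamic
 * quantization, and adding arg { name: "dequantize_output" i: 1 } would
 * make it produce an fp32 tensor instead of an Int8TensorCPU.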
*/
template <typename T, typename FP32_OP>
class DNNLowPOp : public Operator<CPUContext> {
static_assert(std::is_integral<T>::value, "Integral required.");
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
DNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws),
in_qparams_(InputSize()),
qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
if (FLAGS_caffe2_omp_num_threads > 0) {
omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
}
#endif
}
virtual ~DNNLowPOp() {
if (measure_quantization_error_) {
dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
}
}
protected:
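  // Returns the underlying TensorCPU of input idx, regardless of whether the
  // input blob holds a plain fp32 Tensor, an Int8TensorCPU, or a packed
  // weight blob (in which case the original unpacked tensor is returned).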
const TensorCPU& InputTensorCPU_(int idx) {
if (InputIsType<int8::Int8TensorCPU>(idx)) {
return this->Input<int8::Int8TensorCPU>(idx).t;
} else if (InputIsType<Int8FCDNNLowPPackedWeightBlob>(idx)) {
return this->Input<Int8FCDNNLowPPackedWeightBlob>(idx).original_tensor;
} else {
return Input(idx);
}
}
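  // Returns the output as a plain fp32 Tensor when dequantize_output is set,
  // or the TensorCPU inside the output Int8TensorCPU otherwise.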
TensorCPU* OutputTensorCPU_(int idx) {
if (dequantize_output_) {
return Output(idx);
} else {
return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
}
}
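  // Returns a buffer to hold the quantized result: a temporary buffer when
  // the final output is dequantized to fp32, or the output tensor's own
  // storage otherwise.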
T* GetQuantizedOutputData_() {
if (dequantize_output_) {
out_temp_.resize(Output(0)->numel());
return out_temp_.data();
} else {
return OutputTensorCPU_(0)->template mutable_data<T>();
}
}
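  // Compares the (dequantized) output against the reference fp32 operator's
  // output and accumulates the error statistics reported at destruction.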
void MeasureQuantizationError_() {
if (!measure_quantization_error_ || !Fp32Op_()) {
return;
}
const float* actual = nullptr;
vector<float> actual_temp;
if (OutputTensorCPU_(0)->template IsType<float>()) {
actual = OutputTensorCPU_(0)->template data<float>();
} else {
actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
actual_temp.data(),
OutputTensorCPU_(0)->numel(),
out_qparams_);
actual = actual_temp.data();
}
float* ref = Fp32Op_()->Get()->Output(0)->template mutable_data<float>();
if (followed_by_ == "Relu") {
      for (int i = 0; i < OutputTensorCPU_(0)->numel(); ++i) {
ref[i] = std::max(0.f, ref[i]);
}
}
dnnlowp::MeasureQuantizationError(
actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
}
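  // Common post-processing: dequantize the output if requested, otherwise
  // propagate the output quantization parameters to the output blob; then
  // optionally measure the quantization error.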
void RunOnDeviceEpilogue_() {
if (dequantize_output_) {
fbgemm::Dequantize<T>(
out_temp_.data(),
OutputTensorCPU_(0)->template mutable_data<float>(),
OutputTensorCPU_(0)->numel(),
out_qparams_);
} else {
dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
}
MeasureQuantizationError_();
}
void ParseDNNLowPOperatorArguments_() {
    // Ideally, this should be done in the constructor, but any modification
    // of arguments made by ParseDNNLowPOperatorArguments would be ignored if
    // it were called from the constructor.
    // Make sure all derived classes call this early enough so that they use
    // the correct parameters.
if (!arguments_parsed_) {
dnnlowp::ParseDNNLowPOperatorArguments(
this,
&dequantize_output_,
&measure_quantization_error_,
&followed_by_);
arguments_parsed_ = true;
}
}
void GetOutputQuantizationParams_() {
using namespace dnnlowp;
ParseDNNLowPOperatorArguments_();
if (HasStaticQuantization(this)) {
out_qparams_ = GetStaticQuantizationParamsOf(this, 0);
if (measure_quantization_error_) {
        // To measure quantization error, run the reference fp32 implementation.
        // This doesn't really belong here, but we need to run the reference
        // fp32 implementation before the quantized computation because some
        // in-place operators would otherwise overwrite their inputs.
Fp32Op_()->DequantizeInput();
Fp32Op_()->Get()->RunOnDevice();
}
} else {
      // TODO: this is only needed when dequantize_output_ == false, but leave
      // it as is for now because some code relies on out_qparams_ being
      // initialized even though it never actually uses it.
Fp32Op_()->DequantizeInput();
Fp32Op_()->Get()->RunOnDevice();
out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
}
}
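  // Lazily creates and returns the wrapper around the reference fp32
  // operator used for dynamic quantization and error measurement.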
OpWrapper<FP32_OP, T>* Fp32Op_() {
if (!fp32_op_) {
fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
}
return fp32_op_.get();
}
bool dequantize_output_{false}, measure_quantization_error_{false};
std::string followed_by_;
std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
dnnlowp::TensorQuantizationParams out_qparams_;
std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;
  // Buffer to store the quantized output temporarily
  // when we output dequantized values.
  std::vector<T> out_temp_;
dnnlowp::QuantizationErrorStats quantization_error_stats_;
bool arguments_parsed_{false};
};
#define USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP) \
/* using override */ using BaseType = DNNLowPOp<T, FP32_OP>; \
/* using override */ using BaseType::GetOutputQuantizationParams_; \
/* using override */ using BaseType::GetQuantizedOutputData_; \
/* using override */ using BaseType::Fp32Op_; \
/* using override */ using BaseType::InputTensorCPU_; \
/* using override */ using BaseType::MeasureQuantizationError_; \
/* using override */ using BaseType::OutputTensorCPU_; \
/* using override */ using BaseType::RunOnDeviceEpilogue_; \
/* using override */ using BaseType::dequantize_output_; \
/* using override */ using BaseType::followed_by_; \
/* using override */ using BaseType::in_qparams_; \
/* using override */ using BaseType::measure_quantization_error_; \
/* using override */ using BaseType::out_qparams_; \
/* using override */ using BaseType::qfactory_;
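// A sketch of how a derived operator typically uses DNNLowPOp and the macro
// above (the class and operator names here are hypothetical):
//
//   template <typename T>
//   class MyDNNLowPOp final : public DNNLowPOp<T, MyFp32Op> {
//    public:
//     USE_OPERATOR_FUNCTIONS(CPUContext);
//     USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MyFp32Op);
//     MyDNNLowPOp(const OperatorDef& def, Workspace* ws)
//         : BaseType(def, ws) {}
//
//     bool RunOnDevice() override {
//       GetOutputQuantizationParams_();
//       // quantize inputs, compute into GetQuantizedOutputData_() ...
//       RunOnDeviceEpilogue_();
//       return true;
//     }
//   };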
inline int dnnlowp_get_num_threads() {
#ifdef _OPENMP
return omp_get_num_threads();
#else
return 1;
#endif
}
inline int dnnlowp_get_max_threads() {
#ifdef _OPENMP
return omp_get_max_threads();
#else
return 1;
#endif
}
inline int dnnlowp_get_thread_num() {
#ifdef _OPENMP
return omp_get_thread_num();
#else
return 0;
#endif
}
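// A sketch of typical use of the helpers above inside an OpenMP parallel
// region (they fall back to single-threaded values when _OPENMP is not
// defined):
//
//   #ifdef _OPENMP
//   #pragma omp parallel
//   #endif
//   {
//     int nthreads = dnnlowp_get_num_threads();
//     int tid = dnnlowp_get_thread_num();
//     // partition the work range among threads [0, nthreads) using tid
//   }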
} // namespace caffe2