#pragma once

#ifdef _OPENMP
#include <omp.h>
#endif

#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/quantization/server/caffe2_dnnlowp_utils.h"
#include "caffe2/quantization/server/dnnlowp.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/op_wrapper.h"
#include "caffe2/quantization/server/sigmoid.h"
#include "caffe2/quantization/server/tanh.h"

#ifdef _OPENMP
C10_DECLARE_int(caffe2_omp_num_threads);
#endif

namespace caffe2 {

/**
 * @brief A convenient base class for C2 operators with the DNNLOWP engine.
 * DNNLOWP ops give flexibility on the type of input/output blobs.
 * For example, some inputs can be the usual fp32 tensors, in which case they
 * are quantized before the actual computation.
 * Otherwise, the inputs should be pre-quantized Int8TensorCPU.
 * One constraint: the weight is pre-quantized if and only if the bias is
 * also pre-quantized.
 *
 * Static quantization vs. dynamic quantization:
 * When the Y_scale and Y_zero_point (optional, default = 0) arguments are
 * set and dequantize_output is false, we do static quantization, meaning
 * we use the same pre-computed scale and zero_point for the output
 * activation tensor.
 * Otherwise, we do dynamic quantization by looking at the min/max of the
 * output activation tensor for each batch.
 * The Y_scale and Y_zero_point arguments are used only for static
 * quantization. The scale and zero_point of Int8TensorCPU are used to carry
 * quantization information across operators in both static and dynamic
 * quantization. This means the scale and zero_point of an Int8TensorCPU are
 * valid only for the current batch and will be reset in the next batch when
 * dynamic quantization is used.
 *
 * C2 operators with the DNNLOWP engine have the following arguments:
 * - dequantize_output (default=false): when true, the output is dequantized
 *   to fp32. Useful when we're only quantizing individual operators
 *   rather than doing end-to-end quantization.
 * - followed_by (default=null): can be relu, sigmoid, or tanh. When
 *   specified, the current operator is immediately followed by that
 *   activation, and this fact can be used for more accurate output
 *   quantization.
 * - measure_quantization_error (default=false): when true, the L2 error
 *   with respect to the baseline fp32 C2 operator is reported.
 *   WARNING: enabling this option makes execution very slow and is
 *   intended only for debugging accuracy issues.
 *
 * For the following quantization-method-related options, please refer
 * to caffe2/quantization/server/dnnlowp.cc for more details.
 *
 * - activation_quantization_precision (default=8)
 * - weight_quantization_precision (default=8)
 * - requantization_multiplier_precision (default=32)
 * - eltwise_quantization_precision (default=16)
 * - force_scale_power_of_two (default=0)
 * - preserve_activation_sparsity (default=0)
 * - preserve_weight_sparsity (default=0)
 * - activation_quantization_kind (default=min_max)
 * - weight_quantization_kind (default=min_max)
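 *
 * Example operator definition using some of these arguments (illustrative
 * only; the tensor names and the Y_scale / Y_zero_point values below are
 * made up):
 *
 *   op {
 *     type: "FC"
 *     engine: "DNNLOWP"
 *     input: "X"
 *     input: "W"
 *     input: "b"
 *     output: "Y"
 *     arg { name: "Y_scale" f: 0.025 }
 *     arg { name: "Y_zero_point" i: 0 }
 *     arg { name: "followed_by" s: "Relu" }
 *   }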
 */
template <typename T, typename FP32_OP>
class DNNLowPOp : public Operator<CPUContext> {
  static_assert(std::is_integral<T>::value, "Integral required.");

 public:
  USE_OPERATOR_FUNCTIONS(CPUContext);
  DNNLowPOp(const OperatorDef& operator_def, Workspace* ws)
      : Operator<CPUContext>(operator_def, ws),
        in_qparams_(InputSize()),
        qfactory_(dnnlowp::GetQuantizationFactoryOf(this)) {
#ifdef _OPENMP
    if (FLAGS_caffe2_omp_num_threads > 0) {
      omp_set_num_threads(FLAGS_caffe2_omp_num_threads);
    }
#endif
  }

  virtual ~DNNLowPOp() {
    if (measure_quantization_error_) {
      dnnlowp::ReportQuantizationError(this, quantization_error_stats_);
    }
  }

 protected:
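  // Returns the underlying TensorCPU of input idx, whether the input blob is
  // a plain fp32 TensorCPU, an Int8TensorCPU, or a packed FC weight blob.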
  const TensorCPU& InputTensorCPU_(int idx) {
    if (InputIsType<int8::Int8TensorCPU>(idx)) {
      return this->Input<int8::Int8TensorCPU>(idx).t;
    } else if (InputIsType<Int8FCDNNLowPPackedWeightBlob>(idx)) {
      return this->Input<Int8FCDNNLowPPackedWeightBlob>(idx).original_tensor;
    } else {
      return Input(idx);
    }
  }

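  // Returns the output as a plain TensorCPU when dequantize_output_ is true,
  // or the tensor embedded in the output Int8TensorCPU otherwise.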
  TensorCPU* OutputTensorCPU_(int idx) {
    if (dequantize_output_) {
      return Output(idx);
    } else {
      return &Outputs()[idx]->template GetMutable<int8::Int8TensorCPU>()->t;
    }
  }

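  // Returns a buffer of quantized type T to write the output into: either the
  // output tensor itself, or a temporary buffer (out_temp_) that is later
  // dequantized into the fp32 output by RunOnDeviceEpilogue_().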
  T* GetQuantizedOutputData_() {
    if (dequantize_output_) {
      out_temp_.resize(Output(0)->numel());
      return out_temp_.data();
    } else {
      return OutputTensorCPU_(0)->template mutable_data<T>();
    }
  }

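  // Compares the (dequantized) quantized output against the reference fp32
  // operator's output and accumulates the error statistics that are reported
  // at destruction. No-op unless measure_quantization_error is enabled.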
  void MeasureQuantizationError_() {
    if (!measure_quantization_error_ || !Fp32Op_()) {
      return;
    }

    const float* actual = nullptr;
    vector<float> actual_temp;
    if (OutputTensorCPU_(0)->template IsType<float>()) {
      actual = OutputTensorCPU_(0)->template data<float>();
    } else {
      // The output is quantized; dequantize it before comparing against the
      // fp32 reference.
      actual_temp.resize(OutputTensorCPU_(0)->numel());
      fbgemm::Dequantize<T>(
          OutputTensorCPU_(0)->template data<T>(),
          actual_temp.data(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
      actual = actual_temp.data();
    }

    float* ref = Fp32Op_()->Get()->Output(0)->template mutable_data<float>();
    if (followed_by_ == "Relu") {
      for (int i = 0; i < Output(0)->numel(); ++i) {
        ref[i] = std::max(0.f, ref[i]);
      }
    }

    dnnlowp::MeasureQuantizationError(
        actual, ref, OutputTensorCPU_(0)->numel(), &quantization_error_stats_);
  }

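  // Common epilogue: dequantizes the temporary output into fp32 when
  // dequantize_output_ is true, otherwise attaches out_qparams_ to the output
  // Int8TensorCPU, and then optionally measures the quantization error.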
  void RunOnDeviceEpilogue_() {
    if (dequantize_output_) {
      fbgemm::Dequantize<T>(
          out_temp_.data(),
          OutputTensorCPU_(0)->template mutable_data<float>(),
          OutputTensorCPU_(0)->numel(),
          out_qparams_);
    } else {
      dnnlowp::PropagateOutputTensorQuantizationParams(this, 0, out_qparams_);
    }

    MeasureQuantizationError_();
  }

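  // Parses the DNNLOWP arguments documented above. Safe to call multiple
  // times; parsing happens only once.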
  void ParseDNNLowPOperatorArguments_() {
    // Ideally, this would be done in the constructor, but any modification of
    // arguments made by ParseDNNLowPOperatorArguments is ignored when it is
    // called from the constructor.
    // Make sure all derived classes call this early enough so that they
    // use the correct parameters.
    if (!arguments_parsed_) {
      dnnlowp::ParseDNNLowPOperatorArguments(
          this,
          &dequantize_output_,
          &measure_quantization_error_,
          &followed_by_);
      arguments_parsed_ = true;
    }
  }

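  // Chooses out_qparams_: static quantization parameters from the Y_scale /
  // Y_zero_point arguments when present, or dynamic parameters derived from
  // the reference fp32 op's output otherwise.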
  void GetOutputQuantizationParams_() {
    using namespace dnnlowp;

    ParseDNNLowPOperatorArguments_();

    if (HasStaticQuantization(this)) {
      out_qparams_ = GetStaticQuantizationParamsOf(this, 0);

      if (measure_quantization_error_) {
        // To measure the quantization error, run the reference fp32 impl.
        // This doesn't really belong here, but we need to run the reference
        // fp32 implementation before the quantized computation of some
        // in-place operators overwrites their inputs.
        Fp32Op_()->DequantizeInput();
        Fp32Op_()->Get()->RunOnDevice();
      }
    } else {
      // TODO: this is only needed when dequantize_output_ == false, but leave
      // it as is for now because some code relies on out_qparams_ being
      // initialized even though it never actually uses it.
      Fp32Op_()->DequantizeInput();
      Fp32Op_()->Get()->RunOnDevice();
      out_qparams_ = Fp32Op_()->GetOutputQuantizationParams(qfactory_.get());
    }
  }

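  // Lazily constructs the wrapper around the reference fp32 operator, which
  // is used for dynamic quantization and for measuring quantization error.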
  OpWrapper<FP32_OP, T>* Fp32Op_() {
    if (!fp32_op_) {
      fp32_op_.reset(new OpWrapper<FP32_OP, T>(this, qfactory_.get()));
    }
    return fp32_op_.get();
  }

  bool dequantize_output_{false}, measure_quantization_error_{false};
  std::string followed_by_;

  std::vector<dnnlowp::TensorQuantizationParams> in_qparams_;
  dnnlowp::TensorQuantizationParams out_qparams_;

  std::unique_ptr<OpWrapper<FP32_OP, T>> fp32_op_;
  std::unique_ptr<dnnlowp::QuantizationFactory> qfactory_;

  // Buffer to temporarily store the quantized output when we output
  // dequantized (fp32) values.
  std::vector<T> out_temp_;

  dnnlowp::QuantizationErrorStats quantization_error_stats_;

  bool arguments_parsed_{false};
};

#define USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, FP32_OP)              \
  /* using override */ using BaseType = DNNLowPOp<T, FP32_OP>;       \
  /* using override */ using BaseType::GetOutputQuantizationParams_; \
  /* using override */ using BaseType::GetQuantizedOutputData_;      \
  /* using override */ using BaseType::Fp32Op_;                      \
  /* using override */ using BaseType::InputTensorCPU_;              \
  /* using override */ using BaseType::MeasureQuantizationError_;    \
  /* using override */ using BaseType::OutputTensorCPU_;             \
  /* using override */ using BaseType::RunOnDeviceEpilogue_;         \
  /* using override */ using BaseType::dequantize_output_;           \
  /* using override */ using BaseType::followed_by_;                 \
  /* using override */ using BaseType::in_qparams_;                  \
  /* using override */ using BaseType::measure_quantization_error_;  \
  /* using override */ using BaseType::out_qparams_;                 \
  /* using override */ using BaseType::qfactory_;

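// Illustrative sketch of how a derived DNNLOWP operator typically uses this
// base class. The operator below and its fp32 counterpart are hypothetical;
// see the concrete ops in this directory for real examples.
//
//   template <typename T>
//   class MyDNNLowPOp final : public DNNLowPOp<T, MyFp32Op> {
//    public:
//     USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, MyFp32Op);
//     MyDNNLowPOp(const OperatorDef& def, Workspace* ws) : BaseType(def, ws) {}
//
//     bool RunOnDevice() override {
//       // Pick static or dynamic output quantization parameters
//       // (this also parses the DNNLOWP arguments).
//       GetOutputQuantizationParams_();
//
//       // Quantize fp32 inputs on the fly if they are not Int8TensorCPU.
//       in_qparams_[0] = dnnlowp::GetInputTensorQuantizationParamsOf(
//           this, 0, qfactory_.get());
//
//       // ... compute into the quantized output buffer ...
//       T* Y_data = GetQuantizedOutputData_();
//
//       // Dequantize or propagate out_qparams_, then optionally measure the
//       // quantization error.
//       RunOnDeviceEpilogue_();
//       return true;
//     }
//   };
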
inline int dnnlowp_get_num_threads() {
#ifdef _OPENMP
  return omp_get_num_threads();
#else
  return 1;
#endif
}

inline int dnnlowp_get_max_threads() {
#ifdef _OPENMP
  return omp_get_max_threads();
#else
  return 1;
#endif
}

inline int dnnlowp_get_thread_num() {
#ifdef _OPENMP
  return omp_get_thread_num();
#else
  return 0;
#endif
}
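
// These helpers let DNNLOWP kernels partition work uniformly whether or not
// OpenMP is enabled. A minimal usage sketch (N and the loop body are
// placeholders):
//
//   #pragma omp parallel
//   {
//     int nthreads = dnnlowp_get_num_threads();
//     int tid = dnnlowp_get_thread_num();
//     // Statically partition [0, N) across threads.
//     int work = (N + nthreads - 1) / nthreads;
//     int begin = std::min(tid * work, N);
//     int end = std::min(begin + work, N);
//     for (int i = begin; i < end; ++i) {
//       // ... process element i ...
//     }
//   }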

} // namespace caffe2