#pragma once

#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/fully_connected_dnnlowp_op.h"

namespace caffe2 {

using FCFp32Op = FullyConnectedOp<CPUContext>;

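/**
 * Quantize a convolution bias blob to int32, given the quantization
 * parameters of the input activation and of the filter.
 *
 * The effective scale for output channel i is typically
 * in_qparams.scale * filter_qparams[i].scale (a single filter scale is used
 * when quantization is not channel-wise); see the .cc file for the exact
 * rounding and fp16 handling.
 */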
void QuantizeConvBias(
    const Blob& blob,
    int M,
    const dnnlowp::TensorQuantizationParams& in_qparams,
    const vector<dnnlowp::TensorQuantizationParams>& filter_qparams,
    std::vector<int32_t>& b_quantized,
    bool use_fp16 = false,
    bool round_nearest_even = true);

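/**
 * Pack a weight matrix for use by DNNLOWP Int8FC operators.
 * Like the conv variant below, this operator is intended to run once as part
 * of the init net so that the packed blob can be shared by the Int8FC DNNLOWP
 * operators in the predictor net.
 */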
class FullyConnectedDNNLowPPackWeightOp final
    : public DNNLowPOp<std::uint8_t, FCFp32Op> {
 public:
  FullyConnectedDNNLowPPackWeightOp(
      const OperatorDef& operator_def,
      Workspace* ws);
  USE_OPERATOR_FUNCTIONS(CPUContext);

  bool RunOnDevice() override;

 private:
  int axis_w_;
  bool quantize_channelwise_;
  int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
  bool save_unpacked_weights_;

  INPUT_TAGS(FILTER, BIAS);
};

using ConvFp32Op = ConvOp<float, CPUContext>;

/**
 * Pack a weight matrix that can be used by DNNLOWP Int8Conv operators.
 * DNNLOWP operators can pack the matrix on demand during their first
 * invocation, but pre-packing with this operator has benefits such as saving
 * memory when multiple operators share the same weight.
 * This operator should be part of the init net so it runs once and populates
 * the packed blob consumed by Int8Conv DNNLOWP operators in the predictor
 * net. See the usage sketch after this class.
 *
 * This operator can optionally pre-quantize the bias as well. In that case,
 * the scale of the input activation tensor must also be provided via the
 * in_scale argument.
 */
class ConvDNNLowPPackWeightOp final
    : public ConvPoolDNNLowPOpBase<std::uint8_t, ConvFp32Op> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
  USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(std::uint8_t, ConvFp32Op);
  ConvDNNLowPPackWeightOp(const OperatorDef& operator_def, Workspace* ws);

  bool RunOnDevice() override;

 private:
  bool TakeDepthWise3x3FastPath_();
  bool TakeDepthWise3x3x3FastPath_();
  bool TakeGConvFastPath_();

  fbgemm::conv_param_t<> GetConvParam_();
  fbgemm::conv_param_t<3> GetConv3DParam_();

  // Save the quantized weights right after quantization, before layout
  // packing, for performance purposes
  bool save_unpacked_weights_;
  bool quantize_groupwise_;
  int nbits_in_non_outlier_; // only for DNNLOWP_ACC16

  INPUT_TAGS(FILTER, BIAS);
};
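
// Illustrative usage sketch (not part of this library): adding the pre-pack
// operator to an init net so the predictor net can consume the packed blob.
// The operator name "Int8ConvPackWeight", the "DNNLOWP" engine string, and
// the argument/blob names below are assumptions based on this header; verify
// them against the operator registration in the corresponding .cc file.
//
//   caffe2::OperatorDef def;
//   def.set_type("Int8ConvPackWeight");
//   def.set_engine("DNNLOWP");
//   def.add_input("conv_w"); // FILTER
//   def.add_input("conv_b"); // BIAS (optional; enables bias pre-quantization)
//   def.add_output("conv_w_packed");
//   auto* in_scale = def.add_arg();
//   in_scale->set_name("in_scale"); // input activation scale, used for bias
//   in_scale->set_f(0.0123f);
//   init_net.add_op()->CopyFrom(def); // append to the init NetDef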

// Helper functions for packing weights that can be used by
// ConvDNNLowPAcc16PackWeightOp, ConvDNNLowPOp, and ConvDNNLowPAcc16Op

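/**
 * Quantize a weight blob with M output channels and kernel_dim elements per
 * channel into w_quantized, filling one TensorQuantizationParams entry per
 * quantization group in qparams (one per output channel when quantizing
 * channel-wise). See the .cc file for how qfactory is used to pick the
 * parameters.
 */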
template <typename T>
void QuantizeWeight(
    const Blob& blob,
    int kernel_dim,
    int M,
    vector<dnnlowp::TensorQuantizationParams>& qparams,
    vector<typename std::make_signed<T>::type>& w_quantized,
    dnnlowp::QuantizationFactory* qfactory);

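/**
 * Compute one offset per column of the num_rows x num_cols quantized weight
 * matrix W. Each offset is essentially the sum of the quantized values in
 * that column (with the weight zero point folded in where applicable); the
 * int8 GEMM uses these sums to compensate for the activation zero point.
 */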
template <typename T>
void ComputeColumnOffsets(
    int num_rows,
    int num_cols,
    const T* W,
    const vector<dnnlowp::TensorQuantizationParams>& qparams,
    vector<int32_t>& col_offsets);

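/**
 * Count the entries of W_quantized whose value does not fit in
 * nbits_in_non_outlier bits. These "outliers" are handled by the sparse
 * higher-precision path of DNNLOWP_ACC16 (see ExtractOutlierMatrix below).
 */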
int CountOutliers(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<std::int8_t>& W_quantized);

/**
 * Extract the entries of W_quantized that do not fit in nbits_in_non_outlier
 * bits into a compressed sparse column matrix (the "outlier" matrix used by
 * DNNLOWP_ACC16).
 * @param W_quantized input quantized weight that is not packed yet; passed by
 *        non-const reference because extraction may modify the outlier entries
 */
fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
    int groups,
    int kernel_dim,
    int M,
    int nbits_in_non_outlier,
    vector<std::int8_t>& W_quantized);

/*
 * ONNXIFI data type constants used below.
 * These values must be kept in sync with onnxifi.h.
 */
constexpr uint64_t kONNXIFI_DATATYPE_UINT8 = 2;
constexpr uint64_t kONNXIFI_DATATYPE_INT32 = 6;
constexpr uint64_t kONNXIFI_DATATYPE_INT8 = 3;

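// Exposes shape, scale, offset, and type information of Int8Conv DNNLOWP
// packed-weight blobs to external backends such as ONNXIFI.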
class Int8ConvDNNLowpPackedWeightBlobShapeFunctions
    : public ExternalTensorFunctionsBase {
 public:
  explicit Int8ConvDNNLowpPackedWeightBlobShapeFunctions()
      : ExternalTensorFunctionsBase() {}
  ~Int8ConvDNNLowpPackedWeightBlobShapeFunctions() override {}
  bool isQuantized() const override {
    return true;
  }
  bool IsSameMetaType(TypeIdentifier id) override;
  void SetupExternalTensorDescriptor(
      const Blob* blob,
      std::vector<std::vector<uint64_t>>* shapes,
      std::vector<std::vector<float>>* all_scales,
      std::vector<std::vector<int32_t>>* all_offsets,
      ExternalTensorDescriptor* desc) override;
  void LoadInfoOfBlob(
      const Blob* blob,
      std::vector<float>* scale,
      std::vector<float>* offset,
      uint32_t* axis) override;
  TypeIdentifier GetTypeMetaId() override;
  TypeMeta GetExternalTensorType(const void* c) override;
  vector<int64_t> GetExternalTensorInfo(
      const void* c,
      size_t* capacity,
      DeviceOption* device) override;
};

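// Same as above, but for Int8FC DNNLOWP packed-weight blobs.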
class Int8FCDNNLowpPackedWeightBlobShapeFunctions
    : public ExternalTensorFunctionsBase {
 public:
  explicit Int8FCDNNLowpPackedWeightBlobShapeFunctions()
      : ExternalTensorFunctionsBase() {}
  ~Int8FCDNNLowpPackedWeightBlobShapeFunctions() override {}
  bool isQuantized() const override {
    return true;
  }
  bool IsSameMetaType(TypeIdentifier id) override;
  void SetupExternalTensorDescriptor(
      const Blob* blob,
      std::vector<std::vector<uint64_t>>* shapes,
      std::vector<std::vector<float>>* all_scales,
      std::vector<std::vector<int32_t>>* all_offsets,
      ExternalTensorDescriptor* desc) override;
  void LoadInfoOfBlob(
      const Blob* blob,
      std::vector<float>* scale,
      std::vector<float>* offset,
      uint32_t* axis) override;
  TypeIdentifier GetTypeMetaId() override;
  TypeMeta GetExternalTensorType(const void* c) override;
  vector<int64_t> GetExternalTensorInfo(
      const void* c,
      size_t* capacity,
      DeviceOption* device) override;
};

} // namespace caffe2