#pragma once
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_op.h"
#include "caffe2/quantization/server/conv_pool_dnnlowp_op_base.h"
#include "caffe2/quantization/server/fbgemm_pack_blob.h"
#include "caffe2/quantization/server/fully_connected_dnnlowp_op.h"
namespace caffe2 {
using FCFp32Op = FullyConnectedOp<CPUContext>;
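// Packs the weight matrix (and, optionally, the bias) of a DNNLOWP Int8FC
// operator ahead of time so the packed blob can be shared by operators in the
// predictor net; see the comment on ConvDNNLowPPackWeightOp below for the
// rationale behind pre-packing.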
class FullyConnectedDNNLowPPackWeightOp final
: public DNNLowPOp<std::uint8_t, FCFp32Op> {
public:
FullyConnectedDNNLowPPackWeightOp(
const OperatorDef& operator_def,
Workspace* ws);
USE_OPERATOR_FUNCTIONS(CPUContext);
bool RunOnDevice() override;
private:
int axis_w_;
bool quantize_channelwise_;
int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
INPUT_TAGS(FILTER, BIAS);
};
using ConvFp32Op = ConvOp<float, CPUContext>;
/**
 * Packs a weight matrix so it can be used by DNNLOWP Int8Conv operators.
 * DNNLOWP operators can pack the matrix on demand during their first
 * invocation, but pre-packing with this operator has benefits such as saving
 * memory when multiple operators share the same weight.
 * This operator should be part of the init net so that it runs once and
 * populates the packed blob consumed by Int8Conv DNNLOWP operators in the
 * predictor net (see the example sketch after this class).
 *
 * This operator can optionally pre-quantize the bias as well. In that case,
 * the scale of the input activation tensor must also be provided via the
 * in_scale argument.
 */
class ConvDNNLowPPackWeightOp final
: public ConvPoolDNNLowPOpBase<std::uint8_t, ConvFp32Op> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(CPUContext);
USE_CONV_POOL_DNNLOWP_OPERATOR_BASE_FUNCTIONS(std::uint8_t, ConvFp32Op);
ConvDNNLowPPackWeightOp(const OperatorDef& operator_def, Workspace* ws);
bool RunOnDevice() override;
private:
bool TakeDepthWise3x3FastPath_();
bool TakeDepthWise3x3x3FastPath_();
bool TakeGConvFastPath_();
bool quantize_groupwise_;
int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
INPUT_TAGS(FILTER, BIAS);
};
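// A minimal usage sketch for ConvDNNLowPPackWeightOp, assuming the operator
// is registered as "Int8ConvPackWeight" under the DNNLOWP engine and that the
// blob names and the in_scale value below are placeholders:
//
//   caffe2::NetDef init_net;
//   auto* op = init_net.add_op();
//   op->set_type("Int8ConvPackWeight");
//   op->set_engine("DNNLOWP");
//   op->add_input("conv1_w");          // FILTER
//   op->add_input("conv1_b");          // BIAS (optional)
//   op->add_output("conv1_w_packed");
//   auto* in_scale = op->add_arg();
//   in_scale->set_name("in_scale");    // only needed to pre-quantize the bias
//   in_scale->set_f(0.0235f);          // scale of the input activation tensor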
// Helper functions for packing weights that can be used by
// ConvDNNLowPAcc16PackWeightOp, ConvDNNLowPOp, and ConvDNNLowPAcc16Op
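// Quantizes the weight stored in `blob`, viewed as an M x kernel_dim matrix,
// into `w_quantized`; the per-tensor or per-channel quantization parameters
// that were used are returned in `qparams` (one entry per quantization group).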
template <typename T>
void QuantizeWeight(
const Blob& blob,
int kernel_dim,
int M,
vector<dnnlowp::TensorQuantizationParams>& qparams,
vector<typename std::make_signed<T>::type>& w_quantized,
dnnlowp::QuantizationFactory* qfactory);
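// Computes per-column offsets (column sums adjusted for the weight zero point
// in `qparams`) of the num_rows x num_cols quantized matrix W; these offsets
// are consumed by the requantization step of DNNLOWP operators.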
template <typename T>
void ComputeColumnOffsets(
int num_rows,
int num_cols,
const T* W,
const vector<dnnlowp::TensorQuantizationParams>& qparams,
vector<int32_t>& col_offsets);
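// Counts the entries of W_quantized whose signed value does not fit in
// nbits_in_non_outlier bits; such entries are treated as outliers by the
// DNNLOWP_ACC16 engine and handled through a separate sparse matrix.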
int CountOutliers(
int groups,
int kernel_dim,
int M,
int nbits_in_non_outlier,
vector<std::int8_t>& W_quantized);
/**
 * Extracts the entries of W_quantized that do not fit in nbits_in_non_outlier
 * bits into the returned sparse matrix and zeroes them out in W_quantized,
 * so the dense matrix keeps only the non-outlier values.
 *
 * @param W_quantized input quantized weight that is not packed yet;
 *                    modified in place as described above
 */
fbgemm::CompressedSparseColumn* ExtractOutlierMatrix(
int groups,
int kernel_dim,
int M,
int nbits_in_non_outlier,
vector<std::int8_t>& W_quantized);
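// A minimal sketch (with assumed variable names such as filter_blob, groups,
// kernel_dim, M, nbits_in_non_outlier, and qfactory, and with qparams sized
// to the number of quantization groups beforehand) of how these helpers are
// typically combined when preparing an ACC16 weight with outlier handling:
//
//   std::vector<dnnlowp::TensorQuantizationParams> qparams(1);
//   std::vector<std::int8_t> W_quantized;
//   QuantizeWeight<std::uint8_t>(
//       *filter_blob, kernel_dim, M, qparams, W_quantized, qfactory);
//   std::unique_ptr<fbgemm::CompressedSparseColumn> W_outlier(
//       ExtractOutlierMatrix(
//           groups, kernel_dim, M, nbits_in_non_outlier, W_quantized));
//   std::vector<std::int32_t> col_offsets;
//   ComputeColumnOffsets<std::int8_t>(
//       kernel_dim, M, W_quantized.data(), qparams, col_offsets);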
} // namespace caffe2