#pragma once
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <limits>
#ifdef __x86_64__
#include <immintrin.h>
#endif
#include <fbgemm/QuantUtils.h>
#include "caffe2/quantization/server/dynamic_histogram.h"
#include "caffe2/utils/cpuid.h"
namespace dnnlowp {
using fbgemm::RequantizationParams;
using fbgemm::TensorQuantizationParams;
// Represents a quantization scheme that provides quantization parameters
// based on the distribution of the data to be quantized.
class QuantizationFactory {
public:
enum QuantizationKind {
// A simple quantization scheme that determines quantization parameters by
// just looking at the min/max of the data.
MIN_MAX_QUANTIZATION,
// Minimizes the L2 norm of the quantization error
L2_MIN_QUANTIZATION,
// Fast search that removes histogram outliers to approximate the L2 minimum
L2_MIN_QUANTIZATION_APPROX,
// Minimizes the Kullback-Leibler divergence
KL_MIN_QUANTIZATION,
// Takes the 99th percentile (only works with sparsity-preserving
// quantization)
P99_QUANTIZATION,
// Minimizes the L1 norm of the quantization error
L1_MIN_QUANTIZATION,
};
/// Get the default factory whose policy is determined by gflags
static QuantizationFactory* GetDefaultInstance();
/// Choose a quantization scale and zero_point that map the
/// floating-point range [min, max] to the integer range of the specified
/// precision
TensorQuantizationParams ChooseQuantizationParams(
float min,
float max,
int precision,
bool preserve_sparsity,
bool is_signed = false) const {
TensorQuantizationParams qparams = fbgemm::ChooseQuantizationParams(
min,
max,
is_signed ? -(1 << (precision - 1)) : 0,
is_signed ? ((1 << (precision - 1)) - 1) : (1 << precision) - 1,
preserve_sparsity,
force_scale_power_of_two_);
qparams.precision = precision;
return qparams;
}
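// A minimal usage sketch (the values below are hypothetical, not from this
// codebase): mapping [-1.0f, 1.0f] to unsigned 8-bit gives
// scale = (max - min) / 255 ~= 0.00784 and a zero_point near the middle of
// [0, 255].
//
//   TensorQuantizationParams qparams =
//       QuantizationFactory::GetDefaultInstance()->ChooseQuantizationParams(
//           -1.0f, 1.0f, /*precision=*/8, /*preserve_sparsity=*/false);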
/// Choose a quantization scale and zero_point that map the
/// floating-point range [min, max] to the default integer range of
/// this quantization factory
TensorQuantizationParams
ChooseQuantizationParams(float min, float max, bool is_weight = false) const {
return ChooseQuantizationParams(
min,
max,
is_weight ? GetWeightPrecision() : GetActivationPrecision(),
is_weight ? GetPreserveWeightSparsity()
: GetPreserveActivationSparsity());
}
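// A brief sketch of the default-range overload; `qfactory`, `act_min`, and
// the other names are hypothetical placeholders. Precision and sparsity
// handling come from this factory's configuration, so activations and
// weights may use different settings:
//
//   TensorQuantizationParams act_qparams =
//       qfactory->ChooseQuantizationParams(act_min, act_max);
//   TensorQuantizationParams wgt_qparams =
//       qfactory->ChooseQuantizationParams(wgt_min, wgt_max, /*is_weight=*/true);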
/// Choose quantization parameters based on the values in an array, optimizing
/// the quantization error while ignoring a few outliers
TensorQuantizationParams ChooseQuantizationParams(
const float* values,
int len,
QuantizationKind kind,
int precision,
bool preserve_sparsity) const;
TensorQuantizationParams ChooseQuantizationParams(
const float* values,
int len,
bool is_weight = false) const;
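// A minimal sketch of the array-based overloads (the data below is
// hypothetical). The two-argument form picks the kind, precision, and
// sparsity handling from this factory's weight or activation settings:
//
//   std::vector<float> observed = {-0.5f, 0.0f, 0.25f, 1.0f};
//   TensorQuantizationParams qparams =
//       QuantizationFactory::GetDefaultInstance()->ChooseQuantizationParams(
//           observed.data(), static_cast<int>(observed.size()),
//           /*is_weight=*/false);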
/// Choose quantization parameters based on a histogram of values, optimizing
/// the quantization error while ignoring a few outliers
TensorQuantizationParams ChooseQuantizationParams(
const Histogram& hist,
QuantizationKind kind,
int precision,
bool preserve_sparsity,
bool is_weight = false) const;
TensorQuantizationParams ChooseQuantizationParams(
const Histogram& hist,
bool is_weight = false) const;
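// A hedged sketch of the histogram-based overload, assuming the Histogram
// interface in dynamic_histogram.h provides an (nbins, min, max) constructor
// and an Add() method (check that header for the exact API):
//
//   Histogram hist(/*nbins=*/2048, /*min=*/-1.0f, /*max=*/1.0f);
//   // ... fill hist from observed values via hist.Add(...) ...
//   TensorQuantizationParams qparams =
//       QuantizationFactory::GetDefaultInstance()->ChooseQuantizationParams(
//           hist, QuantizationFactory::KL_MIN_QUANTIZATION, /*precision=*/8,
//           /*preserve_sparsity=*/false);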
// Given a real_multiplier, produces a pair (quantized_multiplier,
// right_shift) where quantized_multiplier is an int32 representing a
// fixed-point value (in practice we only produce positive values) and
// right_shift is an amount to shift right by, so that the floating-point
// multiplication of some int32 input value by real_multiplier,
//
// return static_cast<int32>(int32_value * real_multiplier);
//
// is best approximated by the integer-arithmetic-only code
//
// return RoundingRightShift(
// Multiplication(int32_value, quantized_multiplier),
// right_shift);
//
// Note: all this code only needs to run offline to generate the quantized
// neural network workload, not at runtime on the device on which quantized
// neural networks need to run. So it's not performance-critical at all.
RequantizationParams ChooseRequantizationMultiplier(
float real_multiplier,
TensorQuantizationParams target_qparams) const;
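// A hedged numeric illustration: the returned pair approximates
// real_multiplier as quantized_multiplier / 2^right_shift. For example,
// real_multiplier = 0.5 could be represented as quantized_multiplier ~= 2^30
// with right_shift = 31, since 2^30 / 2^31 = 0.5; the exact values chosen
// depend on requantization_multiplier_precision_.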
int GetActivationPrecision() const {
return activation_precision_;
}
int GetWeightPrecision() const {
return weight_precision_;
}
int GetEltwiseQuantizePrecision() const {
return eltwise_quantize_precision_;
}
bool GetPreserveActivationSparsity() const {
return preserve_activation_sparsity_;
}
bool GetPreserveWeightSparsity() const {
return preserve_weight_sparsity_;
}
QuantizationKind GetActivationKind() const {
return activation_kind_;
}
QuantizationKind GetWeightKind() const {
return weight_kind_;
}
void SetWeightP99Threshold(float threshold) {
weight_p99_threshold_ = threshold;
}
void SetActivationP99Threshold(float threshold) {
activation_p99_threshold_ = threshold;
}
explicit QuantizationFactory(
// precision used for activations in main operations like matmul
int activation_precision = 8,
// precision used for weights
int weight_precision = 8,
// precision used for the requantization multiplier
int requantization_multiplier_precision = 32,
// precision used for element-wise addition
int eltwise_quantize_precision = 16,
// preserve zeros in quantization of activations
bool preserve_activation_sparsity = false,
// preserve zeros in quantization of weights
bool preserve_weight_sparsity = false,
// restrict scaling to a power of two
bool force_scale_power_of_two = false,
QuantizationKind activation_kind = MIN_MAX_QUANTIZATION,
QuantizationKind weight_kind = MIN_MAX_QUANTIZATION,
// P99 percentage to select out from the full histogram for weights
float weight_p99_threshold = 0.99,
// P99 percentage to select out from the full histogram for activations
float activation_p99_threshold = 0.99);
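// A construction sketch with non-default options (the argument choices are
// hypothetical, not a recommended configuration):
//
//   QuantizationFactory factory(
//       /*activation_precision=*/8,
//       /*weight_precision=*/8,
//       /*requantization_multiplier_precision=*/32,
//       /*eltwise_quantize_precision=*/16,
//       /*preserve_activation_sparsity=*/false,
//       /*preserve_weight_sparsity=*/true,
//       /*force_scale_power_of_two=*/false,
//       QuantizationFactory::MIN_MAX_QUANTIZATION,
//       QuantizationFactory::L2_MIN_QUANTIZATION);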
private:
int activation_precision_;
int weight_precision_;
int requantization_multiplier_precision_;
int eltwise_quantize_precision_;
bool preserve_activation_sparsity_;
bool preserve_weight_sparsity_;
bool force_scale_power_of_two_;
QuantizationKind activation_kind_, weight_kind_;
float weight_p99_threshold_;
float activation_p99_threshold_;
}; // class QuantizationFactory
/**
* Parse a string to QuantizationKind
*/
QuantizationFactory::QuantizationKind StringToKind(const std::string& s);
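// A hedged usage note: the accepted strings are expected to mirror the
// QuantizationKind enumerators, e.g. StringToKind("L2_MIN_QUANTIZATION")
// returning QuantizationFactory::L2_MIN_QUANTIZATION; see the corresponding
// .cc file for the exact set of accepted names.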
// Adjusts the histogram so that its range includes zero (zero must be
// exactly representable when choosing quantization parameters), updating
// *min and *max and returning the adjusted bin counts.
std::vector<float>
adjust_hist_to_include_zero(const Histogram& hist, float* min, float* max);
} // namespace dnnlowp