| #pragma once |
| |
#include <c10/core/MemoryFormat.h>
#include <c10/core/QScheme.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/intrusive_ptr.h>

#include <ATen/TensorUtils.h>

#include <cmath>
#include <memory>
#include <utility>
| |
| // TODO: move to c10 namespace after we |
| // unified caffe2::Tensor and at::Tensor |
| |
| namespace at { |
| |
| class Tensor; |
| struct QTensorImpl; |
| struct Quantizer; |
| using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&; |
| using QuantizerPtr = c10::intrusive_ptr<Quantizer>; |
| |
| /** |
| * Quantizer is the class for storing all the information |
| * that's necessary to perform quantize and dequantize |
| * operation. |
| * |
| * We might have different types of quantization schemes and this is |
| * the base class for all quantizers. |
| * |
| * QTensorImpl will hold a pointer to Quantizer so that we can support |
| * different quantization schemes on Tensor. |
| * |
| * For example, the most common quantization scheme, Affine Quantization, |
| * requires scale and zero_point as parameters, we'll store scale and zero_point |
| * inside the instance and we can use it to quantize a float Tensor or |
| * dequantize a quantized Tensor. |
| * |
| * When you add new types of leaf Quantizer class, please also |
| * make sure to add a corresponding QScheme enum since |
| * they should have one to one mapping. |
| * |
| * Note about intrusive_ptr: |
| * Quantized Tensor holds an intrusive_ptr to Quantizer, and multiple Tensor can |
| * share the same Quantizer. Quantizer should be immutable. |
| */ |
| struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target { |
| const ScalarType scalar_type_; |
| explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {} |
| virtual ~Quantizer(); |
| |
| // Copied from torch/csrc/jit/ir/scope.h |
| QuantizerPtr intrusive_from_this() { |
| c10::raw::intrusive_ptr::incref(this); // we are creating a new pointer |
| // from a raw `this` pointer |
| // so we need to bump the refcount |
| // to account for this ownership |
| return c10::intrusive_ptr<Quantizer>::reclaim(this); |
| } |
| |
| /** |
| * Each concrete Quantizer type should have a unique QScheme type. |
| */ |
| virtual QScheme qscheme() const = 0; |
| |
| ScalarType scalar_type() { |
| return scalar_type_; |
| } |
| |
| /** |
| * quantize a float Tensor into a quantized Tensor. |
| */ |
| virtual Tensor quantize(Tensor t) = 0; |
| |
| /** |
| * dequantize a quantized Tensor into a float Tensor. |
| */ |
| virtual Tensor dequantize(Tensor t) = 0; |
| |
| /** |
| * Compare against `other` for equality. |
| */ |
| virtual bool equalTo(QuantizerPtr other) = 0; |
| }; |
| |
| /** |
| * UniformQuantizer is the parent class for all uniform quantizers. |
| * These quantization scheme will map float value uniformly to |
| * the quantized value. For example, affine quantizer is |
| * the most commonly used scheme in this category. |
| */ |
| struct CAFFE2_API UniformQuantizer : public Quantizer { |
| explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {} |
| }; |
| |
| /** |
| * NonUniformQuantizer is the parent class for all non-uniform quantizers. |
| * These quantization scheme may map float value non-uniformly to the quantized |
| * value. K-means quantization is a representative example in this category. |
| */ |
| struct CAFFE2_API NonUniformQuantizer : public Quantizer { |
| explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {} |
| }; |
| |
| // There is also StochasticQuantizer which is uniform but not affine |
| |
| /** |
| * AffineQuantizer uses affine transformation to do quantization. |
| * |
| * For quantize: |
| * Y = clamp(round(X / scale + zero_point), min, max) |
| * For dequantize: |
| * X = (Y - zero_point) * scale |
| */ |
| struct CAFFE2_API AffineQuantizer : public UniformQuantizer { |
| explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {} |
| }; |
| |
| // Note that we will not have Symmetric Quantizer in backend to reduce |
| // complications in quantized kernel implementation. |
| |
| /** |
| * PerTensorAffineQuantizer stores a scale and a zero_point, which is used for |
| * all the values in the Tensor. |
| */ |
| struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer { |
| explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point) |
| : AffineQuantizer(scalar_type), |
| scale_(scale), |
| zero_point_(zero_point) {} |
| |
| Tensor quantize(Tensor tensor) override; |
| Tensor dequantize(Tensor tensor) override; |
| |
| QScheme qscheme() const override { |
| return kPerTensorAffine; |
| } |
| |
| double scale() const { |
| return scale_; |
| } |
| |
| int64_t zero_point() const { |
| return zero_point_; |
| } |
| |
| bool equalTo(QuantizerPtr other) override { |
| if (!other.get() || other->qscheme() != kPerTensorAffine) { |
| return false; |
| } |
| auto* other_per_tensor_affine = |
| static_cast<PerTensorAffineQuantizer*>(other.get()); |
| return scalar_type() == other_per_tensor_affine->scalar_type() && |
| scale() == other_per_tensor_affine->scale() && |
| zero_point() == other_per_tensor_affine->zero_point(); |
| } |
| |
| private: |
| const double scale_; |
| // We use int64_t for consistency with Python |
| const int64_t zero_point_; |
| }; |
| |
| /** |
| * PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer |
| * except that we have an independent scale and zero_point parameter |
| * for each channel. |
| * |
| * Also note that per channel quantization is mostly applied to output channels |
| * of weights since per-input channel of weight quantization or per-channel |
| * quantization for activations can't be efficiently supported in most of |
| * processors since it requires each multiplication result within a single |
| * dot-product to have a different scale. |
| */ |
| struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer { |
| explicit PerChannelAffineQuantizer( |
| ScalarType scalar_type, |
| Tensor scales, |
| Tensor zero_points, |
| int64_t axis) |
| : AffineQuantizer(scalar_type), |
| scales_(scales), |
| zero_points_(zero_points), |
| axis_(axis) {} |
| |
| QScheme qscheme() const override { |
| return kPerChannelAffine; |
| } |
| |
| Tensor scales() const { |
| return scales_; |
| } |
| |
| Tensor zero_points() const { |
| return zero_points_; |
| } |
| |
| int64_t axis() const { |
| return axis_; |
| } |
| |
| Tensor quantize(Tensor tensor) override; |
| Tensor dequantize(Tensor tensor) override; |
| |
| bool equalTo(QuantizerPtr other) override { |
| if (!other.get() || other->qscheme() != kPerChannelAffine) { |
| return false; |
| } |
| auto* other_per_channel_affine = |
| static_cast<PerChannelAffineQuantizer*>(other.get()); |
| return scalar_type() == other_per_channel_affine->scalar_type() && |
| scales().equal(other_per_channel_affine->scales()) && |
| zero_points().equal(other_per_channel_affine->zero_points()) && |
| axis() == other_per_channel_affine->axis(); |
| } |
| |
| private: |
| Tensor scales_; |
| Tensor zero_points_; |
| const int64_t axis_; |
| }; |
| |
| // This is an internal utility function for getting at the QTensorImpl, |
| // You should only use this for writing low level |
| // setters/getters for QTensorImpl fields; otherwise, you should use |
| // the low level setters/getters that were implemented using this. |
| // This may be called repeatedly, so make sure it's pretty cheap. |
| CAFFE2_API QTensorImpl* get_qtensorimpl(const Tensor& self); |
| |
| // Quantize a float value into a uint value given scale and zero_point |
| template <typename T> |
| CAFFE2_API T quantize_val(double scale, int64_t zero_point, float value); |
| template <typename T, int precision=8> |
| void quantize_vec(double scale, int64_t zero_point, const float *src, T *dst, size_t count=8); |
| template <typename T> |
| CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zero_point); |
| template <typename T> |
| CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value); |
| template <typename T> |
| CAFFE2_API float dequantize_vec(double scale, int64_t zero_point, const T* src, float* dst, size_t count=8); |
| template <typename T> |
| CAFFE2_API Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, double scale, int64_t zero_point); |
| template <typename SRC_T, typename DST_T> |
| CAFFE2_API DST_T requantize_val(double, int64_t, double, int64_t, SRC_T src); |
| |
| // Given a multiplier and a zero_point, requantize int32_t computed values back |
| // to quantized values. See comment above |
| // make_per_tensor_affine_quantizer function for the usage of int64_t |
| template <typename DST_T> |
| CAFFE2_API DST_T |
| requantize_from_int(double multiplier, int64_t zero_point, int64_t src); |
| |
| // double and int64_t are because of the native function API, we only have these |
| // argument types right now in native functions |
| CAFFE2_API QuantizerPtr |
| make_per_tensor_affine_quantizer( |
| double scale, int64_t zero_point, ScalarType scalar_type); |
| |
| CAFFE2_API QuantizerPtr make_per_channel_affine_quantizer( |
| const Tensor& scales, |
| const Tensor& zero_points, |
| int64_t axis, |
| ScalarType scalar_type); |
| |
| // Create a Quantized Tensor given arguments for normal Tensor and a quantizer |
| CAFFE2_API Tensor new_qtensor_cpu( |
| IntArrayRef sizes, |
| const TensorOptions& options, |
| QuantizerPtr quantizer); |
| |
| } // namespace at |