#pragma once
#include <c10/core/QScheme.h>
#include <c10/core/MemoryFormat.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/core/ScalarType.h>
#include <c10/core/TensorOptions.h>
#include <ATen/TensorUtils.h>
#include <cmath>
#include <memory>
// TODO: move to the c10 namespace once we
// have unified caffe2::Tensor and at::Tensor
namespace at {
class Tensor;
struct QTensorImpl;
struct Quantizer;
using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
using QuantizerPtr = c10::intrusive_ptr<Quantizer>;
/**
* Quantizer is the class for storing all the information
* that's necessary to perform quantize and dequantize
* operations.
*
* We might have different types of quantization schemes and this is
* the base class for all quantizers.
*
* QTensorImpl will hold a pointer to Quantizer so that we can support
* different quantization schemes on Tensor.
*
* For example, the most common quantization scheme, Affine Quantization,
* requires scale and zero_point as parameters; we store them inside the
* instance and use them to quantize a float Tensor or dequantize a
* quantized Tensor.
*
* When you add a new type of leaf Quantizer class, please also
* make sure to add a corresponding QScheme enum value, since
* they should have a one-to-one mapping.
*
* Note about intrusive_ptr:
* A quantized Tensor holds an intrusive_ptr to its Quantizer, and multiple
* Tensors can share the same Quantizer. Quantizer should be immutable.
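*
* For example (a sketch; the Tensors are hypothetical): if `qa` and `qb`
* were created with the same QuantizerPtr, they share one refcounted
* Quantizer object, so mutating it in place would silently affect both.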
*/
struct CAFFE2_API Quantizer : public c10::intrusive_ptr_target {
const ScalarType scalar_type_;
explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {}
virtual ~Quantizer();
// Copied from torch/csrc/jit/ir/scope.h
QuantizerPtr intrusive_from_this() {
// We are creating a new pointer from a raw `this` pointer,
// so we need to bump the refcount to account for this ownership.
c10::raw::intrusive_ptr::incref(this);
return c10::intrusive_ptr<Quantizer>::reclaim(this);
}
/**
* Each concrete Quantizer type should have a unique QScheme type.
*/
virtual QScheme qscheme() const = 0;
ScalarType scalar_type() const {
return scalar_type_;
}
/**
* Quantize a float Tensor into a quantized Tensor.
*/
virtual Tensor quantize(Tensor t) = 0;
/**
* Dequantize a quantized Tensor into a float Tensor.
*/
virtual Tensor dequantize(Tensor t) = 0;
/**
* Compare against `other` for equality.
*/
virtual bool equalTo(QuantizerPtr other) = 0;
};
/**
* UniformQuantizer is the parent class for all uniform quantizers.
* These quantization schemes map float values uniformly to
* quantized values. For example, the affine quantizer is
* the most commonly used scheme in this category.
*/
struct CAFFE2_API UniformQuantizer : public Quantizer {
explicit UniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
/**
* NonUniformQuantizer is the parent class for all non-uniform quantizers.
* These quantization schemes may map float values non-uniformly to quantized
* values. K-means quantization is a representative example in this category.
*/
struct CAFFE2_API NonUniformQuantizer : public Quantizer {
explicit NonUniformQuantizer(ScalarType scalar_type) : Quantizer(scalar_type) {}
};
// There is also StochasticQuantizer, which is uniform but not affine.
/**
* AffineQuantizer uses affine transformation to do quantization.
*
* For quantize:
* Y = clamp(round(X / scale + zero_point), min, max)
* For dequantize:
* X = (Y - zero_point) * scale
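*
* A worked example (the numbers are illustrative), with scale = 0.1,
* zero_point = 10, and an 8-bit unsigned range of [0, 255]:
*   quantize:   Y = clamp(round(2.5 / 0.1 + 10), 0, 255) = 35
*   dequantize: X = (35 - 10) * 0.1 = 2.5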
*/
struct CAFFE2_API AffineQuantizer : public UniformQuantizer {
explicit AffineQuantizer(ScalarType scalar_type) : UniformQuantizer(scalar_type) {}
};
// Note that we do not have a SymmetricQuantizer in the backend, to reduce
// complications in the quantized kernel implementation.
/**
* PerTensorAffineQuantizer stores a scale and a zero_point, which are used for
* all the values in the Tensor.
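*
* A usage sketch (using the make_per_tensor_affine_quantizer factory
* declared below; `rtensor` is a hypothetical float Tensor):
*   QuantizerPtr quantizer = make_per_tensor_affine_quantizer(0.1, 10, kQUInt8);
*   Tensor qtensor = quantizer->quantize(rtensor);    // float -> quantized
*   Tensor rtensor2 = quantizer->dequantize(qtensor); // quantized -> float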
*/
struct CAFFE2_API PerTensorAffineQuantizer : public AffineQuantizer {
explicit PerTensorAffineQuantizer(ScalarType scalar_type, double scale, int64_t zero_point)
: AffineQuantizer(scalar_type),
scale_(scale),
zero_point_(zero_point) {}
Tensor quantize(Tensor tensor) override;
Tensor dequantize(Tensor tensor) override;
QScheme qscheme() const override {
return kPerTensorAffine;
}
double scale() const {
return scale_;
}
int64_t zero_point() const {
return zero_point_;
}
bool equalTo(QuantizerPtr other) override {
if (!other.get() || other->qscheme() != kPerTensorAffine) {
return false;
}
auto* other_per_tensor_affine =
static_cast<PerTensorAffineQuantizer*>(other.get());
return scalar_type() == other_per_tensor_affine->scalar_type() &&
scale() == other_per_tensor_affine->scale() &&
zero_point() == other_per_tensor_affine->zero_point();
}
private:
const double scale_;
// We use int64_t for consistency with Python
const int64_t zero_point_;
};
/**
* PerChannelAffineQuantizer is the same as PerTensorAffineQuantizer
* except that we have an independent scale and zero_point parameter
* for each channel.
*
* Also note that per-channel quantization is mostly applied to the output
* channels of weights, since per-input-channel weight quantization or
* per-channel activation quantization can't be efficiently supported on most
* processors: it would require each multiplication result within a single
* dot product to have a different scale.
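*
* For example (a sketch; the shapes are illustrative): for a conv weight of
* shape [C_out, C_in, kH, kW] quantized along axis = 0, `scales` and
* `zero_points` are 1-D Tensors of length C_out, and every value in output
* channel c uses scales[c] and zero_points[c].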
*/
struct CAFFE2_API PerChannelAffineQuantizer : public AffineQuantizer {
explicit PerChannelAffineQuantizer(
ScalarType scalar_type,
Tensor scales,
Tensor zero_points,
int64_t axis)
: AffineQuantizer(scalar_type),
scales_(scales),
zero_points_(zero_points),
axis_(axis) {}
QScheme qscheme() const override {
return kPerChannelAffine;
}
Tensor scales() const {
return scales_;
}
Tensor zero_points() const {
return zero_points_;
}
int64_t axis() const {
return axis_;
}
Tensor quantize(Tensor tensor) override;
Tensor dequantize(Tensor tensor) override;
bool equalTo(QuantizerPtr other) override {
if (!other.get() || other->qscheme() != kPerChannelAffine) {
return false;
}
auto* other_per_channel_affine =
static_cast<PerChannelAffineQuantizer*>(other.get());
return scalar_type() == other_per_channel_affine->scalar_type() &&
scales().equal(other_per_channel_affine->scales()) &&
zero_points().equal(other_per_channel_affine->zero_points()) &&
axis() == other_per_channel_affine->axis();
}
private:
Tensor scales_;
Tensor zero_points_;
const int64_t axis_;
};
// This is an internal utility function for getting at the QTensorImpl.
// You should only use this for writing low-level
// setters/getters for QTensorImpl fields; all other code should go through
// the low-level setters/getters that were implemented using this.
// This may be called repeatedly, so make sure it's pretty cheap.
CAFFE2_API QTensorImpl* get_qtensorimpl(const Tensor& self);
// Quantize a float value into a quantized integer value, given scale and zero_point
template <typename T>
CAFFE2_API T quantize_val(double scale, int64_t zero_point, float value);
template <typename T, int precision=8>
void quantize_vec(double scale, int64_t zero_point, const float *src, T *dst, size_t count=8);
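// A hedged illustration of the two declarations above (values chosen for
// clarity; the quint8 range is [0, 255]):
//   // quantize_val<c10::quint8>(/*scale=*/0.1, /*zero_point=*/10, 2.5f)
//   //   -> clamp(round(2.5 / 0.1 + 10), 0, 255) = 35
//   float src[4] = {0.f, 0.5f, 1.f, 2.5f};
//   c10::quint8 dst[4];
//   quantize_vec<c10::quint8>(0.1, 10, src, dst, 4); // dst ~ {10, 15, 20, 35}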
template <typename T>
CAFFE2_API Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zero_point);
template <typename T>
CAFFE2_API float dequantize_val(double scale, int64_t zero_point, T value);
template <typename T>
CAFFE2_API void dequantize_vec(double scale, int64_t zero_point, const T* src, float* dst, size_t count=8);
template <typename T>
CAFFE2_API Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, double scale, int64_t zero_point);
// Requantize a value from one affine (scale, zero_point) pair to another.
template <typename SRC_T, typename DST_T>
CAFFE2_API DST_T requantize_val(double src_scale, int64_t src_zero_point, double dst_scale, int64_t dst_zero_point, SRC_T src);
// Given a multiplier and a zero_point, requantize int32_t computed values back
// to quantized values. See comment above
// make_per_tensor_affine_quantizer function for the usage of int64_t
template <typename DST_T>
CAFFE2_API DST_T
requantize_from_int(double multiplier, int64_t zero_point, int64_t src);
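// A sketch of the intended math (an assumption from conventional affine
// requantization, not a documented contract of this function):
//   dst ~= clamp(round(multiplier * src) + zero_point, DST_T's min, DST_T's max)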
// double and int64_t are used because of the native function API; these are
// the only argument types we currently have in native functions.
CAFFE2_API QuantizerPtr
make_per_tensor_affine_quantizer(
double scale, int64_t zero_point, ScalarType scalar_type);
CAFFE2_API QuantizerPtr make_per_channel_affine_quantizer(
const Tensor& scales,
const Tensor& zero_points,
int64_t axis,
ScalarType scalar_type);
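// A hedged usage sketch of this factory (tensor contents are illustrative):
//   Tensor scales = at::tensor({0.1, 0.2}, at::kDouble);
//   Tensor zero_points = at::tensor({0, 5}, at::kLong);
//   QuantizerPtr quantizer =
//       make_per_channel_affine_quantizer(scales, zero_points, /*axis=*/0, kQInt8);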
// Create a quantized Tensor given the arguments for a normal Tensor and a quantizer
CAFFE2_API Tensor new_qtensor_cpu(
IntArrayRef sizes,
const TensorOptions& options,
QuantizerPtr quantizer);
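// A hedged usage sketch (the sizes are illustrative; the returned tensor's
// values start uninitialized, as with at::empty):
//   QuantizerPtr quantizer = make_per_tensor_affine_quantizer(0.1, 10, kQUInt8);
//   Tensor qt = new_qtensor_cpu({2, 3}, device(kCPU).dtype(kQUInt8), quantizer);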
} // namespace at