Enable loading int8 prepacked models in PredictorContainer
Summary: To test the int8 ads models on CPU and accelerators with the ads replayer, we need to load the PREPACKING_INIT_NET_TYPE net from the int8 model so that the int8 w_packed blobs are initialized (a sketch of that loading step follows below).
Test Plan:
Ads replayer test.
P74811059
Reviewed By: zrphercule
Differential Revision: D16518888
fbshipit-source-id: cee212710ad37d9e491c970b25b2fe484373e5e4
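
For context, the replayer-side change this enables amounts to running the model's prepacking init net before the predict net, so the packed int8 weight blobs exist in the workspace. A minimal sketch, not part of this diff: the file path and blob names are made up; ReadProtoFromFile and Workspace::RunNetOnce are the Caffe2 APIs assumed here.

  #include "caffe2/core/workspace.h"
  #include "caffe2/utils/proto_utils.h"

  // Hypothetical sketch: execute the int8 model's prepacking init net so the
  // packed weight blobs exist before the predict net runs.
  void RunPrepackingInitNet(caffe2::Workspace* ws) {
    caffe2::NetDef prepacking_init_net;
    CAFFE_ENFORCE(caffe2::ReadProtoFromFile(
        "int8_model/prepacking_init_net.pb", &prepacking_init_net));
    // Each weight-packing op in this net quantizes and packs an fp32 weight
    // blob and writes the packed result (e.g. "fc_w_packed") to the workspace.
    CAFFE_ENFORCE(ws->RunNetOnce(prepacking_init_net));
  }
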
diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc
index 54a9bcf..d79c73e 100644
--- a/caffe2/opt/bound_shape_inferencer.cc
+++ b/caffe2/opt/bound_shape_inferencer.cc
@@ -53,7 +53,7 @@
InferSparseLengthsSum(op);
} else if (
op.type() == "FC" || op.type() == "FCTransposed" ||
- op.type() == "FbFCPacked") {
+ op.type() == "FbFCPacked" || op.type() == "Int8FC") {
InferFC(op);
} else if (op.type() == "Concat") {
InferConcat(op);
@@ -424,12 +424,13 @@
const ShapeInfo& w_shape_info = w_it->second;
const auto b_it = shape_info_.find(op.input(2));
CAFFE_ENFORCE(
- w_it != shape_info_.end(),
+ b_it != shape_info_.end(),
"Shape of BIAS input of FC ",
op.input(2),
" needs to be presented");
const ShapeInfo& b_shape_info = b_it->second;
bool fp16 = (op.type() == "FbFCPacked");
+ bool int8_fc = (op.type() == "Int8FC" || op.engine() == "DNNLOWP");
auto x_it = shape_info_.find(op.input(0));
if (x_it == shape_info_.end()) {
// We don't have a hint at the x input we try to deduce it from weight
@@ -451,13 +452,21 @@
dims.push_back(K);
current_dim_type_ = ShapeInfo::DimType::BATCH;
current_max_batch_size_ = spec_.max_batch_size;
+ TensorProto::DataType w_data_type;
+ if (fp16) {
+ w_data_type = TensorProto_DataType_FLOAT;
+ } else if (int8_fc) {
+ w_data_type = TensorProto_DataType_UINT8;
+ } else {
+ w_data_type = w_shape.data_type();
+ }
// Note: for FbFCPacked, weight is fp16 but activations are in fp32
CheckAndSetTensorShapeAndType(
op.input(0),
ShapeInfo::DimType::BATCH,
dims,
- fp16 ? TensorProto_DataType_FLOAT : w_shape.data_type(),
- false);
+ w_data_type,
+ int8_fc);
} else {
ShapeInfo& x_shape_info = x_it->second;
if (x_shape_info.dim_type != ShapeInfo::DimType::BATCH) {
@@ -472,12 +481,20 @@
shape_info_[op.input(0)].shape, w_shape_info.shape, b_shape_info.shape};
std::vector<TensorShape> output_shapes = InferOutput(op, input_shapes);
CAFFE_ENFORCE_EQ(output_shapes.size(), 1);
+ TensorProto::DataType output_data_type;
+ if (fp16) {
+ output_data_type = TensorProto_DataType_FLOAT;
+ } else if (int8_fc) {
+ output_data_type = TensorProto_DataType_UINT8;
+ } else {
+ output_data_type = output_shapes[0].data_type();
+ }
CheckAndSetTensorShapeAndType(
op.output(0),
ShapeInfo::DimType::BATCH,
ConvertToVec(output_shapes[0].dims()),
- fp16 ? TensorProto_DataType_FLOAT : output_shapes[0].data_type(),
- false);
+ output_data_type,
+ int8_fc);
}
void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) {
@@ -511,7 +528,8 @@
{"Int8AveragePool", 0},
{"Int8FC", 1},
{"Int8Conv", 1},
- {"Int8SumRelu", 0}};
+ {"Int8SumRelu", 0},
+ {"Int8Relu", 0}};
CAFFE_ENFORCE(
type_info_from_input.find(op.type()) != type_info_from_input.end(),
"Undefined quantized output data type, add it into type_info_from_input");
diff --git a/caffe2/opt/custom/glow_net_transform.cc b/caffe2/opt/custom/glow_net_transform.cc
index 679dd11..141522b 100644
--- a/caffe2/opt/custom/glow_net_transform.cc
+++ b/caffe2/opt/custom/glow_net_transform.cc
@@ -125,7 +125,11 @@
if (kv.size() == 2) {
auto dims = caffe2::split(',', kv.back());
TensorShape input;
- input.set_data_type(TensorProto_DataType_FLOAT);
+ if (kv.front().find("int8") != std::string::npos) {
+ input.set_data_type(TensorProto_DataType_UINT8);
+ } else {
+ input.set_data_type(TensorProto_DataType_FLOAT);
+ }
bool valid = true;
for (const auto& d : dims) {
try {
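
With the glow_net_transform change, a shape hint whose key contains "int8" is typed as UINT8 instead of FLOAT. Roughly what the parsed hint ends up as; the input name and dims below are illustrative only:

  #include "caffe2/proto/caffe2_pb.h"

  // Sketch of the TensorShape produced for a hint keyed like "embedding_int8"
  // with dims "32,128" (names and sizes are made up).
  caffe2::TensorShape MakeInt8ShapeHint() {
    caffe2::TensorShape hint;
    hint.set_data_type(caffe2::TensorProto_DataType_UINT8);
    hint.add_dims(32);   // max batch size
    hint.add_dims(128);  // feature dimension
    return hint;
  }
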
diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc
index 0938fdf..04ea044 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.cc
+++ b/caffe2/quantization/server/fbgemm_pack_op.cc
@@ -229,7 +229,9 @@
: DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
quantize_channelwise_(
- this->GetSingleArgument<bool>("quantize_channelwise", false)) {
+ this->GetSingleArgument<bool>("quantize_channelwise", false)),
+ save_unpacked_weights_(
+ this->GetSingleArgument<bool>("save_unpacked_weights", false)) {
if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
quantize_channelwise_ = true;
}
@@ -258,6 +260,13 @@
QuantizeWeight<uint8_t>(
InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());
+ if (save_unpacked_weights_) {
+ ReinitializeTensor(
+ &Y->original_tensor, filter.sizes(), at::dtype<int8_t>().device(CPU));
+ auto* buffer = Y->original_tensor.template mutable_data<int8_t>();
+ CAFFE_ENFORCE_EQ(Y->original_tensor.numel(), W_quantized.size());
+ memcpy(buffer, W_quantized.data(), W_quantized.size() * sizeof(int8_t));
+ }
if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
static int log_occurences = 0;
if (log_occurences < 32) {
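
The new save_unpacked_weights argument is read off the pack op's definition, so opting in is a matter of adding the arg when building the prepacking net. A hedged example, assuming the op is registered as Int8FCPackWeight under the DNNLOWP engine; blob names are made up:

  #include "caffe2/proto/caffe2_pb.h"
  #include "caffe2/utils/proto_utils.h"

  // Sketch: a weight-packing op def with the new flag set. The arg name matches
  // GetSingleArgument("save_unpacked_weights", false) in the operator above.
  caffe2::OperatorDef MakePackWeightOpDef() {
    caffe2::OperatorDef op;
    op.set_type("Int8FCPackWeight");
    op.set_engine("DNNLOWP");
    op.add_input("fc_w");
    op.add_input("fc_b");
    op.add_output("fc_w_packed");
    // Keep the quantized-but-unpacked weights in Y->original_tensor so they
    // can still be read back after packing.
    op.add_arg()->CopyFrom(
        caffe2::MakeArgument<bool>("save_unpacked_weights", true));
    return op;
  }
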
diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h
index 54d3bda..b615b47 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.h
+++ b/caffe2/quantization/server/fbgemm_pack_op.h
@@ -24,6 +24,7 @@
int axis_w_;
bool quantize_channelwise_;
int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
+ bool save_unpacked_weights_;
INPUT_TAGS(FILTER, BIAS);
};