Enable loading int8 prepacked models in PredictorContainer

Summary: To test the int8 ads models on CPU and on accelerators with the ads replayer, we need to load the PREPACKING_INIT_NET_TYPE net stored in the int8 model so that the packed int8 weight blobs (w_packed) are initialized before the predict net runs.
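
For context, a minimal sketch (not part of this diff; the helper name, file path, and includes below are illustrative assumptions) of what running the prepacking init net against the predictor workspace amounts to: the net's weight-packing ops (e.g. Int8FCPackWeight under the DNNLOWP engine) write the w_packed blobs into the workspace so the int8 predict net can consume them.

  #include <string>

  #include "caffe2/core/workspace.h"
  #include "caffe2/proto/caffe2_pb.h"
  #include "caffe2/utils/proto_utils.h"

  // Hypothetical helper: PredictorContainer would pull the NetDef from the
  // model's PREPACKING_INIT_NET_TYPE entry rather than from a file on disk.
  void RunPrepackingInitNet(caffe2::Workspace* ws, const std::string& path) {
    caffe2::NetDef prepacking_init_net;
    CAFFE_ENFORCE(
        caffe2::ReadProtoFromFile(path, &prepacking_init_net),
        "Failed to read prepacking init net from ", path);
    // Running the net once executes the weight-packing ops, which create the
    // packed int8 weight (w_packed) blobs in the workspace.
    CAFFE_ENFORCE(ws->RunNetOnce(prepacking_init_net));
  }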

Test Plan:
Ads replayer test.

P74811059

Reviewed By: zrphercule

Differential Revision: D16518888

fbshipit-source-id: cee212710ad37d9e491c970b25b2fe484373e5e4
diff --git a/caffe2/opt/bound_shape_inferencer.cc b/caffe2/opt/bound_shape_inferencer.cc
index 54a9bcf..d79c73e 100644
--- a/caffe2/opt/bound_shape_inferencer.cc
+++ b/caffe2/opt/bound_shape_inferencer.cc
@@ -53,7 +53,7 @@
     InferSparseLengthsSum(op);
   } else if (
       op.type() == "FC" || op.type() == "FCTransposed" ||
-      op.type() == "FbFCPacked") {
+      op.type() == "FbFCPacked" || op.type() == "Int8FC") {
     InferFC(op);
   } else if (op.type() == "Concat") {
     InferConcat(op);
@@ -424,12 +424,13 @@
   const ShapeInfo& w_shape_info = w_it->second;
   const auto b_it = shape_info_.find(op.input(2));
   CAFFE_ENFORCE(
-      w_it != shape_info_.end(),
+      b_it != shape_info_.end(),
       "Shape of BIAS input of FC ",
       op.input(2),
       " needs to be presented");
   const ShapeInfo& b_shape_info = b_it->second;
   bool fp16 = (op.type() == "FbFCPacked");
+  bool int8_fc = (op.type() == "Int8FC" || op.engine() == "DNNLOWP");
   auto x_it = shape_info_.find(op.input(0));
   if (x_it == shape_info_.end()) {
     // We don't have a hint at the x input we try to deduce it from weight
@@ -451,13 +452,21 @@
     dims.push_back(K);
     current_dim_type_ = ShapeInfo::DimType::BATCH;
     current_max_batch_size_ = spec_.max_batch_size;
+    TensorProto::DataType w_data_type;
+    if (fp16) {
+      w_data_type = TensorProto_DataType_FLOAT;
+    } else if (int8_fc) {
+      w_data_type = TensorProto_DataType_UINT8;
+    } else {
+      w_data_type = w_shape.data_type();
+    }
     // Note: for FbFCPacked, weight is fp16 but activations are in fp32
     CheckAndSetTensorShapeAndType(
         op.input(0),
         ShapeInfo::DimType::BATCH,
         dims,
-        fp16 ? TensorProto_DataType_FLOAT : w_shape.data_type(),
-        false);
+        w_data_type,
+        int8_fc);
   } else {
     ShapeInfo& x_shape_info = x_it->second;
     if (x_shape_info.dim_type != ShapeInfo::DimType::BATCH) {
@@ -472,12 +481,20 @@
       shape_info_[op.input(0)].shape, w_shape_info.shape, b_shape_info.shape};
   std::vector<TensorShape> output_shapes = InferOutput(op, input_shapes);
   CAFFE_ENFORCE_EQ(output_shapes.size(), 1);
+  TensorProto::DataType output_data_type;
+  if (fp16) {
+    output_data_type = TensorProto_DataType_FLOAT;
+  } else if (int8_fc) {
+    output_data_type = TensorProto_DataType_UINT8;
+  } else {
+    output_data_type = output_shapes[0].data_type();
+  }
   CheckAndSetTensorShapeAndType(
       op.output(0),
       ShapeInfo::DimType::BATCH,
       ConvertToVec(output_shapes[0].dims()),
-      fp16 ? TensorProto_DataType_FLOAT : output_shapes[0].data_type(),
-      false);
+      output_data_type,
+      int8_fc);
 }
 
 void BoundShapeInferencer::InferCommonOp(const OperatorDef& op) {
@@ -511,7 +528,8 @@
           {"Int8AveragePool", 0},
           {"Int8FC", 1},
           {"Int8Conv", 1},
-          {"Int8SumRelu", 0}};
+          {"Int8SumRelu", 0},
+          {"Int8Relu", 0}};
       CAFFE_ENFORCE(
           type_info_from_input.find(op.type()) != type_info_from_input.end(),
           "Undefined quantized output data type, add it into type_info_from_input");
diff --git a/caffe2/opt/custom/glow_net_transform.cc b/caffe2/opt/custom/glow_net_transform.cc
index 679dd11..141522b 100644
--- a/caffe2/opt/custom/glow_net_transform.cc
+++ b/caffe2/opt/custom/glow_net_transform.cc
@@ -125,7 +125,11 @@
       if (kv.size() == 2) {
         auto dims = caffe2::split(',', kv.back());
         TensorShape input;
-        input.set_data_type(TensorProto_DataType_FLOAT);
+        if (kv.front().find("int8") != std::string::npos) {
+          input.set_data_type(TensorProto_DataType_UINT8);
+        } else {
+          input.set_data_type(TensorProto_DataType_FLOAT);
+        }
         bool valid = true;
         for (const auto& d : dims) {
           try {
diff --git a/caffe2/quantization/server/fbgemm_pack_op.cc b/caffe2/quantization/server/fbgemm_pack_op.cc
index 0938fdf..04ea044 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.cc
+++ b/caffe2/quantization/server/fbgemm_pack_op.cc
@@ -229,7 +229,9 @@
     : DNNLowPOp<uint8_t, FCFp32Op>(operator_def, ws),
       axis_w_(this->GetSingleArgument<int32_t>("axis_w", 1)),
       quantize_channelwise_(
-          this->GetSingleArgument<bool>("quantize_channelwise", false)) {
+          this->GetSingleArgument<bool>("quantize_channelwise", false)),
+      save_unpacked_weights_(
+          this->GetSingleArgument<bool>("save_unpacked_weights", false)) {
   if (this->debug_def().engine() == "DNNLOWP_ROWWISE") {
     quantize_channelwise_ = true;
   }
@@ -258,6 +260,13 @@
   QuantizeWeight<uint8_t>(
       InputBlob(0), K, N, Y->qparams, W_quantized, qfactory_.get());
 
+  if (save_unpacked_weights_) {
+    ReinitializeTensor(
+        &Y->original_tensor, filter.sizes(), at::dtype<int8_t>().device(CPU));
+    auto* buffer = Y->original_tensor.template mutable_data<int8_t>();
+    CAFFE_ENFORCE_EQ(Y->original_tensor.numel(), W_quantized.size());
+    memcpy(buffer, W_quantized.data(), W_quantized.size() * sizeof(int8_t));
+  }
   if (this->InputIsType<int8::Int8TensorCPU>(0) && quantize_channelwise_) {
     static int log_occurences = 0;
     if (log_occurences < 32) {
diff --git a/caffe2/quantization/server/fbgemm_pack_op.h b/caffe2/quantization/server/fbgemm_pack_op.h
index 54d3bda..b615b47 100644
--- a/caffe2/quantization/server/fbgemm_pack_op.h
+++ b/caffe2/quantization/server/fbgemm_pack_op.h
@@ -24,6 +24,7 @@
   int axis_w_;
   bool quantize_channelwise_;
   int nbits_in_non_outlier_; // only for DNNLOWP_ACC16
+  bool save_unpacked_weights_;
 
   INPUT_TAGS(FILTER, BIAS);
 };