[Quant][PT2E] Enable weight scale optimization in QConv PT2E (#105996)

**Summary**
After the oneDNN 3.1 upgrade, we no longer need to take the reciprocal of the weight scale. This PR removes the redundant reciprocal calculation to optimize QConv performance, using the IDeep version macros to dispatch between the two code paths:

- This QConv implementation is expected to work functionally with both the current IDeep version and the upcoming IDeep upgrade in PR https://github.com/pytorch/pytorch/pull/107565.
- With the IDeep upgrade from https://github.com/pytorch/pytorch/pull/107565, QConv performs better since the redundant reciprocal calculation is removed (see the sketch below).
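For reference, a minimal sketch of the convention change for a per-tensor weight scale, condensed from the guarded code paths in the diff below (the `_old`/`_new` names are illustrative only; `weight.q_scale()` is the PyTorch per-tensor weight scale):

```cpp
// Current IDeep (IDEEP_VERSION_REVISION == 0): oneDNN and PyTorch scales
// are reciprocal, so the reciprocal is computed at prepack/run time.
ideep::scale_t wgt_scales_old(1, 1.0 / weight.q_scale());

// Upgraded IDeep (IDEEP_VERSION_REVISION > 0): IDeep takes the reciprocal
// internally, so the PyTorch scale is passed through unchanged.
ideep::scale_t wgt_scales_new(1, weight.q_scale());
```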

Pull Request resolved: https://github.com/pytorch/pytorch/pull/105996
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
ghstack dependencies: #104580, #104581, #104588, #104590, #105455, #105456, #105639, #105906
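Both files dispatch on the IDeep version macros with the same preprocessor guard; condensed from the hunks below, assuming `IDEEP_VERSION_REVISION` is bumped by the IDeep upgrade:

```cpp
#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR >= 3 && \
    defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
  // current IDeep: keep the reciprocal of the weight scale
#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR >= 3 && \
    defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
  // IDeep after the upgrade: pass PyTorch scales through directly
#else
  TORCH_CHECK(false, "Unexpected IDeep version to do qconv calculation.");
#endif
```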
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 5012097..05c71eb 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -1473,6 +1473,7 @@
   // Scales of ONEDNN and PyTorch are reciprocal
   const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale);
 
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
   // TODO (leslie): optimize the performance here:
   // 1. Remove the reciprocal of weight scale, we have done the reciprocal of weight scale back in Ideep:
   // https://github.com/intel/ideep/blob/3c90e365526e19c110371d23831678a7e9d4353d/include/ideep/operators/conv.hpp#L163-L168
@@ -1491,6 +1492,26 @@
       weights_scales[i] = 1.0 / weight_scales[i].item().toDouble();
     }
   }
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+  // 1. If the weight scale is generated by an observer, it has dtype float32:
+  // https://github.com/pytorch/pytorch/blob/d2c24eca8a60c56b31ca967a44d5cc4522802aa6/torch/ao/quantization/observer.py#L323
+  // 2. If the weight scale is read from a quantized tensor, as done in the UT, it has dtype double:
+  // https://github.com/pytorch/pytorch/blob/d2fa3f608b5e4f582a8aaf752f10efe4ca72a7d0/aten/src/ATen/quantized/Quantizer.cpp#L69
+  TORCH_CHECK(
+    weight_scales.scalar_type() == c10::ScalarType::Double || weight_scales.scalar_type() == c10::ScalarType::Float,
+    "weight_scales should be with data type Double or float");
+  if (weight_scales.scalar_type() == c10::ScalarType::Double) {
+    // For case 2, convert from double to float, since ideep::scale_t is an alias of std::vector<float>.
+    weight_scales = weight_scales.to(c10::ScalarType::Float);
+  }
+  TORCH_CHECK(
+    weight_scales.ndimension() == 0 ||
+    (weight_scales.ndimension() == 1 && weight_scales.stride(0) == 1),
+    "weight_scales should be scalar tensor or contiguous 1D tensor.");
+  ideep::scale_t weights_scales(weight_scales.data_ptr<float>(), weight_scales.data_ptr<float>()+weight_scales.numel());
+#else
+  TORCH_CHECK(false, "Unexpected IDeep version to do qconv calculation.");
+#endif
 
   const ideep::zero_point_t src_zero_points = ideep::zero_point_t(1, act_zero_point);
   const ideep::zero_point_t dst_zero_points = ideep::zero_point_t(1, output_zero_point);
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index 9a42e90..25ab60a 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -379,7 +379,13 @@
         "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight,"
         " whose zero point must be 0.");
     wgt_zero_points = std::vector<int32_t>(1, weight.q_zero_point());
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
     wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+    wgt_scales = ideep::scale_t(1, weight.q_scale());
+#else
+    TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
   } else if (qtype == c10::kPerChannelAffine) {
     TORCH_CHECK(
         !transpose,
@@ -392,7 +398,13 @@
           wgt_zero_points[i]==0,
           "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight,"
           " whose zero point must be 0.");
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
       wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item<float>(); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+      wgt_scales[i] = weight.q_per_channel_scales()[i].item<float>();
+#else
+      TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
     }
   } else {
     TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype));
@@ -536,11 +548,23 @@
     TORCH_CHECK(
         weight_scales.numel() == 1,
         "Weight is quant per tensor, weight scale expects 1 element but got ", weight_scales.numel(), " elements.");
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
     weights_scales[0] = 1.0 / weight_scales.item().toDouble(); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+    weights_scales[0] = weight_scales.item().toDouble();
+#else
+    TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
   } else {
     // Weight is quant per channel
     for (int i = 0; i < weight_scales.numel(); ++i) {
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
       weights_scales[i] = 1.0 / weight_scales[i].item().toDouble();
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+      weights_scales[i] = weight_scales[i].item().toDouble();
+#else
+      TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
     }
   }