[Quant][PT2E] Enable weight scale optimization in QConv PT2E (#105996)

**Summary**
After the oneDNN 3.1 upgrade, we no longer need to take the reciprocal of the weight scale. This PR removes the redundant reciprocal calculation to optimize QConv performance, using the IDeep version macros to dispatch between the two code paths:

- This QConv implementation is expected to work functionally with both the current IDeep version and the upcoming IDeep upgrade in PR https://github.com/pytorch/pytorch/pull/107565.
- With the IDeep upgrade from https://github.com/pytorch/pytorch/pull/107565, QConv performs better since the redundant reciprocal calculation is removed (see the sketch below).
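For reference, a minimal sketch of the convention change for a per-tensor weight scale, condensed from the guarded code paths in the diff below (the `_old`/`_new` names are illustrative only; `weight.q_scale()` is the PyTorch per-tensor weight scale):

```cpp
// Current IDeep (IDEEP_VERSION_REVISION == 0): oneDNN and PyTorch scales
// are reciprocal, so the reciprocal is computed at prepack/run time.
ideep::scale_t wgt_scales_old(1, 1.0 / weight.q_scale());

// Upgraded IDeep (IDEEP_VERSION_REVISION > 0): IDeep takes the reciprocal
// internally, so the PyTorch scale is passed through unchanged.
ideep::scale_t wgt_scales_new(1, weight.q_scale());
```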

Pull Request resolved: https://github.com/pytorch/pytorch/pull/105996
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
ghstack dependencies: #104580, #104581, #104588, #104590, #105455, #105456, #105639, #105906
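Both files dispatch on the IDeep version macros with the same preprocessor guard; condensed from the hunks below, assuming `IDEEP_VERSION_REVISION` is bumped by the IDeep upgrade:

```cpp
#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR >= 3 && \
    defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
  // current IDeep: keep the reciprocal of the weight scale
#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR >= 3 && \
    defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
  // IDeep after the upgrade: pass PyTorch scales through directly
#else
  TORCH_CHECK(false, "Unexpected IDeep version to do qconv calculation.");
#endif
```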
diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp
index 5012097..05c71eb 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp
@@ -1473,6 +1473,7 @@
   // Scales of ONEDNN and PyTorch are reciprocal
   const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale);
 
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
   // TODO (leslie): optimize the performance here:
   // 1. Remove the reciprocal of weight scale, we have done the reciprocal of weight scale back in Ideep:
   // https://github.com/intel/ideep/blob/3c90e365526e19c110371d23831678a7e9d4353d/include/ideep/operators/conv.hpp#L163-L168
@@ -1491,6 +1492,26 @@
       weights_scales[i] = 1.0 / weight_scales[i].item().toDouble();
     }
   }
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+  // 1. If the weight scale is generated by an observer, it has dtype float32:
+  // https://github.com/pytorch/pytorch/blob/d2c24eca8a60c56b31ca967a44d5cc4522802aa6/torch/ao/quantization/observer.py#L323
+  // 2. If the weight scale is read from a quantized tensor, as done in the UT, it has dtype double:
+  // https://github.com/pytorch/pytorch/blob/d2fa3f608b5e4f582a8aaf752f10efe4ca72a7d0/aten/src/ATen/quantized/Quantizer.cpp#L69
+  TORCH_CHECK(
+    weight_scales.scalar_type() == c10::ScalarType::Double || weight_scales.scalar_type() == c10::ScalarType::Float,
+    "weight_scales should be with data type Double or float");
+  if (weight_scales.scalar_type() == c10::ScalarType::Double) {
+    // For case 2, convert from double to float, since ideep::scale_t is an alias of std::vector<float>.
+    weight_scales = weight_scales.to(c10::ScalarType::Float);
+  }
+  TORCH_CHECK(
+    weight_scales.ndimension() == 0 ||
+    (weight_scales.ndimension() == 1 && weight_scales.stride(0) == 1),
+    "weight_scales should be scalar tensor or contiguous 1D tensor.");
+  ideep::scale_t weights_scales(weight_scales.data_ptr<float>(), weight_scales.data_ptr<float>()+weight_scales.numel());
+#else
+  TORCH_CHECK(false, "Unexpected IDeep version to do qconv calculation.");
+#endif
 
   const ideep::zero_point_t src_zero_points = ideep::zero_point_t(1, act_zero_point);
   const ideep::zero_point_t dst_zero_points = ideep::zero_point_t(1, output_zero_point);
diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
index 9a42e90..25ab60a 100644
--- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp
@@ -379,7 +379,13 @@
         "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight,"
         " whose zero point must be 0.");
     wgt_zero_points = std::vector<int32_t>(1, weight.q_zero_point());
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
     wgt_scales = ideep::scale_t(1, 1.0/weight.q_scale()); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+    wgt_scales = ideep::scale_t(1, weight.q_scale());
+#else
+    TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
   } else if (qtype == c10::kPerChannelAffine) {
     TORCH_CHECK(
         !transpose,
@@ -392,7 +398,13 @@
           wgt_zero_points[i]==0,
           "quantized::qconv_prepack: ONEDNN only supports symmetric quantization of weight,"
           " whose zero point must be 0.");
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
       wgt_scales[i] = 1.0f / weight.q_per_channel_scales()[i].item<float>(); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+      wgt_scales[i] = weight.q_per_channel_scales()[i].item<float>();
+#else
+      TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
     }
   } else {
     TORCH_CHECK(false, "Unsupported qscheme: ", toString(qtype));
@@ -536,11 +548,23 @@
     TORCH_CHECK(
         weight_scales.numel() == 1,
         "Weight is quant per tensor, weight scale expects 1 element but got ", weight_scales.numel(), " elements.");
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
     weights_scales[0] = 1.0 / weight_scales.item().toDouble(); // Scales of ONEDNN and PyTorch are reciprocal
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+    weights_scales[0] = weight_scales.item().toDouble();
+#else
+    TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
   } else {
     // Weight is quant per channel
     for (int i = 0; i < weight_scales.numel(); ++i) {
+#if defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION == 0
       weights_scales[i] = 1.0 / weight_scales[i].item().toDouble();
+#elif defined(IDEEP_VERSION_MAJOR) && IDEEP_VERSION_MAJOR>=3 && defined(IDEEP_VERSION_REVISION) && IDEEP_VERSION_REVISION > 0
+      weights_scales[i] = weight_scales[i].item().toDouble();
+#else
+      TORCH_CHECK(false, "Unexpected IDeep version to do qconv weight prepack.");
+#endif
     }
   }