[cpu][vec512] improve double load/store with mask (#116963)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116963
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #116961, #116962

commit: 4e54a704516667ba5d5534d4cd7c91d15caa35dc [log] [tgz]
author: Jiong Gong <jiong.gong@intel.com> Mon Jan 08 19:34:23 2024 +0800
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Tue Jan 09 04:37:44 2024 +0000
tree: d8b68a63aa1ec1a43a217165c46ff344f2832f64
parent: 428807f9bcd9bb32e0369c6b9c6f12fa4a4080e9 [diff]
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
index 8ed32e4..b3ed758 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h

@@ -83,27 +83,15 @@
     if (count == size())
       return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
 
-
-    __at_align__ double tmp_values[size()];
-    // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
-    // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
-    // instructions while a loop would be compiled to one instruction.
-    for (const auto i : c10::irange(size())) {
-      tmp_values[i] = 0.0;
-    }
-    std::memcpy(
-        tmp_values,
-        reinterpret_cast<const double*>(ptr),
-        count * sizeof(double));
-    return _mm512_load_pd(tmp_values);
+    __mmask8 mask = (1 << count) - 1;
+    return _mm512_maskz_loadu_pd(mask, ptr);
   }
   void store(void* ptr, int count = size()) const {
     if (count == size()) {
       _mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
     } else if (count > 0) {
-      double tmp_values[size()];
-      _mm512_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
-      std::memcpy(ptr, tmp_values, count * sizeof(double));
+      __mmask8 mask = (1ULL << count) - 1;
+      _mm512_mask_storeu_pd(reinterpret_cast<double*>(ptr), mask, values);
     }
   }
   const double& operator[](int idx) const  = delete;
commit	4e54a704516667ba5d5534d4cd7c91d15caa35dc	[log] [tgz]
author	Jiong Gong <jiong.gong@intel.com>	Mon Jan 08 19:34:23 2024 +0800
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Tue Jan 09 04:37:44 2024 +0000
tree	d8b68a63aa1ec1a43a217165c46ff344f2832f64
parent	428807f9bcd9bb32e0369c6b9c6f12fa4a4080e9 [diff]