[cpu][vec512] Improve partial load/store of double vectors by using AVX-512 masked intrinsics (#116963)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/116963
Approved by: https://github.com/leslie-fang-intel
ghstack dependencies: #116961, #116962
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
index 8ed32e4..b3ed758 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h
@@ -83,27 +83,15 @@
if (count == size())
return _mm512_loadu_pd(reinterpret_cast<const double*>(ptr));
-
- __at_align__ double tmp_values[size()];
- // Ensure uninitialized memory does not change the output value See https://github.com/pytorch/pytorch/issues/32502
- // for more details. We do not initialize arrays to zero using "={0}" because gcc would compile it to two
- // instructions while a loop would be compiled to one instruction.
- for (const auto i : c10::irange(size())) {
- tmp_values[i] = 0.0;
- }
- std::memcpy(
- tmp_values,
- reinterpret_cast<const double*>(ptr),
- count * sizeof(double));
- return _mm512_load_pd(tmp_values);
+ __mmask8 mask = (1 << count) - 1;
+ return _mm512_maskz_loadu_pd(mask, ptr);
}
void store(void* ptr, int count = size()) const {
if (count == size()) {
_mm512_storeu_pd(reinterpret_cast<double*>(ptr), values);
} else if (count > 0) {
- double tmp_values[size()];
- _mm512_storeu_pd(reinterpret_cast<double*>(tmp_values), values);
- std::memcpy(ptr, tmp_values, count * sizeof(double));
+ __mmask8 mask = (1ULL << count) - 1;
+ _mm512_mask_storeu_pd(reinterpret_cast<double*>(ptr), mask, values);
}
}
const double& operator[](int idx) const = delete;