Added vectorized flip for uint8 (#90013). Following https://github.com/pytorch/pytorch/pull/89414#discussion_r1036224613, this change refactors the existing `Vectorized<int8_t>` flip into a shared `flip8` helper and adds a `flip` specialization for `Vectorized<uint8_t>`. This should speed up the horizontal torch.flip implementation, similar to what is reported in https://github.com/pytorch/pytorch/pull/89414. Pull Request resolved: https://github.com/pytorch/pytorch/pull/90013. Approved by: https://github.com/peterbell10, https://github.com/lezcano

commit: 777ac632fb989119a95a95737d3d31c2ad1e52fe [log] [tgz]
author: vfdev-5 <vfdev.5@gmail.com> Mon Dec 05 12:23:25 2022 +0000
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Mon Dec 05 12:23:28 2022 +0000
tree: c744533cd20de1489cf020b8bbadfe289708b675
parent: 226e803ecb483d2488f2381afb86c1e849814714 [diff]
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
index d0a8cb0..f9c8794 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h

@@ -256,8 +256,7 @@
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m256i flip8(const __m256i & v) {
   const __m256i mask_int8 = _mm256_set_epi8(
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -266,6 +265,15 @@
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
 
 #endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 

diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
index dd1235e..8656756 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h

@@ -227,8 +227,7 @@
   return _mm512_permutexvar_epi16(mask, v);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m512i flip8(const __m512i & v) {
   const __m512i mask1 = _mm512_set_epi8(
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -240,6 +239,16 @@
   return _mm512_permutexvar_epi64(mask2, reversed_vec);
 }
 
+template<>
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);
+}
+
+template<>
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);
+}
+
 #endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
 
 }}}
commit	777ac632fb989119a95a95737d3d31c2ad1e52fe	[log] [tgz]
author	vfdev-5 <vfdev.5@gmail.com>	Mon Dec 05 12:23:25 2022 +0000
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Mon Dec 05 12:23:28 2022 +0000
tree	c744533cd20de1489cf020b8bbadfe289708b675
parent	226e803ecb483d2488f2381afb86c1e849814714 [diff]