Added vectorized flip for uint8 (#90013)

Following https://github.com/pytorch/pytorch/pull/89414#discussion_r1036224613, this refactors the existing `Vectorized<int8_t>` `flip` into a shared `flip8` helper and adds a `flip` specialization for `Vectorized<uint8_t>`. This should speed up the horizontal `torch.flip` implementation, similar to the speed-up reported in https://github.com/pytorch/pytorch/pull/89414

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90013
Approved by: https://github.com/peterbell10, https://github.com/lezcano
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h
index d0a8cb0..f9c8794 100644
--- a/aten/src/ATen/cpu/vec/vec256/vec256.h
+++ b/aten/src/ATen/cpu/vec/vec256/vec256.h
@@ -256,8 +256,7 @@
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m256i flip8(const __m256i & v) {
   const __m256i mask_int8 = _mm256_set_epi8(
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
@@ -266,6 +265,15 @@
   return _mm256_permute2x128_si256(reversed, reversed, 1);
 }
 
+template<>  // flip() specialization for signed 8-bit vectors
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);  // flip8 takes/returns a raw __m256i; v and the result convert implicitly
+}
+
+template<>  // flip() specialization for unsigned 8-bit vectors
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);  // same __m256i helper the int8_t specialization calls
+}
 
 #endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER)
 
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h
index dd1235e..8656756 100644
--- a/aten/src/ATen/cpu/vec/vec512/vec512.h
+++ b/aten/src/ATen/cpu/vec/vec512/vec512.h
@@ -227,8 +227,7 @@
   return _mm512_permutexvar_epi16(mask, v);
 }
 
-template<>
-inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+inline __m512i flip8(const __m512i & v) {
   const __m512i mask1 = _mm512_set_epi8(
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
@@ -240,6 +239,16 @@
   return _mm512_permutexvar_epi64(mask2, reversed_vec);
 }
 
+template<>  // flip() specialization for signed 8-bit vectors
+inline Vectorized<int8_t> flip(const Vectorized<int8_t> & v) {
+  return flip8(v);  // flip8 takes/returns a raw __m512i; v and the result convert implicitly
+}
+
+template<>  // flip() specialization for unsigned 8-bit vectors
+inline Vectorized<uint8_t> flip(const Vectorized<uint8_t> & v) {
+  return flip8(v);  // same __m512i helper the int8_t specialization calls
+}
+
 #endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER)
 
 }}}