Factor out x86 SIMD intrinsic synonyms
Change-Id: Idc4ac3ccd2ba19087cdb74a3e4a6774ac50386aa
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c
index 5de3e23..28693a4 100644
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_mask6_sse4.c
@@ -16,51 +16,20 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/synonyms.h"
+
#include "./vpx_dsp_rtcd.h"
#define MASK_BITS 6
-static INLINE __m128i mm_loadl_32(const void *a) {
- return _mm_cvtsi32_si128(*(const uint32_t*)a);
-}
-
-static INLINE __m128i mm_loadl_64(const void *a) {
- return _mm_loadl_epi64((const __m128i*)a);
-}
-
-static INLINE __m128i mm_loadu_128(const void *a) {
- return _mm_loadu_si128((const __m128i*)a);
-}
-
-static INLINE void mm_storel_32(void *const a, const __m128i v) {
- *(uint32_t*)a = _mm_cvtsi128_si32(v);
-}
-
-static INLINE void mm_storel_64(void *const a, const __m128i v) {
- _mm_storel_epi64((__m128i*)a, v);
-}
-
-static INLINE void mm_storeu_128(void *const a, const __m128i v) {
- _mm_storeu_si128((__m128i*)a, v);
-}
-
-static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
- return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
- const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1);
- return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
-}
-
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = mm_loadl_32(src0);
- const __m128i v_s1_b = mm_loadl_32(src1);
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@@ -69,15 +38,15 @@
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = mm_loadl_64(src0);
- const __m128i v_s1_b = mm_loadl_64(src1);
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@@ -86,7 +55,7 @@
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
@@ -106,7 +75,7 @@
(void)w;
do {
- const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@@ -114,7 +83,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -134,7 +103,7 @@
(void)w;
do {
- const __m128i v_m0_b = mm_loadl_64(mask);
+ const __m128i v_m0_b = xx_loadl_64(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@@ -142,7 +111,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -162,8 +131,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_m0l_b = mm_loadl_64(mask + c);
- const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+ const __m128i v_m0l_b = xx_loadl_64(mask + c);
+ const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
@@ -176,7 +145,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -202,7 +171,7 @@
(void)w;
do {
- const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -212,7 +181,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -234,7 +203,7 @@
(void)w;
do {
- const __m128i v_r_b = mm_loadu_128(mask);
+ const __m128i v_r_b = xx_loadu_128(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -244,7 +213,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -266,8 +235,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
@@ -283,7 +252,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -307,8 +276,8 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_32(mask);
- const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -318,7 +287,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -338,8 +307,8 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -349,7 +318,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -370,8 +339,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_ra_b = mm_loadu_128(mask + c);
- const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
@@ -386,7 +355,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -412,22 +381,22 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -449,22 +418,22 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadu_128(mask);
- const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -486,10 +455,10 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
- const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
- const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
@@ -501,8 +470,8 @@
const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
- const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
- const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
@@ -513,7 +482,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -575,38 +544,38 @@
static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadl_64(src0);
- const __m128i v_s1_w = mm_loadl_64(src1);
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadu_128(src0);
- const __m128i v_s1_w = mm_loadu_128(src1);
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadl_64(src0);
- const __m128i v_s1_w = mm_loadl_64(src1);
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
// Interleave
const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@@ -622,15 +591,15 @@
const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
// Round
- const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadu_128(src0);
- const __m128i v_s1_w = mm_loadu_128(src1);
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
// Interleave
const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@@ -650,7 +619,7 @@
const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
// Round
- const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
@@ -668,13 +637,13 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -718,13 +687,13 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_m0_b = mm_loadl_64(mask + c);
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -770,7 +739,7 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -778,7 +747,7 @@
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -824,7 +793,7 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -832,7 +801,7 @@
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -876,8 +845,8 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_ra_b = mm_loadl_32(mask);
- const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -885,7 +854,7 @@
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -929,8 +898,8 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = mm_loadl_64(mask + c);
- const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -938,7 +907,7 @@
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -984,20 +953,20 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -1043,20 +1012,20 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rb_b = mm_loadu_128(mask + 2 * c +mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
diff --git a/vpx_dsp/x86/synonyms.h b/vpx_dsp/x86/synonyms.h
new file mode 100644
index 0000000..0e7d67d
--- /dev/null
+++ b/vpx_dsp/x86/synonyms.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_SYNONYMS_H_
+#define VPX_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i*)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i*)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i*)a, v);
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+#endif  // VPX_DSP_X86_SYNONYMS_H_