Factor out x86 SIMD intrinsic synonyms
Change-Id: Idc4ac3ccd2ba19087cdb74a3e4a6774ac50386aa
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c
index 5de3e23..28693a4 100644
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_mask6_sse4.c
@@ -16,51 +16,20 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/synonyms.h"
+
#include "./vpx_dsp_rtcd.h"
#define MASK_BITS 6
-static INLINE __m128i mm_loadl_32(const void *a) {
- return _mm_cvtsi32_si128(*(const uint32_t*)a);
-}
-
-static INLINE __m128i mm_loadl_64(const void *a) {
- return _mm_loadl_epi64((const __m128i*)a);
-}
-
-static INLINE __m128i mm_loadu_128(const void *a) {
- return _mm_loadu_si128((const __m128i*)a);
-}
-
-static INLINE void mm_storel_32(void *const a, const __m128i v) {
- *(uint32_t*)a = _mm_cvtsi128_si32(v);
-}
-
-static INLINE void mm_storel_64(void *const a, const __m128i v) {
- _mm_storel_epi64((__m128i*)a, v);
-}
-
-static INLINE void mm_storeu_128(void *const a, const __m128i v) {
- _mm_storeu_si128((__m128i*)a, v);
-}
-
-static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
- return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
- const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1);
- return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
-}
-
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = mm_loadl_32(src0);
- const __m128i v_s1_b = mm_loadl_32(src1);
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@@ -69,15 +38,15 @@
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = mm_loadl_64(src0);
- const __m128i v_s1_b = mm_loadl_64(src1);
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@@ -86,7 +55,7 @@
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
@@ -106,7 +75,7 @@
(void)w;
do {
- const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@@ -114,7 +83,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -134,7 +103,7 @@
(void)w;
do {
- const __m128i v_m0_b = mm_loadl_64(mask);
+ const __m128i v_m0_b = xx_loadl_64(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@@ -142,7 +111,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -162,8 +131,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_m0l_b = mm_loadl_64(mask + c);
- const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+ const __m128i v_m0l_b = xx_loadl_64(mask + c);
+ const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
@@ -176,7 +145,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -202,7 +171,7 @@
(void)w;
do {
- const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -212,7 +181,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -234,7 +203,7 @@
(void)w;
do {
- const __m128i v_r_b = mm_loadu_128(mask);
+ const __m128i v_r_b = xx_loadu_128(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -244,7 +213,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -266,8 +235,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
@@ -283,7 +252,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -307,8 +276,8 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_32(mask);
- const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -318,7 +287,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -338,8 +307,8 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -349,7 +318,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -370,8 +339,8 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_ra_b = mm_loadu_128(mask + c);
- const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
@@ -386,7 +355,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -412,22 +381,22 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_32(dst, v_res_b);
+ xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -449,22 +418,22 @@
(void)w;
do {
- const __m128i v_ra_b = mm_loadu_128(mask);
- const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
- mm_storel_64(dst, v_res_b);
+ xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@@ -486,10 +455,10 @@
do {
int c;
for (c = 0; c < w; c += 16) {
- const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
- const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
- const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
@@ -501,8 +470,8 @@
const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
- const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
- const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
@@ -513,7 +482,7 @@
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
- mm_storeu_128(dst + c, v_res_b);
+ xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@@ -575,38 +544,38 @@
static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadl_64(src0);
- const __m128i v_s1_w = mm_loadl_64(src1);
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadu_128(src0);
- const __m128i v_s1_w = mm_loadu_128(src1);
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadl_64(src0);
- const __m128i v_s1_w = mm_loadl_64(src1);
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
// Interleave
const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@@ -622,15 +591,15 @@
const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
// Round
- const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = mm_loadu_128(src0);
- const __m128i v_s1_w = mm_loadu_128(src1);
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
// Interleave
const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@@ -650,7 +619,7 @@
const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
// Round
- const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
@@ -668,13 +637,13 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -718,13 +687,13 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_m0_b = mm_loadl_64(mask + c);
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -770,7 +739,7 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -778,7 +747,7 @@
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -824,7 +793,7 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@@ -832,7 +801,7 @@
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -876,8 +845,8 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_ra_b = mm_loadl_32(mask);
- const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -885,7 +854,7 @@
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -929,8 +898,8 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = mm_loadl_64(mask + c);
- const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@@ -938,7 +907,7 @@
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@@ -984,20 +953,20 @@
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
- const __m128i v_ra_b = mm_loadl_64(mask);
- const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
- mm_storel_64(dst, v_res_w);
+ xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@@ -1043,20 +1012,20 @@
do {
int c;
for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
- const __m128i v_rb_b = mm_loadu_128(mask + 2 * c +mask_stride);
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
- const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
- mm_storeu_128(dst + c, v_res_w);
+ xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
diff --git a/vpx_dsp/x86/synonyms.h b/vpx_dsp/x86/synonyms.h
new file mode 100644
index 0000000..0e7d67d
--- /dev/null
+++ b/vpx_dsp/x86/synonyms.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_SYNONYMS_H_
+#define VPX_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i*)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i*)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i*)a, v);
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+#endif  // VPX_DSP_X86_SYNONYMS_H_