Improve aom_mse_wxh_16bit_sse2() function
The function aom_mse_wxh_16bit_sse2() computes mse for a
given block size. The existing SSE2 code operates at 32/64-bit precision.
However, for the given input range, the precision at which
computations happen can be reduced. This CL improves the
SSE2 implementation by computing at the reduced precision.
Change-Id: Ia8be6ef534384787e8fcc8665707d77a1348b4de
diff --git a/aom_dsp/x86/variance_sse2.c b/aom_dsp/x86/variance_sse2.c
index a4c3262..761ce5c 100644
--- a/aom_dsp/x86/variance_sse2.c
+++ b/aom_dsp/x86/variance_sse2.c
@@ -673,7 +673,7 @@
uint64_t sum = 0;
__m128i dst0_8x8, dst1_8x8, dst_16x8;
__m128i src0_16x4, src1_16x4, src_16x8;
- __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
__m128i sub_result_16x8;
const __m128i zeros = _mm_setzero_si128();
__m128i square_result = _mm_setzero_si128();
@@ -688,26 +688,17 @@
sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
- res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
- res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
- res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
- res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
- res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
- res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
-
- square_result = _mm_add_epi64(
- square_result,
- _mm_add_epi64(
- _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
- res3_64x4));
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
}
- const __m128i sum_1x64 =
+ const __m128i sum_64x1 =
_mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
- xx_storel_64(&sum, sum_1x64);
+ xx_storel_64(&sum, sum_64x1);
return sum;
}
@@ -716,7 +707,7 @@
uint64_t sum = 0;
__m128i dst_8x8, dst_16x8;
__m128i src_16x8;
- __m128i res0_32x4, res1_32x4, res0_64x4, res1_64x4, res2_64x4, res3_64x4;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
__m128i sub_result_16x8;
const __m128i zeros = _mm_setzero_si128();
__m128i square_result = _mm_setzero_si128();
@@ -729,26 +720,17 @@
sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
- res0_32x4 = _mm_unpacklo_epi16(sub_result_16x8, zeros);
- res1_32x4 = _mm_unpackhi_epi16(sub_result_16x8, zeros);
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
- res0_32x4 = _mm_madd_epi16(res0_32x4, res0_32x4);
- res1_32x4 = _mm_madd_epi16(res1_32x4, res1_32x4);
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res0_64x4 = _mm_unpacklo_epi32(res0_32x4, zeros);
- res1_64x4 = _mm_unpackhi_epi32(res0_32x4, zeros);
- res2_64x4 = _mm_unpacklo_epi32(res1_32x4, zeros);
- res3_64x4 = _mm_unpackhi_epi32(res1_32x4, zeros);
-
- square_result = _mm_add_epi64(
- square_result,
- _mm_add_epi64(
- _mm_add_epi64(_mm_add_epi64(res0_64x4, res1_64x4), res2_64x4),
- res3_64x4));
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
}
- const __m128i sum_1x64 =
+ const __m128i sum_64x1 =
_mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
- xx_storel_64(&sum, sum_1x64);
+ xx_storel_64(&sum, sum_64x1);
return sum;
}