Update load_input_data() in x86

Split it into load_input_data4() and load_input_data8().
Use the pack-with-signed-saturation instruction (_mm_packs_epi32) for high bitdepth.

Change-Id: Icda3e0129a6fdb4a51d1cafbdc652ae3a65f4e06
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 7e8089b..7f0ddb0 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -18,8 +18,8 @@
   __m128i in[2];
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -57,14 +57,14 @@
   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
 
   // load input data
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
-  in[4] = load_input_data(input + 8 * 4);
-  in[5] = load_input_data(input + 8 * 5);
-  in[6] = load_input_data(input + 8 * 6);
-  in[7] = load_input_data(input + 8 * 7);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8 * 1);
+  in[2] = load_input_data8(input + 8 * 2);
+  in[3] = load_input_data8(input + 8 * 3);
+  in[4] = load_input_data8(input + 8 * 4);
+  in[5] = load_input_data8(input + 8 * 5);
+  in[6] = load_input_data8(input + 8 * 6);
+  in[7] = load_input_data8(input + 8 * 7);
 
   switch (tx_type) {
     case 0:  // DCT_DCT
diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c
index 2d0318d..00301b8 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/vpx_dsp/x86/inv_txfm_sse2.c
@@ -27,8 +27,8 @@
   __m128i in[2];
 
   // Rows
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8);
   idct4_sse2(in);
 
   // Columns
@@ -491,10 +491,10 @@
   const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   __m128i in[8], step1[8], step2[8], tmp[4];
 
-  in[0] = load_input_data(input + 0 * 8);
-  in[1] = load_input_data(input + 1 * 8);
-  in[2] = load_input_data(input + 2 * 8);
-  in[3] = load_input_data(input + 3 * 8);
+  in[0] = load_input_data4(input + 0 * 8);
+  in[1] = load_input_data4(input + 1 * 8);
+  in[2] = load_input_data4(input + 2 * 8);
+  in[3] = load_input_data4(input + 3 * 8);
 
   transpose_16bit_4x4(in, in);
   // in[0]: 00 10 20 30  01 11 21 31
@@ -721,14 +721,14 @@
 
 static INLINE void idct16_load8x8(const tran_low_t *const input,
                                   __m128i *const in) {
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
-  in[4] = load_input_data(input + 8 * 8);
-  in[5] = load_input_data(input + 8 * 10);
-  in[6] = load_input_data(input + 8 * 12);
-  in[7] = load_input_data(input + 8 * 14);
+  in[0] = load_input_data8(input);
+  in[1] = load_input_data8(input + 8 * 2);
+  in[2] = load_input_data8(input + 8 * 4);
+  in[3] = load_input_data8(input + 8 * 6);
+  in[4] = load_input_data8(input + 8 * 8);
+  in[5] = load_input_data8(input + 8 * 10);
+  in[6] = load_input_data8(input + 8 * 12);
+  in[7] = load_input_data8(input + 8 * 14);
 }
 
 void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
@@ -1258,10 +1258,10 @@
   int i;
   // First 1-D inverse DCT
   // Load input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 2);
-  in[2] = load_input_data(input + 8 * 4);
-  in[3] = load_input_data(input + 8 * 6);
+  in[0] = load_input_data4(input + 0 * 16);
+  in[1] = load_input_data4(input + 1 * 16);
+  in[2] = load_input_data4(input + 2 * 16);
+  in[3] = load_input_data4(input + 3 * 16);
 
   transpose_16bit_4x4(in, in);
 
@@ -1651,14 +1651,14 @@
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
+  in[0] = load_input_data8(input + 0 * 32);
+  in[1] = load_input_data8(input + 1 * 32);
+  in[2] = load_input_data8(input + 2 * 32);
+  in[3] = load_input_data8(input + 3 * 32);
+  in[4] = load_input_data8(input + 4 * 32);
+  in[5] = load_input_data8(input + 5 * 32);
+  in[6] = load_input_data8(input + 6 * 32);
+  in[7] = load_input_data8(input + 7 * 32);
 
   transpose_16bit_8x8(in, in);
   IDCT32_34
@@ -2008,10 +2008,10 @@
 static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
   int i;
   for (i = 0; i < 8; ++i) {
-    in[i] = load_input_data(input);
-    in[i + 8] = load_input_data(input + 8);
-    in[i + 16] = load_input_data(input + 16);
-    in[i + 24] = load_input_data(input + 24);
+    in[i] = load_input_data8(input);
+    in[i + 8] = load_input_data8(input + 8);
+    in[i + 16] = load_input_data8(input + 16);
+    in[i + 24] = load_input_data8(input + 24);
     input += 32;
   }
 }
diff --git a/vpx_dsp/x86/inv_txfm_sse2.h b/vpx_dsp/x86/inv_txfm_sse2.h
index cfe5f78..7db97db 100644
--- a/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/vpx_dsp/x86/inv_txfm_sse2.h
@@ -76,24 +76,23 @@
   return _mm_packs_epi32(t0, t1);
 }
 
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
 // highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  // in0: 0 X 1 X  2 X 3 X
-  // in1: 4 X 5 X  6 X 7 X
-  // t0:  0 4 X X  1 5 X X
-  // t1:  2 6 X X  3 7 X X
-  // t2:  0 2 4 6  X X X X
-  // t3:  1 3 5 7  X X X X
-  // rtn: 0 1 2 3  4 5 6 7
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i in = _mm_load_si128((const __m128i *)data);
+  return _mm_packs_epi32(in, zero);
+#else
+  return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
   const __m128i in0 = _mm_load_si128((const __m128i *)data);
   const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
-  const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
-  const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
-  const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
-  const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
-  return _mm_unpacklo_epi16(t2, t3);
+  return _mm_packs_epi32(in0, in1);
 #else
   return _mm_load_si128((const __m128i *)data);
 #endif
@@ -101,35 +100,35 @@
 
 static INLINE void load_buffer_8x8(const tran_low_t *const input,
                                    __m128i *const in) {
-  in[0] = load_input_data(input + 0 * 8);
-  in[1] = load_input_data(input + 1 * 8);
-  in[2] = load_input_data(input + 2 * 8);
-  in[3] = load_input_data(input + 3 * 8);
-  in[4] = load_input_data(input + 4 * 8);
-  in[5] = load_input_data(input + 5 * 8);
-  in[6] = load_input_data(input + 6 * 8);
-  in[7] = load_input_data(input + 7 * 8);
+  in[0] = load_input_data8(input + 0 * 8);
+  in[1] = load_input_data8(input + 1 * 8);
+  in[2] = load_input_data8(input + 2 * 8);
+  in[3] = load_input_data8(input + 3 * 8);
+  in[4] = load_input_data8(input + 4 * 8);
+  in[5] = load_input_data8(input + 5 * 8);
+  in[6] = load_input_data8(input + 6 * 8);
+  in[7] = load_input_data8(input + 7 * 8);
 }
 
 static INLINE void load_buffer_8x16(const tran_low_t *const input,
                                     __m128i *const in) {
-  in[0] = load_input_data(input + 0 * 16);
-  in[1] = load_input_data(input + 1 * 16);
-  in[2] = load_input_data(input + 2 * 16);
-  in[3] = load_input_data(input + 3 * 16);
-  in[4] = load_input_data(input + 4 * 16);
-  in[5] = load_input_data(input + 5 * 16);
-  in[6] = load_input_data(input + 6 * 16);
-  in[7] = load_input_data(input + 7 * 16);
+  in[0] = load_input_data8(input + 0 * 16);
+  in[1] = load_input_data8(input + 1 * 16);
+  in[2] = load_input_data8(input + 2 * 16);
+  in[3] = load_input_data8(input + 3 * 16);
+  in[4] = load_input_data8(input + 4 * 16);
+  in[5] = load_input_data8(input + 5 * 16);
+  in[6] = load_input_data8(input + 6 * 16);
+  in[7] = load_input_data8(input + 7 * 16);
 
-  in[8] = load_input_data(input + 8 * 16);
-  in[9] = load_input_data(input + 9 * 16);
-  in[10] = load_input_data(input + 10 * 16);
-  in[11] = load_input_data(input + 11 * 16);
-  in[12] = load_input_data(input + 12 * 16);
-  in[13] = load_input_data(input + 13 * 16);
-  in[14] = load_input_data(input + 14 * 16);
-  in[15] = load_input_data(input + 15 * 16);
+  in[8] = load_input_data8(input + 8 * 16);
+  in[9] = load_input_data8(input + 9 * 16);
+  in[10] = load_input_data8(input + 10 * 16);
+  in[11] = load_input_data8(input + 11 * 16);
+  in[12] = load_input_data8(input + 12 * 16);
+  in[13] = load_input_data8(input + 13 * 16);
+  in[14] = load_input_data8(input + 14 * 16);
+  in[15] = load_input_data8(input + 15 * 16);
 }
 
 static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c
index 1a9fe51..f42ce49 100644
--- a/vpx_dsp/x86/inv_txfm_ssse3.c
+++ b/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -36,10 +36,10 @@
   __m128i tmp[4];
 
   // Rows. Load 4-row input data.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 8 * 1);
-  in[2] = load_input_data(input + 8 * 2);
-  in[3] = load_input_data(input + 8 * 3);
+  in[0] = load_input_data4(input + 0 * 8);
+  in[1] = load_input_data4(input + 1 * 8);
+  in[2] = load_input_data4(input + 2 * 8);
+  in[3] = load_input_data4(input + 3 * 8);
 
   // 4x4 Transpose
   transpose_16bit_4x4(in, in);
@@ -342,14 +342,14 @@
   int i;
 
   // Load input data. Only need to load the top left 8x8 block.
-  in[0] = load_input_data(input);
-  in[1] = load_input_data(input + 32);
-  in[2] = load_input_data(input + 64);
-  in[3] = load_input_data(input + 96);
-  in[4] = load_input_data(input + 128);
-  in[5] = load_input_data(input + 160);
-  in[6] = load_input_data(input + 192);
-  in[7] = load_input_data(input + 224);
+  in[0] = load_input_data8(input + 0 * 32);
+  in[1] = load_input_data8(input + 1 * 32);
+  in[2] = load_input_data8(input + 2 * 32);
+  in[3] = load_input_data8(input + 3 * 32);
+  in[4] = load_input_data8(input + 4 * 32);
+  in[5] = load_input_data8(input + 5 * 32);
+  in[6] = load_input_data8(input + 6 * 32);
+  in[7] = load_input_data8(input + 7 * 32);
 
   transpose_16bit_8x8(in, in);
   idct32_34_first_half(in, stp1);
@@ -383,8 +383,8 @@
                               __m128i *in1) {
   int i;
   for (i = 0; i < 16; i++) {
-    in0[i] = load_input_data(input);
-    in1[i] = load_input_data(input + 8);
+    in0[i] = load_input_data8(input);
+    in1[i] = load_input_data8(input + 8);
     input += 32;
   }
 }