Update quantize SSSE3 SIMD to cover 32x32 transform case also.

Encode time of bus (speed 0) 50 frames @ 1500kbps goes from 2min14.4 to
2min10.1, i.e. a 2.3% overall speed increase.

Change-Id: I3699580e74ec26c7d24e03681bc47ba25ee1ee87
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index e7cefa5..330c60f 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -569,6 +569,9 @@
 prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64
 
+prototype void vp9_quantize_b_32x32 "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
+specialize vp9_quantize_b_32x32 $ssse3_x86_64
+
 #
 # Structured Similarity (SSIM)
 #
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 6f2e13a..862923f 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -85,18 +85,19 @@
 }
 
 // This function works well for large transform size.
-static void quantize_sparse(int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(int16_t *coeff_ptr, intptr_t n_coeffs,
                             int skip_block,
                             int16_t *zbin_ptr, int16_t *round_ptr,
                             int16_t *quant_ptr, int16_t *quant_shift_ptr,
                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                             int16_t *dequant_ptr, int zbin_oq_value,
                             uint16_t *eob_ptr, const int16_t *scan,
-                            int *idx_arr) {
+                            const int16_t *iscan) {
   int i, rc, eob;
   int zbins[2], nzbins[2], zbin;
   int x, y, z, sz;
   int idx = 0;
+  int idx_arr[1024];
 
   vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
   vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
@@ -179,20 +180,18 @@
   // Call different quantization for different transform size.
   if (n_coeffs >= 1024) {
     // Save index of picked coefficient in pre-scan pass.
-    int idx_arr[1024];
-
-    quantize_sparse(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
-                    n_coeffs, mb->skip_block,
-                    mb->plane[plane].zbin,
-                    mb->plane[plane].round,
-                    mb->plane[plane].quant,
-                    mb->plane[plane].quant_shift,
-                    BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
-                    BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                    xd->plane[plane].dequant,
-                    mb->plane[plane].zbin_extra,
-                    &xd->plane[plane].eobs[block],
-                    scan, idx_arr);
+    vp9_quantize_b_32x32(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+                         n_coeffs, mb->skip_block,
+                         mb->plane[plane].zbin,
+                         mb->plane[plane].round,
+                         mb->plane[plane].quant,
+                         mb->plane[plane].quant_shift,
+                         BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+                         BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                         xd->plane[plane].dequant,
+                         mb->plane[plane].zbin_extra,
+                         &xd->plane[plane].eobs[block],
+                         scan, iscan);
   }
   else {
     vp9_quantize_b(BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.asm b/vp9/encoder/x86/vp9_quantize_ssse3.asm
index 665bafa..b666abb 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3.asm
@@ -15,10 +15,10 @@
 
 SECTION .text
 
-INIT_XMM ssse3
-cglobal quantize_b, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
-                              shift, qcoeff, dqcoeff, dequant, zbin_oq, \
-                              eob, scan, iscan
+%macro QUANTIZE_FN 1
+cglobal quantize_%1, 0, 6, 15, coeff, ncoeff, skip, zbin, round, quant, \
+                               shift, qcoeff, dqcoeff, dequant, zbin_oq, \
+                               eob, scan, iscan
   cmp                    dword skipm, 0
   jne .blank
 
@@ -57,6 +57,10 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   punpckhqdq                      m0, m0
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
@@ -77,9 +81,19 @@
   pand                           m13, m12
   mova        [qcoeffq+ncoeffq*2+ 0], m8
   mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                           m8, m8
+  pabsw                          m13, m13
+%endif
   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
   punpckhqdq                      m3, m3
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                           m8, 1
+  psrlw                          m13, 1
+  psignw                          m8, m9
+  psignw                         m13, m10
+%endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   mova       [dqcoeffq+ncoeffq*2+16], m13
   pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
@@ -99,6 +113,10 @@
   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   pabsw                           m6, m9                   ; m6 = abs(m9)
   pabsw                          m11, m10                  ; m11 = abs(m10)
+%ifidn %1, b_32x32
+  paddw                           m6, m6
+  paddw                          m11, m11
+%endif
   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
   paddw                           m6, m1                   ; m6 += round
@@ -115,8 +133,18 @@
   pand                           m13, m12
   mova        [qcoeffq+ncoeffq*2+ 0], m14
   mova        [qcoeffq+ncoeffq*2+16], m13
+%ifidn %1, b_32x32
+  pabsw                          m14, m14
+  pabsw                          m13, m13
+%endif
   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+  psrlw                          m14, 1
+  psrlw                          m13, 1
+  psignw                         m14, m9
+  psignw                         m13, m10
+%endif
   mova       [dqcoeffq+ncoeffq*2+ 0], m14
   mova       [dqcoeffq+ncoeffq*2+16], m13
   pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
@@ -163,3 +191,8 @@
   jl .blank_loop
   mov                    word [eobq], 0
   RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b
+QUANTIZE_FN b_32x32