Merge "Use shifted value for sinpi8sqrt2" am: 8a88e03ea0 am: ecc49c9499
am: f43930b82b

Change-Id: Idffd7efa21e10690011a299b041b3b5491108480
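
The constant change at the heart of this patch is easy to misread, so here is a brief, standalone sketch (not part of the patch) of why halving the value works. VQDMULH performs a saturating doubling multiply and returns the high half of the product, so multiplying by 35468 >> 1 and letting the instruction double it is the same as multiplying by 35468 in Q16. The scalar model below is illustrative only (the helper vqdmulh_model and the exhaustive check are invented here, not libvpx code); it verifies that the old negative-constant path, with its extra shift and add, matches the new single multiply for every int16_t input:

    /* Scalar model of the NEON VQDMULH instruction: saturating doubling
     * multiply, returning the high 16 bits of the 32-bit product. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static int16_t vqdmulh_model(int16_t a, int16_t b) {
      int64_t p = (int64_t)a * b * 2;   /* doubled product */
      if (p > INT32_MAX) p = INT32_MAX; /* saturate as the hardware does */
      return (int16_t)(p >> 16);        /* high half */
    }

    int main(void) {
      /* 35468 does not fit in int16_t; the old code effectively used the
       * wrapped value 35468 - 65536 = -30068 and compensated afterwards. */
      const int16_t old_c = (int16_t)(35468 - 65536);
      const int16_t new_c = 35468 >> 1; /* 17734, fits in int16_t */
      for (int32_t i = INT16_MIN; i <= INT16_MAX; ++i) {
        const int16_t x = (int16_t)i;
        /* old path: multiply, shift the result right by one, add x back */
        const int16_t old_r = (int16_t)((vqdmulh_model(x, old_c) >> 1) + x);
        /* new path: the pre-halved constant needs no fix-up */
        const int16_t new_r = vqdmulh_model(x, new_c);
        assert(old_r == new_r);
      }
      printf("old and new sinpi8sqrt2 paths agree for all int16_t inputs\n");
      return 0;
    }

This equivalence is why the diff below can drop one vshrq_n_s16 and one vqaddq_s16 from each transform pass while leaving the cospi8sqrt2minus1 path untouched.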
diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc
index 39db3e4..cdcae4f 100644
--- a/libvpx/test/idct_test.cc
+++ b/libvpx/test/idct_test.cc
@@ -110,6 +110,10 @@
 }
 
 INSTANTIATE_TEST_CASE_P(C, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_c));
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_neon));
+#endif
 #if HAVE_MMX
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                         ::testing::Values(vp8_short_idct4x4llm_mmx));
diff --git a/libvpx/vp8/common/arm/neon/dequant_idct_neon.c b/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
index 58e1192..753051c 100644
--- a/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/libvpx/vp8/common/arm/neon/dequant_idct_neon.c
@@ -11,132 +11,129 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2       = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
-void vp8_dequant_idct_add_neon(
-        int16_t *input,
-        int16_t *dq,
-        unsigned char *dst,
-        int stride) {
-    unsigned char *dst0;
-    int32x2_t d14, d15;
-    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
-    int16x8_t q1, q2, q3, q4, q5, q6;
-    int16x8_t qEmpty = vdupq_n_s16(0);
-    int32x2x2_t d2tmp0, d2tmp1;
-    int16x4x2_t d2tmp2, d2tmp3;
+void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
+                               int stride) {
+  unsigned char *dst0;
+  int32x2_t d14, d15;
+  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+  int16x8_t q1, q2, q3, q4, q5, q6;
+  int16x8_t qEmpty = vdupq_n_s16(0);
+  int32x2x2_t d2tmp0, d2tmp1;
+  int16x4x2_t d2tmp2, d2tmp3;
 
-    d14 = d15 = vdup_n_s32(0);
+  d14 = d15 = vdup_n_s32(0);
 
-    // load input
-    q3 = vld1q_s16(input);
-    vst1q_s16(input, qEmpty);
-    input += 8;
-    q4 = vld1q_s16(input);
-    vst1q_s16(input, qEmpty);
+  // load input
+  q3 = vld1q_s16(input);
+  vst1q_s16(input, qEmpty);
+  input += 8;
+  q4 = vld1q_s16(input);
+  vst1q_s16(input, qEmpty);
 
-    // load dq
-    q5 = vld1q_s16(dq);
-    dq += 8;
-    q6 = vld1q_s16(dq);
+  // load dq
+  q5 = vld1q_s16(dq);
+  dq += 8;
+  q6 = vld1q_s16(dq);
 
-    // load src from dst
-    dst0 = dst;
-    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
-    dst0 += stride;
-    d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
-    dst0 += stride;
-    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
-    dst0 += stride;
-    d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
+  // load src from dst
+  dst0 = dst;
+  d14 = vld1_lane_s32((const int32_t *)dst0, d14, 0);
+  dst0 += stride;
+  d14 = vld1_lane_s32((const int32_t *)dst0, d14, 1);
+  dst0 += stride;
+  d15 = vld1_lane_s32((const int32_t *)dst0, d15, 0);
+  dst0 += stride;
+  d15 = vld1_lane_s32((const int32_t *)dst0, d15, 1);
 
-    q1 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q3),
-                                         vreinterpretq_u16_s16(q5)));
-    q2 = vreinterpretq_s16_u16(vmulq_u16(vreinterpretq_u16_s16(q4),
-                                         vreinterpretq_u16_s16(q6)));
+  q1 = vreinterpretq_s16_u16(
+      vmulq_u16(vreinterpretq_u16_s16(q3), vreinterpretq_u16_s16(q5)));
+  q2 = vreinterpretq_s16_u16(
+      vmulq_u16(vreinterpretq_u16_s16(q4), vreinterpretq_u16_s16(q6)));
 
-    d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
-    d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
+  d12 = vqadd_s16(vget_low_s16(q1), vget_low_s16(q2));
+  d13 = vqsub_s16(vget_low_s16(q1), vget_low_s16(q2));
 
-    q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
+  q2 = vcombine_s16(vget_high_s16(q1), vget_high_s16(q2));
 
-    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
-    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+  q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+  q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
 
-    q3 = vshrq_n_s16(q3, 1);
-    q4 = vshrq_n_s16(q4, 1);
+  q4 = vshrq_n_s16(q4, 1);
 
-    q3 = vqaddq_s16(q3, q2);
-    q4 = vqaddq_s16(q4, q2);
+  q4 = vqaddq_s16(q4, q2);
 
-    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
-    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+  d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+  d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
 
-    d2 = vqadd_s16(d12, d11);
-    d3 = vqadd_s16(d13, d10);
-    d4 = vqsub_s16(d13, d10);
-    d5 = vqsub_s16(d12, d11);
+  d2 = vqadd_s16(d12, d11);
+  d3 = vqadd_s16(d13, d10);
+  d4 = vqsub_s16(d13, d10);
+  d5 = vqsub_s16(d12, d11);
 
-    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
-    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
-    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
-                      vreinterpret_s16_s32(d2tmp1.val[0]));
-    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
-                      vreinterpret_s16_s32(d2tmp1.val[1]));
+  d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+  d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+  d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                    vreinterpret_s16_s32(d2tmp1.val[0]));
+  d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                    vreinterpret_s16_s32(d2tmp1.val[1]));
 
-    // loop 2
-    q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
+  // loop 2
+  q2 = vcombine_s16(d2tmp2.val[1], d2tmp3.val[1]);
 
-    q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
-    q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
+  q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
+  q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
 
-    d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
-    d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
+  d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
+  d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
 
-    q3 = vshrq_n_s16(q3, 1);
-    q4 = vshrq_n_s16(q4, 1);
+  q4 = vshrq_n_s16(q4, 1);
 
-    q3 = vqaddq_s16(q3, q2);
-    q4 = vqaddq_s16(q4, q2);
+  q4 = vqaddq_s16(q4, q2);
 
-    d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
-    d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
+  d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
+  d11 = vqadd_s16(vget_high_s16(q3), vget_low_s16(q4));
 
-    d2 = vqadd_s16(d12, d11);
-    d3 = vqadd_s16(d13, d10);
-    d4 = vqsub_s16(d13, d10);
-    d5 = vqsub_s16(d12, d11);
+  d2 = vqadd_s16(d12, d11);
+  d3 = vqadd_s16(d13, d10);
+  d4 = vqsub_s16(d13, d10);
+  d5 = vqsub_s16(d12, d11);
 
-    d2 = vrshr_n_s16(d2, 3);
-    d3 = vrshr_n_s16(d3, 3);
-    d4 = vrshr_n_s16(d4, 3);
-    d5 = vrshr_n_s16(d5, 3);
+  d2 = vrshr_n_s16(d2, 3);
+  d3 = vrshr_n_s16(d3, 3);
+  d4 = vrshr_n_s16(d4, 3);
+  d5 = vrshr_n_s16(d5, 3);
 
-    d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
-    d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
-    d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
-                      vreinterpret_s16_s32(d2tmp1.val[0]));
-    d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
-                      vreinterpret_s16_s32(d2tmp1.val[1]));
+  d2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+  d2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+  d2tmp2 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[0]),
+                    vreinterpret_s16_s32(d2tmp1.val[0]));
+  d2tmp3 = vtrn_s16(vreinterpret_s16_s32(d2tmp0.val[1]),
+                    vreinterpret_s16_s32(d2tmp1.val[1]));
 
-    q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
-    q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
+  q1 = vcombine_s16(d2tmp2.val[0], d2tmp2.val[1]);
+  q2 = vcombine_s16(d2tmp3.val[0], d2tmp3.val[1]);
 
-    q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1),
-                                        vreinterpret_u8_s32(d14)));
-    q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2),
-                                        vreinterpret_u8_s32(d15)));
+  q1 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s32(d14)));
+  q2 = vreinterpretq_s16_u16(
+      vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s32(d15)));
 
-    d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
-    d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
+  d14 = vreinterpret_s32_u8(vqmovun_s16(q1));
+  d15 = vreinterpret_s32_u8(vqmovun_s16(q2));
 
-    dst0 = dst;
-    vst1_lane_s32((int32_t *)dst0, d14, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d14, 1);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d15, 0);
-    dst0 += stride;
-    vst1_lane_s32((int32_t *)dst0, d15, 1);
-    return;
+  dst0 = dst;
+  vst1_lane_s32((int32_t *)dst0, d14, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst0, d14, 1);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst0, d15, 0);
+  dst0 += stride;
+  vst1_lane_s32((int32_t *)dst0, d15, 1);
+  return;
 }
diff --git a/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
index 373afa6..1adb1c3 100644
--- a/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -11,113 +11,109 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2       = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
-void vp8_short_idct4x4llm_neon(
-        int16_t *input,
-        unsigned char *pred_ptr,
-        int pred_stride,
-        unsigned char *dst_ptr,
-        int dst_stride) {
-    int i;
-    uint32x2_t d6u32 = vdup_n_u32(0);
-    uint8x8_t d1u8;
-    int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
-    uint16x8_t q1u16;
-    int16x8_t q1s16, q2s16, q3s16, q4s16;
-    int32x2x2_t v2tmp0, v2tmp1;
-    int16x4x2_t v2tmp2, v2tmp3;
+void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
+                               int pred_stride, unsigned char *dst_ptr,
+                               int dst_stride) {
+  int i;
+  uint32x2_t d6u32 = vdup_n_u32(0);
+  uint8x8_t d1u8;
+  int16x4_t d2, d3, d4, d5, d10, d11, d12, d13;
+  uint16x8_t q1u16;
+  int16x8_t q1s16, q2s16, q3s16, q4s16;
+  int32x2x2_t v2tmp0, v2tmp1;
+  int16x4x2_t v2tmp2, v2tmp3;
 
-    d2 = vld1_s16(input);
-    d3 = vld1_s16(input + 4);
-    d4 = vld1_s16(input + 8);
-    d5 = vld1_s16(input + 12);
+  d2 = vld1_s16(input);
+  d3 = vld1_s16(input + 4);
+  d4 = vld1_s16(input + 8);
+  d5 = vld1_s16(input + 12);
 
-    // 1st for loop
-    q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
-    q2s16 = vcombine_s16(d3, d5);
+  // 1st for loop
+  q1s16 = vcombine_s16(d2, d4);  // Swap d3 d4 here
+  q2s16 = vcombine_s16(d3, d5);
 
-    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
-    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
 
-    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
-    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-    q3s16 = vshrq_n_s16(q3s16, 1);
-    q4s16 = vshrq_n_s16(q4s16, 1);
+  q4s16 = vshrq_n_s16(q4s16, 1);
 
-    q3s16 = vqaddq_s16(q3s16, q2s16);
-    q4s16 = vqaddq_s16(q4s16, q2s16);
+  q4s16 = vqaddq_s16(q4s16, q2s16);
 
-    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
-    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
 
-    d2 = vqadd_s16(d12, d11);
-    d3 = vqadd_s16(d13, d10);
-    d4 = vqsub_s16(d13, d10);
-    d5 = vqsub_s16(d12, d11);
+  d2 = vqadd_s16(d12, d11);
+  d3 = vqadd_s16(d13, d10);
+  d4 = vqsub_s16(d13, d10);
+  d5 = vqsub_s16(d12, d11);
 
-    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
-    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
-    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
-                      vreinterpret_s16_s32(v2tmp1.val[0]));
-    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
-                      vreinterpret_s16_s32(v2tmp1.val[1]));
+  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                    vreinterpret_s16_s32(v2tmp1.val[0]));
+  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                    vreinterpret_s16_s32(v2tmp1.val[1]));
 
-    // 2nd for loop
-    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
-    q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
+  // 2nd for loop
+  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]);
+  q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]);
 
-    q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
-    q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
+  q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2);
+  q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1);
 
-    d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
-    d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
+  d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
+  d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-    q3s16 = vshrq_n_s16(q3s16, 1);
-    q4s16 = vshrq_n_s16(q4s16, 1);
+  q4s16 = vshrq_n_s16(q4s16, 1);
 
-    q3s16 = vqaddq_s16(q3s16, q2s16);
-    q4s16 = vqaddq_s16(q4s16, q2s16);
+  q4s16 = vqaddq_s16(q4s16, q2s16);
 
-    d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
-    d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
+  d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
+  d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16));  // d1
 
-    d2 = vqadd_s16(d12, d11);
-    d3 = vqadd_s16(d13, d10);
-    d4 = vqsub_s16(d13, d10);
-    d5 = vqsub_s16(d12, d11);
+  d2 = vqadd_s16(d12, d11);
+  d3 = vqadd_s16(d13, d10);
+  d4 = vqsub_s16(d13, d10);
+  d5 = vqsub_s16(d12, d11);
 
-    d2 = vrshr_n_s16(d2, 3);
-    d3 = vrshr_n_s16(d3, 3);
-    d4 = vrshr_n_s16(d4, 3);
-    d5 = vrshr_n_s16(d5, 3);
+  d2 = vrshr_n_s16(d2, 3);
+  d3 = vrshr_n_s16(d3, 3);
+  d4 = vrshr_n_s16(d4, 3);
+  d5 = vrshr_n_s16(d5, 3);
 
-    v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
-    v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
-    v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
-                      vreinterpret_s16_s32(v2tmp1.val[0]));
-    v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
-                      vreinterpret_s16_s32(v2tmp1.val[1]));
+  v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4));
+  v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5));
+  v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]),
+                    vreinterpret_s16_s32(v2tmp1.val[0]));
+  v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]),
+                    vreinterpret_s16_s32(v2tmp1.val[1]));
 
-    q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
-    q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
+  q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]);
+  q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]);
 
-    // dc_only_idct_add
-    for (i = 0; i < 2; i++, q1s16 = q2s16) {
-        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
-        pred_ptr += pred_stride;
-        d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
-        pred_ptr += pred_stride;
+  // dc_only_idct_add
+  for (i = 0; i < 2; i++, q1s16 = q2s16) {
+    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0);
+    pred_ptr += pred_stride;
+    d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1);
+    pred_ptr += pred_stride;
 
-        q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16),
-                         vreinterpret_u8_u32(d6u32));
-        d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
+    q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32));
+    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16));
 
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
-        dst_ptr += dst_stride;
-        vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
-        dst_ptr += dst_stride;
-    }
-    return;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0);
+    dst_ptr += dst_stride;
+    vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1);
+    dst_ptr += dst_stride;
+  }
+  return;
 }
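
For quick reference, the net effect of the patch on the sinpi8sqrt2 path in both NEON files amounts to the before/after sketch below (function names are invented for illustration; assumes an ARM target with arm_neon.h):

    #include <arm_neon.h>

    /* Before: the constant wrapped to -30068, so the product had to be
     * halved and the input added back to recover the intended value. */
    static int16x8_t sinpi_path_old(int16x8_t q2) {
      int16x8_t q3 = vqdmulhq_n_s16(q2, (int16_t)(35468 - 65536));
      q3 = vshrq_n_s16(q3, 1);
      return vqaddq_s16(q3, q2);
    }

    /* After: the constant is halved up front; the doubling inside vqdmulh
     * restores the scale, so no fix-up instructions are needed. */
    static int16x8_t sinpi_path_new(int16x8_t q2) {
      return vqdmulhq_n_s16(q2, 35468 >> 1);
    }

The new IDCTTest instantiation in idct_test.cc runs the existing IDCT checks against vp8_short_idct4x4llm_neon on NEON-capable builds, which gives this change direct test coverage.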